Example No. 1
    def run_once(self, path_outputs=None):
        """
        This is the main test function. It runs the testing procedure for
        every PPD file. Tests are run simultaneously in many threads.

        @param path_outputs: if it is not None, raw outputs sent to
                printers are dumped here; if the directory already
                exists, it is deleted and recreated

        @raises error.TestFail if at least one of the tests failed

        """
        # Set directory for output documents
        self._path_output_directory = self._calculate_full_path(path_outputs)
        if self._path_output_directory is not None:
            # Delete whole directory if already exists
            file_utils.rm_dir_if_exists(self._path_output_directory)
            # Create archivers
            self._archivers = dict()
            for doc_name in self._docs:
                path_for_archiver = os.path.join(self._path_output_directory,
                                                 doc_name)
                self._archivers[doc_name] = archiver.Archiver(
                    path_for_archiver, self._ppds, 50)
            # A place for new digests
            self._new_digests = dict()
            for doc_name in self._docs:
                self._new_digests[doc_name] = dict()

        # Runs tests for all PPD files (in parallel)
        outputs = self._processor.run(self._thread_test_PPD, len(self._ppds))

        # Analyses tests' outputs, prints a summary report and builds a list
        # of PPD filenames that failed
        failures = []
        for i, output in enumerate(outputs):
            ppd_file = self._ppds[i]
            if output is not True:
                failures.append(ppd_file)
            else:
                output = 'OK'
            line = "%s: %s" % (ppd_file, output)
            logging.info(line)

        # Calculate digest files for output documents (if dumped)
        if self._path_output_directory is not None:
            for doc_name in self._docs:
                path = os.path.join(self._path_output_directory,
                                    doc_name + '.digests')
                helpers.save_digests_file(path, self._new_digests[doc_name],
                                          failures)

        # Raises an exception if at least one test failed
        if len(failures) > 0:
            failures.sort()
            raise error.TestFail('Test failed for %d PPD files: %s' %
                                 (len(failures), ', '.join(failures)))
Example No. 2
    def extractMedia(self,
                     arname,
                     artype=None,
                     excludeMimetypes=None,
                     clfilter=None,
                     timefilter=None):
        if not self.smsXML:
            raise Exception('cannot extract media from call file')

        archive = archiver.Archiver(arname, type_=artype)
        usedNames = set()

        for node in self.genMmsMedia(excludeMimetypes,
                                     clFilter=clfilter,
                                     timeFilter=timefilter):
            if 'name' in node:
                name = node['name']
                if '.' in name:
                    # Split into stem and extension so de-duplicated names
                    # keep their original extension
                    spl = name.split('.')
                    fname = '.'.join(spl[:-1])
                    ext = '.' + spl[-1]
                else:
                    fname = ''
                    ext = ''

                incr = 1
                while name in usedNames:
                    name = '%s_%d%s' % (fname, incr, ext)
                    incr += 1
            else:
                mmsparent = node.parent.parent
                ext = mimetype.guessExtension(node['ct'])

                if ext is not None and len(ext) != 0:
                    ext = '.' + ext
                name = '%s-%s%s' % (mmsparent['date'],
                                    mmsparent['contact_name'], ext)

                incr = 1
                while name in usedNames:
                    name = '%s-%s_%d%s' % (mmsparent['date'],
                                           mmsparent['contact_name'], incr,
                                           ext)
                    incr += 1

            data = base64.b64decode(node.attrs['data'])
            try:
                archive.addFile(name, data)
                usedNames.add(name)
            except Exception:
                traceback.print_exc()

        archive.close()
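
Together with the tests in Examples 6 to 9 below, this snippet outlines a small file-archiving API on archiver.Archiver: a constructor taking an archive name plus an optional type_, an addFile(name, data) method, and close(). A minimal usage sketch, inferring the signatures only from the calls shown on this page (the archive name and payload are made up):

import archiver

# Hypothetical usage inferred from the calls above, not from the module's docs.
archive = archiver.Archiver('media.zip', type_='zip')
archive.addFile('photo_1.jpg', b'raw image bytes')  # names must stay unique per archive
archive.close()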
Example No. 3
def destalinate_job():
    print("Destalinating")
    if "SB_TOKEN" not in os.environ or "API_TOKEN" not in os.environ:
        print("ERR: Missing at least one Slack environment variable.")
    else:
        scheduled_warner = warner.Warner()
        scheduled_archiver = archiver.Archiver()
        scheduled_announcer = announcer.Announcer()
        scheduled_flagger = flagger.Flagger()
        print("Warning")
        scheduled_warner.warn()
        print("Archiving")
        scheduled_archiver.archive()
        print("Announcing")
        scheduled_announcer.announce()
        print("Flagging")
        scheduled_flagger.flag()
        print("OK: destalinated")
    print("END: destalinate_job")
Example No. 4
def destalinate_job():
    logging.info("Destalinating")
    if not _config.sb_token or not _config.api_token:
        logging.error(
            "Missing at least one required Slack environment variable.\n"
            "Make sure to set DESTALINATOR_SB_TOKEN and DESTALINATOR_API_TOKEN."
        )
    else:
        try:
            archiver.Archiver().archive()
            warner.Warner().warn()
            announcer.Announcer().announce()
            flagger.Flagger().flag()
            logging.info("OK: destalinated")
        except Exception as e:  # pylint: disable=W0703
            raven_client.captureException()
            if not _config.sentry_dsn:
                raise e
    logging.info("END: destalinate_job")
Example No. 5
def destalinate_job():
    print("Destalinating")
    if "SB_TOKEN" not in os.environ or "API_TOKEN" not in os.environ:
        print("ERR: Missing at least one Slack environment variable.")
    else:
        try:
            scheduled_warner = warner.Warner()
            scheduled_archiver = archiver.Archiver()
            scheduled_announcer = announcer.Announcer()
            scheduled_flagger = flagger.Flagger()
            print("Warning")
            scheduled_warner.warn()
            print("Archiving")
            scheduled_archiver.archive()
            print("Announcing")
            scheduled_announcer.announce()
            print("Flagging")
            scheduled_flagger.flag()
            print("OK: destalinated")
        except Exception as e:  # pylint: disable=W0703
            raven_client.captureException()
            if not os.getenv('SENTRY_DSN'):
                raise e
    print("END: destalinate_job")
Example No. 6
    def test_tar_guess_archive(self):
        self.create_compare(archiver.Archiver('tmp.tar'), MD5SUM_TAR)
Example No. 7
    def test_zip_archive(self):
        self.create_compare(archiver.Archiver('tmp.zip', type_='zip'), MD5SUM_ZIP)
Example No. 8
    def test_tgz_archive(self):
        self.create_compare(archiver.Archiver('tmp.tgz', type_='tgz'), MD5SUM_TGZ)
Example No. 9
    def test_tar_archive(self):
        self.create_compare(archiver.Archiver('tmp.tar', type_='tar'), MD5SUM_TAR)
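
Examples 6 to 9 (and 15 and 16 below) pair each explicit type_ test with a "guess" variant, which suggests Archiver infers the format from the file extension when type_ is omitted. A sketch of that assumption, reusing only constructor calls that appear in these tests:

import archiver

# Presumed equivalent ways to get a gzipped tar, per the test names above.
explicit = archiver.Archiver('tmp.tgz', type_='tgz')
guessed = archiver.Archiver('tmp.tgz')  # format presumably guessed from '.tgz'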
Example No. 10
    def run(self):
        global block, y, es, lists, baddies, config, resendTo, timeout
        ja = []
        jas = []
        print("Thread started")
        mla = None
        ml = ""
        mboxfile = ""
        filename = ""
        xlist_override = None

        foo = archiver.Archiver(parseHTML=parseHTML)
    
        while len(lists) > 0:
            print("%u elements left to slurp" % len(lists))
            block.acquire()
            try:
                mla = lists.pop(0)
            except Exception as err:
                print("Could not pop list: %s" % err)
                block.release()
                return
            if not mla:
                print("Nothing more to do here")
                block.release()
                return
            block.release()
            y += 1
            EY = 1980
            EM = 1
            stime = time.time()
            dFile = False
            if maildir:
                messages = mailbox.Maildir(tmpname)
            elif imap:
                y -= 1 # TODO don't understand the increment above
                imap4 = mla[2]
                def mailgen(_list):
                    for uid in _list:
                        msgbytes = imap4.uid('fetch', uid, '(RFC822)')[1][0][1]
                        yield email.message_from_bytes(msgbytes)
                messages = mailgen(mla[0])
                xlist_override = mla[1]
            elif filebased:
                
                tmpname = mla[0]
                filename = mla[0]
                xlist_override = mla[1]
                if filename.find(".gz") != -1:
                    print("Decompressing %s..." % filename)
                    try:
                        with open(filename, "rb") as bf:
                            bmd = bf.read()
                            bf.close()
                            bmd = gzip.decompress(bmd)
                            tmpfile = tempfile.NamedTemporaryFile(mode='w+b', buffering=1, delete=False)
                            tmpfile.write(bmd)
                            tmpfile.flush()
                            tmpfile.close()
                            tmpname = tmpfile.name
                            filename = tmpname
                            dFile = True # Slated for deletion upon having been read
                            print("%s -> %u bytes" % (tmpname, len(bmd)))
                    except Exception as err:
                        print("This wasn't a gzip file: %s" % err )
                print("Slurping %s" % filename)
                messages = mailbox.mbox(tmpname)

            else:
                ml = mla[0]
                mboxfile = mla[1]
                xlist_override = list_override
                print("Slurping %s/%s" % (ml, mboxfile))
                m = re.match(r"(\d\d\d\d)(\d\d)", mboxfile)
                EY = 1997
                EM = 1
                if m:
                    EY = int(m.group(1))
                    EM = int(m.group(2))
                ctx = urlopen("%s%s/%s" % (source, ml, mboxfile ))
                inp = ctx.read().decode(ctx.headers.get_content_charset() or 'utf-8', errors='ignore')
    
                tmpname = hashlib.sha224(("%f-%f-%s-%s.mbox" % (random.random(), time.time(), ml, mboxfile)).encode('utf-8') ).hexdigest()
                with open(tmpname, "w") as f:
                    f.write(inp)
                    f.close()
                messages = mailbox.mbox(tmpname)

            count = 0
            LEY = EY

            for message in messages:
                if resendTo:
                    print("Delivering message %s via MTA" % message['message-id'] if 'message-id' in message else '??')
                    s = SMTP('localhost')
                    try:
                        if list_override:
                            message.replace_header('List-ID', list_override)
                        message.replace_header('To', resendTo)
                    except KeyError:
                        if list_override:
                            message['List-ID'] = list_override
                    message['cc'] = None
                    s.send_message(message, from_addr=None, to_addrs=(resendTo))
                    continue
                if (time.time() - stime > timeout): # break out after N seconds, it shouldn't take this long..!
                    print("Whoa, this is taking way too long, ignoring %s for now" % tmpname)
                    break

                json, contents = foo.compute_updates(list_override, private, message)

                if json:
                    json_source = {
                        'mid': json['mid'],
                        'message-id': json['message-id'],
                        'source': message.as_bytes().decode('utf-8', errors='replace')
                    }

                    count += 1
                    ja.append(json)
                    jas.append(json_source)
                    if contents:
                        iname = config.get("elasticsearch", "dbname")
                        if not args.dry:
                            for key in contents:
                                es.index(
                                    index=iname,
                                    doc_type="attachment",
                                    id=key,
                                    body = {
                                        'source': contents[key]
                                    }
                                )
                    if len(ja) >= 40:
                        if not args.dry:
                            bulk = BulkThread()
                            bulk.assign(ja, es, 'mbox')
                            bulk.insert()
                        ja = []
                        
                        if not args.dry:
                            bulks = BulkThread()
                            bulks.assign(jas, es, 'mbox_source')
                            bulks.insert()
                        jas = []
                else:
                    baddies += 1

            if filebased:
                print("Parsed %u records from %s" % (count, filename))
                if dFile:
                    os.unlink(tmpname)
            elif imap:
                print("Parsed %u records from imap" % count)
            else:
                print("Parsed %s/%s: %u records from %s" % (ml, mboxfile, count, tmpname))
                os.unlink(tmpname)
                
            y += count
            if not args.dry:
                bulk = BulkThread()
                bulk.assign(ja, es)
                bulk.insert()
            ja = []
            
            if not args.dry:
                bulks = BulkThread()
                bulks.assign(jas, es, 'mbox_source')
                bulks.insert()
            jas = []
        print("Done, %u elements left to slurp" % len(lists))
Example No. 11
    def run(self):
        global block, y, es, lists, baddies, config, resendTo, timeout, dedupped, dedup
        self.name = Thread.getName(self)
        ja = []
        jas = []
        self.printid("Thread started")
        mla = None
        ml = ""
        mboxfile = ""
        filename = ""

        archie = archiver.Archiver(parseHTML=parseHTML)
    
        while len(lists) > 0:
            self.printid("%u elements left to slurp" % len(lists))

            block.acquire()
            try:
                mla = lists.pop(0)
                if not mla:
                    self.printid("Nothing more to do here")
                    return
            except Exception as err:
                self.printid("Could not pop list: %s" % err)
                return
            finally:
                block.release()

            EY = 1980
            EM = 1
            stime = time.time()
            dFile = False
            if maildir:
                messages = mailbox.Maildir(tmpname, create=False)
            elif imap:
                imap4 = mla[2]
                def mailgen(list):
                    for uid in list:
                        msgbytes = imap4.uid('fetch', uid, '(RFC822)')[1][0][1]
                        yield email.message_from_bytes(msgbytes)
                messages = mailgen(mla[0])
            elif filebased:
                
                tmpname = mla[0]
                filename = mla[0]
                if filename.find(".gz") != -1:
                    self.printid("Decompressing %s..." % filename)
                    try:
                        with open(filename, "rb") as bf:
                            bmd = bf.read()
                            bf.close() # explicit early close
                            bmd = gzip.decompress(bmd)
                            tmpfile = tempfile.NamedTemporaryFile(mode='w+b', buffering=1, delete=False)
                            tmpfile.write(bmd)
                            tmpfile.flush()
                            tmpfile.close()
                            tmpname = tmpfile.name
                            dFile = True # Slated for deletion upon having been read
                            self.printid("%s -> %u bytes" % (tmpname, len(bmd)))
                    except Exception as err:
                        self.printid("This wasn't a gzip file: %s" % err )
                self.printid("Slurping %s" % filename)
                messages = mailbox.mbox(tmpname, None if noMboxo else MboxoFactory, create=False)

            else:
                ml = mla[0]
                mboxfile = mla[1]
                self.printid("Slurping %s/%s" % (ml, mboxfile))
                m = re.match(r"(\d\d\d\d)(\d\d)", mboxfile)
                EY = 1997
                EM = 1
                if m:
                    EY = int(m.group(1))
                    EM = int(m.group(2))
                ctx = urlopen("%s%s/%s" % (source, ml, mboxfile ))
                inp = ctx.read().decode(ctx.headers.get_content_charset() or 'utf-8', errors='ignore')
    
                tmpname = hashlib.sha224(("%f-%f-%s-%s.mbox" % (random.random(), time.time(), ml, mboxfile)).encode('utf-8') ).hexdigest()
                with open(tmpname, "w") as f:
                    f.write(inp)
                messages = mailbox.mbox(tmpname, None if noMboxo else MboxoFactory, create=False)

            count = 0
            bad = 0
            LEY = EY
            
            
            for key in messages.iterkeys():
                message = messages.get(key)
                # If --filter is set, discard any messages not matching by continuing to next email
                if fromFilter and 'from' in message and message['from'].find(fromFilter) == -1:
                    continue
                if resendTo:
                    self.printid("Delivering message %s via MTA" % message['message-id'] if 'message-id' in message else '??')
                    s = SMTP('localhost')
                    try:
                        if list_override:
                            message.replace_header('List-ID', list_override)
                        message.replace_header('To', resendTo)
                    except KeyError:
                        if list_override:
                            message['List-ID'] = list_override
                    message['cc'] = None
                    s.send_message(message, from_addr=None, to_addrs=(resendTo))
                    continue
                if (time.time() - stime > timeout): # break out after N seconds, it shouldn't take this long..!
                    self.printid("Whoa, this is taking way too long, ignoring %s for now" % tmpname)
                    break

                # Don't pass message to archiver unless we have a list id
                if not (list_override or message['list-id']):
                    self.printid("No list id found for %s " % message['message-id'])
                    bad += 1
                    continue

                json, contents = archie.compute_updates(list_override, private, message)
                
                # Not sure this can ever happen
                if json and not (json['list'] and json['list_raw']):
                    self.printid("No list id found for %s " % json['message-id'])
                    bad += 1
                    continue

                # If --dedup is active, try to filter out any messages that already exist on the list
                if json and dedup and message.get('message-id', None):
                    res = es.search(
                        index=dbname,
                        doc_type="mbox",
                        size=1,
                        _source=['mid'],  # so can report the match source
                        body={
                            'query': {
                                'bool': {
                                    'must': [
                                        {
                                            'term': {
                                                'message-id': message.get('message-id', None)
                                            }
                                        },
                                        {
                                            'term': {
                                                'list_raw': json['list']
                                            }                                                 
                                        }
                                    ]
                                }
                            }
                        }
                    )
                    if res and res['hits']['total'] > 0:
                        self.printid("Dedupping %s - matched in %s" % (json['message-id'], res['hits']['hits'][0]['_source']['mid']))
                        dedupped += 1
                        continue

                if json:
                    file = messages.get_file(key, True)
                    # If the parsed data is filtered, also need to filter the raw input
                    # so the source agrees with the summary info
                    if message.__class__.__name__ == 'MboxoFactory':
                        file = MboxoReader(file)
                    raw_msg = file.read()
                    file.close()
                    if args.dups:
                        try:
                            duplicates[json['mid']].append(json['message-id'] + " in " + filename)
                        except KeyError:
                            duplicates[json['mid']] = [json['message-id'] + " in " + filename]

                    try: # temporary hack to try and find an encoding issue
                        # needs to be replaced by proper exception handling
                        json_source = {
                            'mid': json['mid'], # needed for bulk-insert only, not needed in database
                            'message-id': json['message-id'],
                            'source': archie.mbox_source(raw_msg)
                        }
                    except Exception as e:
                        self.printid("Error '%s' processing id %s msg %s " % (e, json['mid'], json['message-id']))
                        bad += 1
                        continue

                    count += 1
                    ja.append(json)
                    jas.append(json_source)
                    if contents:
                        if not args.dry:
                            for key in contents:
                                es.index(
                                    index=dbname,
                                    doc_type="attachment",
                                    id=key,
                                    body = {
                                        'source': contents[key]
                                    }
                                )
                    if len(ja) >= 40:
                        bulk = BulkThread()
                        bulk.assign(self.name, ja, es, 'mbox')
                        bulk.insert()
                        ja = []
                        
                        bulks = BulkThread()
                        bulks.assign(self.name, jas, es, 'mbox_source')
                        bulks.insert()
                        jas = []
                else:
                    self.printid("Failed to parse: Return=%s Message-Id=%s" % (message.get('Return-Path'), message.get('Message-Id')))
                    bad += 1

            if filebased:
                self.printid("Parsed %u records (failed: %u) from %s" % (count, bad, filename))
                if dFile:
                    os.unlink(tmpname)
            elif imap:
                self.printid("Parsed %u records (failed: %u) from imap" % (count, bad))
            else:
                self.printid("Parsed %s/%s: %u records (failed: %u) from %s" % (ml, mboxfile, count, bad, tmpname))
                os.unlink(tmpname)
                
            y += count
            baddies += bad
            if len(ja) > 0:
                bulk = BulkThread()
                bulk.assign(self.name, ja, es, 'mbox')
                bulk.insert()
            ja = []
            
            if len(jas) > 0:
                bulks = BulkThread()
                bulks.assign(self.name, jas, es, 'mbox_source')
                bulks.insert()
            jas = []
        self.printid("Done, %u elements left to slurp" % len(lists))
Example No. 12
    def run(self):
        global goodies, baddies, dedupped
        self.name = Thread.getName(self)
        ja = []
        jas = []
        self.printid("Thread started")
        mla = None
        ml = ""
        mboxfile = ""
        filename = ""
        archie = archiver.Archiver(generator=args.generator,
                                   parse_html=args.html2text,
                                   ignore_body=args.ibody,
                                   verbose=args.verbose)

        while len(lists) > 0:
            self.printid("%u elements left to slurp" % len(lists))

            block.acquire()
            try:
                mla = lists.pop(0)
                if not mla:
                    self.printid("Nothing more to do here")
                    return
            except Exception as err:
                self.printid("Could not pop list: %s" % err)
                return
            finally:
                block.release()

            stime = time.time()
            delete_file = False
            if imap:
                imap4 = mla[2]

                def mailgen(_list):
                    for uid in _list:
                        msgbytes = imap4.uid("fetch", uid, "(RFC822)")[1][0][1]
                        yield email.message_from_bytes(msgbytes)

                messages = mailgen(mla[0])
            elif filebased:

                tmpname = mla[0]
                filename = mla[0]
                if filename.find(".gz") != -1:
                    self.printid("Decompressing %s..." % filename)
                    try:
                        with open(filename, "rb") as bf:
                            bmd = bf.read()
                            bf.close()  # explicit early close
                            bmd = gzip.decompress(bmd)
                            tmpfile = tempfile.NamedTemporaryFile(mode="w+b",
                                                                  buffering=1,
                                                                  delete=False)
                            tmpfile.write(bmd)
                            tmpfile.flush()
                            tmpfile.close()
                            tmpname = tmpfile.name
                            delete_file = True  # Slated for deletion upon having been read
                            self.printid("%s -> %u bytes" %
                                         (tmpname, len(bmd)))
                    except Exception as err:
                        self.printid("This wasn't a gzip file: %s" % err)
                self.printid("Slurping %s" % filename)
                if maildir:
                    messages = mailbox.Maildir(tmpname, create=False)
                else:
                    messages = mailbox.mbox(tmpname,
                                            None if noMboxo else MboxoFactory,
                                            create=False)

            else:
                ml = mla[0]
                mboxfile = mla[1]
                self.printid("Slurping %s/%s" % (ml, mboxfile))
                ctx = urlopen("%s%s/%s" % (source, ml, mboxfile))
                inp = ctx.read().decode(ctx.headers.get_content_charset()
                                        or "utf-8",
                                        errors="ignore")

                tmpname = hashlib.sha224(
                    ("%f-%f-%s-%s.mbox" %
                     (random.random(), time.time(), ml,
                      mboxfile)).encode("utf-8")).hexdigest()
                with open(tmpname, "w") as f:
                    f.write(inp)
                if maildir:
                    messages = mailbox.Maildir(tmpname, create=False)
                else:
                    messages = mailbox.mbox(tmpname,
                                            None if noMboxo else MboxoFactory,
                                            create=False)

            count = 0
            bad = 0

            for key in messages.iterkeys():
                message = messages.get(key)
                file = messages.get_file(key, True)
                # If the parsed data is filtered, also need to filter the raw input
                # so the source agrees with the summary info
                if message.__class__.__name__ == "MboxoFactory":
                    file = MboxoReader(file)
                message_raw = file.read()
                file.close()
                # If --filter is set, discard any messages not matching by continuing to next email
                if (fromFilter and "from" in message
                        and message["from"].find(fromFilter) == -1):
                    continue
                if resendTo:
                    self.printid("Delivering message %s via MTA" %
                                 message["message-id"] if "message-id" in
                                 message else "??")
                    s = SMTP("localhost")
                    try:
                        if list_override:
                            message.replace_header("List-ID", list_override)
                        message.replace_header("To", resendTo)
                    except KeyError:
                        if list_override:
                            message["List-ID"] = list_override
                    message["cc"] = None
                    s.send_message(message,
                                   from_addr=None,
                                   to_addrs=(resendTo))
                    continue
                if (
                        time.time() - stime > timeout
                ):  # break out after N seconds, it shouldn't take this long..!
                    self.printid(
                        "Whoa, this is taking way too long, ignoring %s for now"
                        % tmpname)
                    break

                # Don't pass message to archiver unless we have a list id
                if not (list_override or message["list-id"]):
                    self.printid("No list id found for %s " %
                                 message["message-id"])
                    bad += 1
                    continue

                json, contents, _msgdata, _irt = archie.compute_updates(
                    list_override, private, message, message_raw)

                # Not sure this can ever happen
                if json and not (json["list"] and json["list_raw"]):
                    self.printid("No list id found for %s " %
                                 json["message-id"])
                    bad += 1
                    continue

                # If --dedup is active, try to filter out any messages that already exist on the list
                if json and dedup and message.get("message-id", None):
                    res = es.search(
                        index=es.db_mbox,
                        doc_type="_doc",
                        size=1,
                        _source=["mid"],  # so can report the match source
                        body={
                            "query": {
                                "bool": {
                                    "must": [
                                        {
                                            "term": {
                                                "message-id":
                                                message.get(
                                                    "message-id", None)
                                            }
                                        },
                                        {
                                            "term": {
                                                "list_raw": json["list"]
                                            }
                                        },
                                    ]
                                }
                            }
                        },
                    )
                    if res and res["hits"]["total"] > 0:
                        self.printid("Dedupping %s - matched in %s" % (
                            json["message-id"],
                            res["hits"]["hits"][0]["_source"]["mid"],
                        ))
                        dedupped += 1
                        continue

                if json:
                    if args.dups:
                        try:
                            duplicates[json["mid"]].append(json["message-id"] +
                                                           " in " + filename)
                        except KeyError:
                            duplicates[json["mid"]] = [
                                json["message-id"] + " in " + filename
                            ]

                    # Mark that we imported this email
                    json["_notes"] = [
                        x for x in json["_notes"] if "ARCHIVE:" not in x
                    ]  # Pop archiver.py note
                    json["_notes"].append([
                        "IMPORT: Email imported as %s at %u" %
                        (json["mid"], time.time())
                    ])

                    try:  # temporary hack to try and find an encoding issue
                        # needs to be replaced by proper exception handling
                        json_source = {
                            "permalinks": json["permalinks"],
                            "mid": json["dbid"],
                            "message-id": json["message-id"],
                            "source": archiver.mbox_source(message_raw),
                        }
                    except Exception as e:
                        self.printid("Error '%s' processing id %s msg %s " %
                                     (e, json["mid"], json["message-id"]))
                        bad += 1
                        continue

                    count += 1
                    ja.append(json)
                    jas.append(json_source)
                    if args.verbose and verbose_logger:
                        # TODO optionally show other fields (e.g. From_ line)
                        verbose_logger.info("MID:%(mid)s MSGID:%(message-id)s",
                                            json)
                    if contents:
                        if not args.dry:
                            for key in contents:
                                es.index(
                                    index=es.db_attachment,
                                    doc_type="_doc",
                                    id=key,
                                    body={"source": contents[key]},
                                )
                    if len(ja) >= 40 and not args.dry:
                        bulk_insert(self.name, ja, es, "mbox")
                        ja = []

                        bulk_insert(self.name, jas, es, "source")
                        jas = []
                else:
                    self.printid("Failed to parse: Return=%s Message-Id=%s" %
                                 (message.get("Return-Path"),
                                  message.get("Message-Id")))
                    bad += 1

            if filebased:
                self.printid("Parsed %u records (failed: %u) from %s" %
                             (count, bad, filename))
                if delete_file:
                    os.unlink(tmpname)
            elif imap:
                self.printid("Parsed %u records (failed: %u) from imap" %
                             (count, bad))
            else:
                self.printid("Parsed %s/%s: %u records (failed: %u) from %s" %
                             (ml, mboxfile, count, bad, tmpname))
                os.unlink(tmpname)

            goodies += count
            baddies += bad
            if len(ja) > 0 and not args.dry:
                bulk_insert(self.name, ja, es, "mbox")
            ja = []

            if len(jas) > 0 and not args.dry:
                bulk_insert(self.name, jas, es, "source")
            jas = []
        self.printid("Done, %u elements left to slurp" % len(lists))
Example No. 13
#! /usr/bin/env python

import warner
import archiver

import json

if __name__ == "__main__":
    warn_and_archive_warner = warner.Warner()
    warn_and_archive_archiver = archiver.Archiver()
    warn_and_archive_warner.warn()
    warn_and_archive_archiver.archive()
Example No. 14
import mailbox
import os
import sys

import yaml

from collections import namedtuple

TOOLS = os.path.join(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "tools")
sys.path.append(TOOLS)
import archiver

ARCHIVER = os.path.join(TOOLS, "archiver.py")
import generators

list_override = None  # could affect id
private = False  # does not affect id generation
parseHTML = False  # can this affect id generation?
GENS = generators.generator_names()

archie = archiver.Archiver(parse_html=parseHTML)
fake_args = namedtuple('fakeargs', ['verbose', 'ibody'])(False, None)

for arg in sys.argv[1:]:
    if arg.endswith('.yml') or arg.endswith('.yaml'):
        errors = 0
        with open(arg, 'r') as stream:
            data = yaml.safe_load(stream)
            for test in data['tests']:
                for file in test:
                    print("Testing with %s" % file)
                    mbox = mailbox.mbox(file, None, create=False)
                    scripts = test[file]
                    msgcnt = len(mbox)
                    scrcnt = len(scripts)
                    if msgcnt != scrcnt:
                        print(
                            "WARN: mbox contains %d messages, but there are %d unit tests"
                            % (msgcnt, scrcnt))
Example No. 15
    def test_tgz_guess_archive(self):
        self.create_compare(archiver.Archiver('tmp.tgz'), MD5SUM_TGZ)
Example No. 16
    def test_zip_guess_archive(self):
        self.create_compare(archiver.Archiver('tmp.zip'), MD5SUM_ZIP)
Example No. 17
"""

# PYTHONPATH is used to give access to archiver.py
# PYTHONPATH=../tools python3 generatortest.py generatortest.yaml

import mailbox
import archiver
import sys
import yaml
from pprint import pprint

list_override = None  # could affect id
private = False  # does not affect id generation
parseHTML = False  # can this affect id generation?

archie = archiver.Archiver(parseHTML=parseHTML)

for arg in sys.argv[1:]:
    if arg.endswith('.yml') or arg.endswith('.yaml'):
        with open(arg, 'r') as stream:
            data = yaml.safe_load(stream)
            for test in data['tests']:
                for file in test:
                    print("Testing with %s" % file)
                    mbox = mailbox.mbox(file, None, create=False)
                    scripts = test[file]
                    msgcnt = len(mbox)
                    scrcnt = len(scripts)
                    if msgcnt != scrcnt:
                        print(
                            "WARN: mbox contains %d messages, but there are %d unit tests"
                            % (msgcnt, scrcnt))