Esempio n. 1
0
 def render_messages(self):
     """Write the redacted body of every message into paged messageData files.

     Pages are ``self.page_size`` ids wide, starting at id 0 and continuing
     until the id of the latest message has been covered. Each page is written
     through ``dump_jsonp_records`` to ``data.messageData-<start>-<end>.js``.
     """
     last_id = self.db.get_latest_message()['_id']
     for page_start in range(0, last_id + 1, self.page_size):
         page_end = page_start + self.page_size
         eprint("Rendering messages %s to %s..." % (page_start, page_end))

         records = []
         for msg in self.db.yield_all_messages(start=page_start, end=page_end):
             records.append({
                 "id": msg['_id'],
                 "messageBody": self.apply_redactions(self.get_message_body(msg)),
             })

         page_filename = 'data.messageData-%s-%s.js' % (page_start, page_end)
         self.dump_jsonp_records(page_filename, records)
Esempio n. 2
0
 def render_index(self):
     """Write the message index (``data.index.js``).

     One record is produced per message yielded by
     ``self.db.yield_all_messages(start=self.redact_before)``. All textual
     fields pass through ``apply_redactions``; the sender address is masked
     with ``mask_email`` first.
     """
     eprint("Rendering index data...")

     index_records = []
     for msg in self.db.yield_all_messages(start=self.redact_before):
         subject = unescape_yahoo_html(msg.get('subject', '(unknown)'))
         record = {
             "id": msg['_id'],
             "subject": self.apply_redactions(subject),
             "authorName": self.apply_redactions(msg.get('authorName', '')),
             "profile": self.apply_redactions(msg.get('profile', '')),
             "from": self.apply_redactions(mask_email(msg.get('from', ''))),
             "timestamp": msg.get('postDate', 0),
         }
         index_records.append(record)

     self.dump_jsonp_records('data.index.js', index_records)
Esempio n. 3
0
    def dump_files(self):
        """Write every stored group file underneath ``self.files_dir``.

        Entries for which the database yields no data (``None``) are reported
        and skipped. The entry id is used as a '/'-separated path: the piece
        before the first '/' is dropped and every remaining piece is passed
        through ``sanitize_filename`` before the path is rebuilt.
        """
        eprint("Dumping group files...")
        for entry, chunks in self.db.yield_all_files():
            file_id = entry['_id']
            if chunks is None:
                eprint("Skipping '%s', have no data for this file..." % (file_id,))
                continue

            # Drop the leading piece, sanitize each path component, then rejoin.
            safe_parts = [sanitize_filename(piece) for piece in file_id.split('/')[1:]]
            target_path = P.join(self.files_dir, '/'.join(safe_parts))

            os.makedirs(P.dirname(target_path), exist_ok=True)
            with open(target_path, "wb") as out:
                out.writelines(chunks)
def merge_arguments(default_args, cfg_args, cmd_args):
    """Combine default, config-file and command-line arguments into one dict.

    Precedence increases from defaults, to the config file, to the command
    line. The input mappings are not modified; a new dict is returned.

    NOTE: A command-line argument is considered "passed" only when its value
    differs from the default for that key. An argument explicitly passed on
    the command line with the default value is therefore indistinguishable
    from one that was omitted, and the config file value (if any) wins.
    """
    merged = dict(default_args)
    merged.update(cfg_args)

    for name, value in cmd_args.items():
        if value != default_args.get(name):
            # Differs from the default, so treat it as explicitly given.
            merged[name] = value
            continue
        if name in cfg_args:
            eprint("Using '%s' from config file" % (name, ))

    return merged
Esempio n. 5
0
    def render_templates(self):
        """Render the modules/**/*.html into the template Cache.

        Every ``.html`` file found under ``<source_root_dir>/modules`` is read
        and emitted as a ``$templateCache.put(name, contents)`` line. The lines
        are substituted into the ``load-templates.js`` template and written to
        ``<dest_root_dir>/modules/core/load-templates.js``.

        Files are read and written as UTF-8 explicitly, so the result does not
        depend on the platform's locale encoding.
        """
        eprint("Rendering templates...")

        # Gather all templates before opening the output file, so a failure
        # while reading does not leave a truncated load-templates.js behind.
        cache_puts = []
        for dirpath, _, fns in os.walk(P.join(self.source_root_dir, 'modules')):
            for fn in fns:
                if not fn.endswith(".html"):
                    continue
                template_path = P.join(dirpath, fn)
                with open(template_path, "r", encoding="utf-8") as template_f:
                    data = template_f.read()
                cache_puts.append((template_filename(template_path), data))

        # json.dumps produces valid JavaScript string literals for both the
        # cache key and the template contents.
        put_lines = "\n".join(
            """    $templateCache.put(%s, %s);""" % (json.dumps(fn), json.dumps(data))
            for fn, data in cache_puts)

        loader_path = P.join(self.dest_root_dir, 'modules', 'core', 'load-templates.js')
        with open(loader_path, 'w', encoding="utf-8") as f:
            f.write(self.templates['load-templates.js'] % (put_lines, ))
Esempio n. 6
0
def command(arguments):
    """Dump a single message from the backup database into a standalone HTML file.

    The message is looked up by ``arguments['<message_id>']`` in the group
    named by ``arguments['<group_name>']``, and written to
    ``#<id> from <profile>.html`` in the current working directory.

    Exits with an error message (via ``sys.exit``) when no message with the
    requested id exists, instead of failing with a ``TypeError`` on ``None``.
    """
    import sys  # local import: this block cannot rely on a file-level one

    cli = pymongo.MongoClient(arguments['--mongo-host'],
                              arguments['--mongo-port'])
    ydb = YahooBackupDB(cli, arguments['<group_name>'])

    message_id = arguments['<message_id>']
    msg = ydb.db.messages.find_one({'_id': message_id})
    if msg is None:
        # find_one returns None when nothing matches the filter.
        sys.exit("Message '%s' was not found in the database" % (message_id, ))

    fn = '#%d from %s.html' % (msg['_id'], msg['profile'])

    eprint("Dumping message to '%s'..." % fn)

    with open(fn, 'w', encoding='utf8') as f:
        f.write("""\
<head>
<meta charset="UTF-8">
</head>
<body>

<div class="subject">%s</div>
<div class="body">%s</div>

</body>""" % (msg.get('subject', '(unknown)'), html_from_message(msg, True)))
Esempio n. 7
0
    def render_config(self):
        """Render the site configuration file (``data.config.js``).

        The latest message is fetched once, so ``lastMessageTime`` and
        ``lastMessageNumber`` always describe the same message and the
        database is not queried twice for the same record.

        After writing the config, a warning is printed when the database
        reports missing message ids.
        """
        eprint("Rendering config file...")

        latest_message = self.db.get_latest_message()
        self.dump_jsonp('data.config.js', {
            'groupName': self.group_name,
            'lastMessageTime': latest_message.get('postDate'),
            'lastMessageNumber': latest_message['_id'],
            'messageDbPageSize': self.page_size,
            # Current Unix time in whole seconds; differs on every render.
            'cacheBuster': int(time.time()),
        })

        missing_ids = self.db.missing_message_ids()
        if missing_ids:
            eprint("")
            eprint("WARNING! Backup is not complete, missing %s messages! Site will be incomplete." % (
                len(missing_ids),
            ))
            eprint("")
def command(arguments):
    """Download every group file that the database does not fully have yet.

    For each file reported by the scraper, the file is fetched and stored
    unless the database already has both its entry and its data.

    A download that returns an HTTP error status is reported and skipped
    rather than stored: saving the error page as the file's data would make
    the file look complete, so it would never be retried on a later run.
    """
    cli = pymongo.MongoClient(arguments['--mongo-host'], arguments['--mongo-port'])
    db = YahooBackupDB(cli, arguments['<group_name>'])
    scraper = YahooBackupScraper(
        arguments['<group_name>'], arguments['--driver'], arguments['--login'],
        arguments['--password'])

    for file_info in scraper.yield_walk_files():
        file_path = file_info['filePath']
        if db.has_file_entry(file_path) and db.has_file_data(file_path):
            eprint("Already had file '%s'" % file_path)
            continue

        eprint("Inserting file '%s'..." % file_path)
        response = requests.get(file_info['url'])
        if not response.ok:
            # Nothing is written, so the next run will try this file again.
            eprint("Failed to download file '%s' (HTTP status %s), skipping..." % (
                file_path, response.status_code))
            continue

        db.upsert_file_entry(file_info)
        db.update_file_data(file_path, response.content)

    eprint("Done processing all files!")
def command(arguments):
    """Scrape all group messages into the database, newest first.

    Walks from the scraper's last message number down to message 1. Messages
    the database already holds an updated copy of are skipped; every other
    message is fetched and upserted, including those the scraper returns
    nothing for (reported as missing). Skips are reported in batches.
    """
    client = pymongo.MongoClient(arguments['--mongo-host'],
                                 arguments['--mongo-port'])
    db = YahooBackupDB(client, arguments['<group_name>'])
    scraper = YahooBackupScraper(arguments['<group_name>'],
                                 arguments['--driver'], arguments['--login'],
                                 arguments['--password'])

    skipped_count = 0

    def print_skipped(threshold):
        # Report and reset the skip counter once it has reached `threshold`.
        nonlocal skipped_count
        if skipped_count < threshold:
            return
        eprint("Skipped %s messages we already processed" % skipped_count)
        skipped_count = 0

    last_message = scraper.get_last_message_number()
    message_number = last_message
    while message_number >= 1:
        if db.has_updated_message(message_number):
            skipped_count += 1
            print_skipped(1000)
        else:
            msg = scraper.get_message(message_number)
            # Upsert even when nothing was returned for this message.
            db.upsert_message(message_number, msg)
            if msg:
                eprint(
                    "Inserted message #%s by %s/%s/%s" %
                    (message_number, msg['authorName'], msg['profile'], msg['from']))
            else:
                eprint("Message #%s is missing" % (message_number, ))

        message_number -= 1

    # Flush whatever is left in the counter.
    print_skipped(0)
    eprint("All messages from the beginning up to #%s have been scraped!" %
           (last_message, ))
    def run(self):
        """Generate the site: code only, or the complete site with data.

        In code-only mode the template site is copied and only the templates
        and config are rendered. Otherwise the process exits early if node is
        unavailable or the destination directory already exists; if not, the
        whole site (index, messages, search indices, files) is produced and
        any messages that failed to render are listed at the end.
        """
        if self.code_only:
            eprint("Dumping code only...")
            for step in (self.copy_template_site, self.render_templates, self.render_config):
                step()
            return

        if not check_node():
            sys.exit(
                "node not found - node is required to generate the search indices"
            )

        if os.path.exists(self.dest_root_dir):
            sys.exit(
                "Root site directory already exists. Specify a new directory or delete the existing one."
            )

        self.copy_template_site()

        for directory in (self.data_dir, self.files_dir):
            os.makedirs(directory)

        render_steps = (
            self.render_templates,
            self.render_config,
            self.render_index,
            self.render_messages,
            self.render_search_indices,
            self.dump_files,
        )
        for step in render_steps:
            step()

        eprint("Site is ready in '%s'!" % self.dest_root_dir)

        failed = self.failed_render_messages
        if not failed:
            return

        eprint("")
        eprint(
            "NOTE: Failed to render the following messages from the raw email"
        )
        eprint("data. They may not have rendered properly.")
        eprint("")
        eprint("[%s]" % ", ".join(str(number) for number in sorted(failed)))
 def print_skipped(min):
     """Report the skipped-message count once it reaches ``min``, then reset it.

     Reads and resets the counter held in ``skipped[0]`` from the enclosing
     scope; does nothing while the counter is below ``min``.
     """
     count = skipped[0]
     if count < min:
         return
     eprint("Skipped %s messages we already processed" % count)
     skipped[0] = 0