def render_messages(self):
    """Write every message body into paged ``data.messageData-*.js`` files.

    Message ids from 0 up to the latest message's ``_id`` are walked in
    steps of ``self.page_size``; each step produces one JSONP records file
    named after its ``start``/``end`` bounds.
    """
    newest_id = self.db.get_latest_message()['_id']
    for page_start in range(0, newest_id + 1, self.page_size):
        page_end = page_start + self.page_size
        eprint("Rendering messages %s to %s..." % (page_start, page_end))

        target = 'data.messageData-%s-%s.js' % (page_start, page_end)
        records = []
        for msg in self.db.yield_all_messages(start=page_start, end=page_end):
            record = {"id": msg['_id']}
            # Redactions are applied to the rendered body, not the raw message.
            record["messageBody"] = self.apply_redactions(
                self.get_message_body(msg))
            records.append(record)

        self.dump_jsonp_records(target, records)
def render_index(self):
    """Write ``data.index.js``, one summary record per message.

    Messages are read starting at ``self.redact_before``. Every textual
    field is passed through ``self.apply_redactions``; the sender address
    is masked with ``mask_email`` first.
    """
    eprint("Rendering index data...")

    records = []
    for msg in self.db.yield_all_messages(start=self.redact_before):
        # Built key by key so that fields are evaluated and stored in a
        # fixed order: id, subject, authorName, profile, from, timestamp.
        record = {"id": msg['_id']}
        record["subject"] = self.apply_redactions(
            unescape_yahoo_html(msg.get('subject', '(unknown)')))
        record["authorName"] = self.apply_redactions(msg.get('authorName', ''))
        record["profile"] = self.apply_redactions(msg.get('profile', ''))
        record["from"] = self.apply_redactions(mask_email(msg.get('from', '')))
        record["timestamp"] = msg.get('postDate', 0)
        records.append(record)

    self.dump_jsonp_records('data.index.js', records)
def dump_files(self):
    """Copy every stored group file underneath ``self.files_dir``.

    Entries that come back without a data object are reported and skipped.
    The on-disk location is derived from the entry's ``_id``, with each
    path component passed through ``sanitize_filename``.
    """
    eprint("Dumping group files...")

    for entry, contents in self.db.yield_all_files():
        if contents is None:
            eprint("Skipping '%s', have no data for this file..." % (entry['_id'],))
            continue

        # The first piece of the split is dropped (the original comment
        # describes it as the empty piece); the rest are sanitized one by
        # one and re-joined with '/'.
        components = entry['_id'].split('/')[1:]
        relative_path = '/'.join(
            [sanitize_filename(component) for component in components])
        destination = P.join(self.files_dir, relative_path)

        os.makedirs(P.dirname(destination), exist_ok=True)
        with open(destination, "wb") as out_f:
            for chunk in contents:
                out_f.write(chunk)
def merge_arguments(default_args, cfg_args, cmd_args):
    """Combine default, config-file and command-line arguments.

    Precedence increases in the order default < config < command line.

    NOTE: A command-line value counts as "explicitly passed" only when it
    differs from the default for that key. A value passed on the command
    line that happens to equal the default is therefore indistinguishable
    from "not passed", and the config file wins for that key.

    Returns a new dict; none of the inputs are modified.
    """
    merged = dict(default_args)
    merged.update(cfg_args)

    for name, given in cmd_args.items():
        explicitly_passed = given != default_args.get(name)
        if explicitly_passed:
            merged[name] = given
            continue
        if name in cfg_args:
            # Command line left this at its default, so the config value stays.
            eprint("Using '%s' from config file" % (name, ))

    return merged
def render_templates(self):
    """Render the modules/**/*.html into the template cache script.

    Walks ``<source_root_dir>/modules`` for ``.html`` files, and writes
    ``<dest_root_dir>/modules/core/load-templates.js`` by filling the
    ``load-templates.js`` entry of ``self.templates`` with one
    ``$templateCache.put(...)`` line per template.

    Both the templates and the generated script are opened as UTF-8
    explicitly, so the result does not depend on the platform's default
    text encoding.
    """
    eprint("Rendering templates...")

    out_path = P.join(self.dest_root_dir, 'modules', 'core', 'load-templates.js')
    with open(out_path, 'w', encoding='utf8') as f:
        cache_puts = []
        for dirpath, _, fns in os.walk(P.join(self.source_root_dir, 'modules')):
            for fn in fns:
                if not fn.endswith(".html"):
                    continue

                template_path = P.join(dirpath, fn)
                with open(template_path, "r", encoding='utf8') as template_f:
                    data = template_f.read()
                cache_puts.append((template_filename(template_path), data))

        # json.dumps yields quoted, escaped string literals for the name
        # and the template body.
        f.write(self.templates['load-templates.js'] % (
            "\n".join(
                """ $templateCache.put(%s, %s);""" % (json.dumps(name), json.dumps(body))
                for name, body in cache_puts)
        ))
def command(arguments):
    """Dump a single stored message to an HTML file.

    The message is looked up by ``arguments['<message_id>']`` in the
    group's ``messages`` collection and written to
    ``'#<id> from <profile>.html'`` (a relative path, so it lands in the
    current working directory).

    Raises:
        SystemExit: if no message with the requested id exists.
    """
    cli = pymongo.MongoClient(arguments['--mongo-host'], arguments['--mongo-port'])
    ydb = YahooBackupDB(cli, arguments['<group_name>'])

    message_id = arguments['<message_id>']
    msg = ydb.db.messages.find_one({'_id': message_id})
    if msg is None:
        # find_one() returns None when nothing matches; exit with a clear
        # message instead of failing on msg['_id'] below.
        raise SystemExit("Message %r not found in group '%s'" % (
            message_id, arguments['<group_name>']))

    fn = '#%d from %s.html' % (msg['_id'], msg['profile'])

    eprint("Dumping message to '%s'..." % fn)
    with open(fn, 'w', encoding='utf8') as f:
        f.write("""\
<head>
<meta charset="UTF-8">
</head>
<body>
<div class="subject">%s</div>
<div class="body">%s</div>
</body>""" % (msg.get('subject', '(unknown)'), html_from_message(msg, True)))
def render_config(self):
    """Render the site configuration file (``data.config.js``).

    After writing the config, warns on stderr if
    ``self.db.missing_message_ids()`` reports any missing messages.
    """
    eprint("Rendering config file...")

    # Fetch the latest message once so that the time and the number below
    # are guaranteed to describe the same message, and the lookup is not
    # repeated.
    latest_message = self.db.get_latest_message()

    self.dump_jsonp('data.config.js', {
        'groupName': self.group_name,
        'lastMessageTime': latest_message.get('postDate'),
        'lastMessageNumber': latest_message['_id'],
        'messageDbPageSize': self.page_size,
        # Changes on every render (current Unix time, whole seconds).
        'cacheBuster': int(time.time()),
    })

    missing_ids = self.db.missing_message_ids()
    if missing_ids:
        eprint("")
        eprint("WARNING! Backup is not complete, missing %s messages! Site will be incomplete." % (
            len(missing_ids),
        ))
        eprint("")
def command(arguments):
    """Scrape the group's files and store them in the backup database.

    For every file reported by the scraper, the file is downloaded and
    stored unless the database already has both an entry and data for its
    ``filePath``.
    """
    client = pymongo.MongoClient(arguments['--mongo-host'], arguments['--mongo-port'])
    db = YahooBackupDB(client, arguments['<group_name>'])
    scraper = YahooBackupScraper(
        arguments['<group_name>'],
        arguments['--driver'],
        arguments['--login'],
        arguments['--password'],
    )

    for info in scraper.yield_walk_files():
        # The data check is only made when the entry check succeeds.
        if db.has_file_entry(info['filePath']) and db.has_file_data(info['filePath']):
            eprint("Already had file '%s'" % info['filePath'])
            continue

        eprint("Inserting file '%s'..." % info['filePath'])
        payload = requests.get(info['url']).content
        db.upsert_file_entry(info)
        db.update_file_data(info['filePath'], payload)

    eprint("Done processing all files!")
def command(arguments):
    """Scrape the group's messages, newest first, into the backup database.

    Walks message numbers from the scraper's last message number down to 1.
    Messages the database already reports as updated are skipped (with a
    periodic summary on stderr); all others are fetched and upserted, even
    when the scraper returns nothing for them.
    """
    client = pymongo.MongoClient(arguments['--mongo-host'], arguments['--mongo-port'])
    db = YahooBackupDB(client, arguments['<group_name>'])
    scraper = YahooBackupScraper(
        arguments['<group_name>'],
        arguments['--driver'],
        arguments['--login'],
        arguments['--password'],
    )

    skipped_count = 0

    def report_skipped(threshold):
        """Print and reset the skip counter once it reaches ``threshold``."""
        nonlocal skipped_count
        if skipped_count >= threshold:
            eprint("Skipped %s messages we already processed" % skipped_count)
            skipped_count = 0

    newest_number = scraper.get_last_message_number()
    current = newest_number
    while current >= 1:
        if db.has_updated_message(current):
            skipped_count += 1
            # Only report every 1000 skips to keep the output readable.
            report_skipped(1000)
        else:
            msg = scraper.get_message(current)
            db.upsert_message(current, msg)
            if msg:
                eprint(
                    "Inserted message #%s by %s/%s/%s" %
                    (current, msg['authorName'], msg['profile'], msg['from']))
            else:
                eprint("Message #%s is missing" % (current, ))
        current -= 1

    # Flush whatever is left in the skip counter.
    report_skipped(0)
    eprint("All messages from the beginning up to #%s have been scraped!" % (newest_number, ))
def run(self):
    """Generate the static site.

    With ``self.code_only`` set, only the template site, the template cache
    and the config file are produced. Otherwise the full site is built into
    ``self.dest_root_dir``, which must not exist yet, and ``check_node()``
    must succeed.
    """
    if self.code_only:
        eprint("Dumping code only...")
        for step in (self.copy_template_site,
                     self.render_templates,
                     self.render_config):
            step()
        return

    if not check_node():
        sys.exit(
            "node not found - node is required to generate the search indices"
        )

    if os.path.exists(self.dest_root_dir):
        sys.exit(
            "Root site directory already exists. Specify a new directory or delete the existing one."
        )

    self.copy_template_site()
    for directory in (self.data_dir, self.files_dir):
        os.makedirs(directory)

    for step in (self.render_templates,
                 self.render_config,
                 self.render_index,
                 self.render_messages,
                 self.render_search_indices,
                 self.dump_files):
        step()

    eprint("Site is ready in '%s'!" % self.dest_root_dir)

    if not self.failed_render_messages:
        return

    # List the messages that could not be rendered from the raw email data.
    failed_ids = ", ".join(str(msg_id) for msg_id in sorted(self.failed_render_messages))
    eprint("")
    eprint(
        "NOTE: Failed to render the following messages from the raw email"
    )
    eprint("data. They may not have rendered properly.")
    eprint("")
    eprint("[%s]" % failed_ids)
def print_skipped(min):
    """Report and reset the skip counter once it has reached ``min``.

    Reads and resets ``skipped[0]``, a one-element list taken from the
    enclosing scope. Does nothing while the counter is below ``min``.
    """
    # NOTE: the parameter name shadows the builtin ``min``; it is kept
    # because it is part of this function's signature.
    pending = skipped[0]
    if pending < min:
        return

    eprint("Skipped %s messages we already processed" % pending)
    skipped[0] = 0