def write_sql(self):
    """Convert the XML file of log items into SQL output.

    Reads self.xml_file, skips past the mediawiki/siteinfo header, then
    converts log items one at a time until end of file, writing results
    to self.log_out_file and, if configured, self.user_out_file.

    Raises:
        WikiContentErr -- if the end of the mediawiki/siteinfo header
                          cannot be found in the input file.
    """
    # NOTE(review): purpose of seeding the user dict with {1: True} is
    # not visible from this block; preserved as-is.
    self.user_dict = {1: True}
    fd = File.open_input(self.xml_file)
    logout_fd = File.open_output(self.log_out_file)
    if self.user_out_file:
        userout_fd = File.open_output(self.user_out_file)
    else:
        userout_fd = None
    # FIX: close all files even if header skipping or item conversion
    # raises partway through (the original leaked the handles on error).
    try:
        if not self.skip_header(fd):
            raise WikiContentErr(
                "failed to find end of mediawiki/siteinfo header in xml file\n"
            )
        eof = False
        while not eof:
            eof = self.do_log_item(fd, logout_fd, userout_fd)
    finally:
        fd.close()
        logout_fd.close()
        # FIX: guard on the handle itself rather than the option string
        if userout_fd is not None:
            userout_fd.close()
    return
def do_main():
    """Parse command-line options and run the requested pipeline steps:

    retrieve titles -> convert titles -> retrieve content -> make stubs
    -> convert xml to sql -> filter sql tables against page ids.

    Each step can be skipped with its --no<step> flag, in which case the
    files it would have produced must be supplied via the file options.
    Exits the process when done (or via usage() on bad arguments).
    """
    o = {}  # stash all opt vars in here

    # init main opt vars
    for opt in ['template', 'sql_files', 'mw_version', 'output_dir',
                'username', 'password']:
        o[opt] = None
    o['project'] = "wikipedia"
    o['lang_code'] = "en"
    o['batch_size'] = 500
    cwd = Path(os.getcwd())
    o['sqlfilter'] = cwd.make_path("sqlfilter")
    o['wcr'] = cwd.make_path("wikiretriever.py")
    o['mwxml2sql'] = cwd.make_path("mwxml2sql")

    # init step opt vars (all steps run by default)
    for opt in ['retrieve_titles', 'convert_titles', 'retrieve_content',
                'make_stubs', 'convert_xml', 'filter_sql']:
        o[opt] = True

    # init file opt vars
    for opt in ['titles_path', 'mediawiki_titles_path', 'module_titles_path',
                'template_titles_path', 'main_titles_with_prefix_path',
                'tmpl_titles_with_prefix_path', 'main_content_path',
                'template_content_path', 'content_path', 'stubs_path',
                'page_ids_path']:
        o[opt] = None
    verbose = False

    # option handling
    main_options = ["template=", "sqlfiles=", "mwversion=", "lang=",
                    "project=", "batchsize=", "output=", "auth="]
    cmd_options = ["sqlfilter=", "mwxml2sql=", "wcr="]
    steps = ["retrievetitles", "converttitles", "retrievecontent",
             "makestubs", "convertxml", "filtersql"]
    skip_step_flags = ["no" + s for s in steps]
    convert_titles_options = ["titles=", "mwtitles=", "mdltitles=",
                              "tmpltitles="]
    retrieve_content_options = ["titleswithprefix=", "tmpltitleswithprefix="]
    make_stubs_options = ["maincontent=", "tmplcontent=", "content="]
    convert_xml_filter_sql_options = ["stubs=", "pageids="]
    files = [fopt[:-1] for fopt in
             convert_titles_options + retrieve_content_options +
             make_stubs_options + convert_xml_filter_sql_options]
    misc_flags = ["verbose", "help", "extendedhelp"]
    all_options = (main_options + cmd_options + skip_step_flags +
                   convert_titles_options + retrieve_content_options +
                   make_stubs_options + convert_xml_filter_sql_options +
                   misc_flags)
    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", all_options)
    except getopt.GetoptError as e:
        usage(e.msg)

    for (opt, val) in options:
        # main opts
        if opt == "--template":
            o['template'] = val
        elif opt == "--sqlfiles":
            o['sql_files'] = val
        elif opt == "--mwversion":
            o['mw_version'] = val
        elif opt == "--lang":
            o['lang_code'] = val
        elif opt == "--project":
            o['project'] = val
        elif opt == "--batchsize":
            if not val.isdigit():
                usage("batch size must be a number")
            o['batch_size'] = int(val)
        elif opt == "--output":
            o['output_dir'] = val
        elif opt == "--auth":
            if ':' in val:
                # FIX: split on the first colon only, so passwords that
                # themselves contain ':' don't crash the unpack
                o['username'], o['password'] = val.split(':', 1)
            else:
                o['username'] = val
        # command opts
        elif opt == "--sqlfilter":
            o['sqlfilter'] = val
        elif opt == "--mwxml2sql":
            o['mwxml2sql'] = val
        elif opt == "--wcr":
            o['wcr'] = val
        # step options
        elif opt.startswith("--no"):
            process_step_option(opt[4:], o)
        # file options
        elif opt[2:] in files:
            process_file_option(opt[2:], val, o)
        # misc flags
        elif opt == "--verbose":
            verbose = True
        elif opt == "--help":
            usage("Options help:\n")
        elif opt == "--extendedhelp":
            usage("Options help:\n", True)
        else:
            usage("Unknown option specified: %s" % opt)

    if len(remainder) > 0:
        usage("Unknown option specified: <%s>" % remainder[0])

    # output files will have this date in their names
    date = time.strftime("%Y-%m-%d-%H%M%S", time.gmtime(time.time()))
    out = Path(o['output_dir'], o['lang_code'], o['project'], date)

    # processing begins
    if o['retrieve_titles']:
        if not o['wcr']:
            usage("in retrieve_titles: Missing mandatory option wcr.")
        if not o['template']:
            usage("in retrieve_titles: Missing mandatory option template.")
        if ':' not in o['template']:
            usage("in retrieve_titles: template option should start with 'Template:' " +
                  "or the equivalent in the wiki's language")
        if not o['mw_version']:
            usage("in retrieve_titles: Missing mandatory option mwversion.")
        if verbose:
            sys.stderr.write("Retrieving page titles from wiki\n")

        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'],
                      o['project'], verbose)
        if not o['titles_path']:
            # get titles corresponding to the template
            o['titles_path'] = r.get_titles_embedded_in(
                o['template'], out.make_file("main-titles.gz"))
            if verbose:
                sys.stderr.write("main content titles file produced: <%s>\n"
                                 % o['titles_path'])
        if not o['mediawiki_titles_path']:
            # get the mediawiki page titles
            o['mediawiki_titles_path'] = r.get_titles_in_namespace(
                "8", out.make_file("mw-titles.gz"))
            if verbose:
                sys.stderr.write("mediawiki titles file produced: <%s>\n"
                                 % o['mediawiki_titles_path'])
        if not o['module_titles_path']:
            # get the module (lua) page titles
            o['module_titles_path'] = r.get_titles_in_namespace(
                "828", out.make_file("mod-titles.gz"))
            if verbose:
                sys.stderr.write("modules (lua) titles file produced: <%s>\n"
                                 % o['module_titles_path'])
        if not o['template_titles_path']:
            # get the template page titles
            o['template_titles_path'] = r.get_titles_in_namespace(
                "10", out.make_file("tmpl-titles.gz"))
            if verbose:
                sys.stderr.write("templates titles file produced: <%s>\n"
                                 % o['template_titles_path'])
        if verbose:
            sys.stderr.write("Done retrieving page titles from wiki, have " +
                             "%s, %s, %s and %s\n" % (
                                 o['titles_path'], o['mediawiki_titles_path'],
                                 o['module_titles_path'],
                                 o['template_titles_path']))

    if o['convert_titles']:
        if (not o['titles_path'] or not o['mediawiki_titles_path'] or
                not o['module_titles_path'] or not o['template_titles_path']):
            usage("Missing mandatory option for skipping previous step.", True)
        if not o['wcr']:
            usage("Missing mandatory option wcr.")
        if verbose:
            sys.stderr.write("Converting retrieved titles \n")

        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'],
                      o['project'], verbose)
        # get namespaces from the api
        ns_dict = r.get_ns_dict()
        ns_dict_by_string = {}
        for nsnum in ns_dict.keys():
            ns_dict_by_string[ns_dict[nsnum]] = nsnum
        if verbose:
            sys.stderr.write("namespace dicts assembled\n")

        # get list of titles with prefix, not the talk pages but the actual
        # ones, (for use for download) - without dups
        # also create a hash with title, list of ns for this title (it will
        # have at least one entry in the list)
        t = Titles(ns_dict, ns_dict_by_string)
        # check main, file, category, project talk namespaces and convert to
        # main, file, category, project talk namespaces
        t.add_related_titles_from_file(o['titles_path'],
                                       ["1", "5", "7", "15"],
                                       ["0", "4", "6", "14"])
        if verbose:
            sys.stderr.write("page title hash assembled\n")
        t.add_titles_from_file(o['mediawiki_titles_path'], "8")
        if verbose:
            sys.stderr.write("mediawiki titles added to page title hash\n")
        t.add_titles_from_file(o['module_titles_path'], "828")
        if verbose:
            sys.stderr.write("module titles added to page title hash\n")
        t.add_titles_from_file(o['template_titles_path'], "10")
        if verbose:
            sys.stderr.write("template titles added to page title hash\n")
        t.uniq()

        o['main_titles_with_prefix_path'] = out.make_path(
            "main-titles-with-nsprefix.gz")
        out_fd = File.open_output(o['main_titles_with_prefix_path'])
        for line in t.list:
            out_fd.write(line + "\n")
        out_fd.close()

        o['tmpl_titles_with_prefix_path'] = out.make_path(
            "tmpl-titles-with-nsprefix.gz")
        out_fd = File.open_output(o['tmpl_titles_with_prefix_path'])
        for line in t.list_templates:
            out_fd.write(line + "\n")
        out_fd.close()
        if verbose:
            sys.stderr.write("Done converting retrieved titles, have %s and %s\n"
                             % (o['main_titles_with_prefix_path'],
                                o['tmpl_titles_with_prefix_path']))

    if o['retrieve_content']:
        if not o['main_titles_with_prefix_path'] or not o['tmpl_titles_with_prefix_path']:
            usage("in retrieve_content: Missing mandatory option for skipping previous step.", True)
        if verbose:
            sys.stderr.write("Retrieving page content from wiki\n")
        # FIX: the original reused 'r' from an earlier step, which raised
        # NameError when both retrievetitles and converttitles were skipped;
        # create the Retriever this step needs here.
        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'],
                      o['project'], verbose)
        if not o['template_content_path']:
            # filter out the template titles from the
            # main_titles_with_prefix_path file and just download the rest
            o['template_content_path'] = r.get_content(
                o['tmpl_titles_with_prefix_path'],
                out.make_file("template-content.gz"))
            if verbose:
                sys.stderr.write("content retrieved from template page titles\n")
        if not o['main_content_path']:
            o['main_content_path'] = r.get_content(
                o['main_titles_with_prefix_path'],
                out.make_file("rest-content.gz"))
            if verbose:
                sys.stderr.write("content retrieved from page titles\n")
        o['content_path'] = out.make_path("content.gz")
        File.combine_xml([o['template_content_path'], o['main_content_path']],
                         o['content_path'])
        if verbose:
            sys.stderr.write("Done retrieving page content from wiki, have %s, %s and %s\n"
                             % (o['template_content_path'],
                                o['main_content_path'], o['content_path']))

    if o['make_stubs']:
        if not o['content_path']:
            usage("in make_stubs: Missing mandatory option for skipping previous step.", True)
        if verbose:
            sys.stderr.write("Generating stub XML file and pageids file from downloaded content\n")
        s = Stubber(o['output_dir'], verbose)
        # generate stub XML file for converting sql and list of page ids
        # for filtering sql
        o['stubs_path'] = out.make_path("stubs.gz")
        o['page_ids_path'] = out.make_path("pageids.gz")
        s.write_stub_and_page_ids(o['content_path'], o['stubs_path'],
                                  o['page_ids_path'])
        if verbose:
            sys.stderr.write("Done generating stub XML file and pageids file from " +
                             "downloaded content, have %s and %s\n" % (
                                 o['stubs_path'], o['page_ids_path']))

    if o['convert_xml']:
        if not o['content_path']:
            usage("in convert_xml: Missing mandatory option for skipping previous step.", True)
        # FIX: this step also consumes the stubs file; validate it instead
        # of passing None to the converter when makestubs was skipped
        if not o['stubs_path']:
            usage("in convert_xml: Missing mandatory option for skipping previous step.", True)
        if not o['mwxml2sql']:
            usage("in convert_xml: Missing mandatory option mwxml2sql.")
        if verbose:
            sys.stderr.write("Converting content to page, revision, text tables\n")
        c = Converter(o['mwxml2sql'], o['output_dir'], verbose)
        # convert the content file to page, revision and text tables
        c.convert_content(o['content_path'], o['stubs_path'], o['mw_version'])
        if verbose:
            sys.stderr.write("Done converting content to page, revision, text tables\n")

    if o['filter_sql']:
        if not o['page_ids_path']:
            usage("in filter_sql: Missing mandatory option for skipping previous step.", True)
        if not o['sql_files']:
            usage("in filter_sql: Missing mandatory option sqlfiles.")
        if not o['sqlfilter']:
            usage("in filter_sql: Missing mandatory option sqlfilter.")
        if verbose:
            sys.stderr.write("Filtering sql tables against page ids for import\n")
        f = Filter(o['sqlfilter'], o['output_dir'], verbose)
        # filter all the sql tables (which should be in some nice directory)
        # against the pageids in page_ids_path file
        for table in ["categorylinks", "externallinks", "imagelinks",
                      "interwiki", "iwlinks", "langlinks", "page_props",
                      "page_restrictions", "pagelinks", "protected_titles",
                      "redirect", "templatelinks"]:
            sql_filename = o['sql_files'].format(t=table)
            filtered_filename = os.path.basename(sql_filename)
            f.filter(sql_filename, filtered_filename, o['page_ids_path'])
        if verbose:
            sys.stderr.write("Done filtering sql tables against page ids for import\n")

        # the one file we can't filter, it's not by pageid as categories
        # might not have pages so we'll have to import it wholesale...
        # (or you can ignore them completely)
        sql_filename = o['sql_files'].format(t='category')
        new_filename = os.path.join(o['output_dir'],
                                    os.path.basename(sql_filename))
        if verbose:
            sys.stderr.write("about to copy %s to %s\n"
                             % (sql_filename, new_filename))
        shutil.copyfile(sql_filename, new_filename)

    if verbose:
        sys.stderr.write("Done!\n")
    sys.exit(0)
def write_stub_and_page_ids(self, content_path, stubs_path, page_ids_path):
    """Write an XML stub file (omitting text content) and a list of
    page ids, from a MediaWiki XML page content file.

    Arguments:
    content_path  -- path to the XML page content file to read
    stubs_path    -- path to the stubs file to write
    page_ids_path -- path to the page ids file to write"""
    # FIX: raw strings, so the \s escapes are regex tokens rather than
    # invalid string-literal escapes (a warning on modern Python)
    compiled_page_pattern = re.compile(r"^\s*<page>")
    compiled_revision_pattern = re.compile(r"^\s*<revision>")
    compiled_id_pattern = re.compile(r"^\s*<id>(?P<i>.+)</id>\s*\n$")
    compiled_text_pattern = re.compile(
        r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')

    in_fd = File.open_input(content_path)
    out_fd = File.open_output(stubs_path)
    outpage_id_fd = File.open_output(page_ids_path)
    # state for the line scan below
    current_text_id = None   # revision id, reused as the stub <text> id
    expect_rev_id = False    # next <id> tag belongs to a revision
    expect_page_id = False   # next <id> tag belongs to a page
    # FIX: close all three files even if reading/writing raises partway
    try:
        for line in in_fd:
            # FIXME we could jus calculate text len if the output is missing
            # the bytes attr. (as in dumps not from Special:Export)
            # format in content file:
            # <text <text xml:space="preserve" bytes="78">
            # format wanted for stubs file:
            # <text id="11248" bytes="9" />
            if '<' not in line:
                # these are lines of text, we can skip them
                continue
            result = compiled_text_pattern.match(line)
            if result:
                # replace the text body with a self-closing stub tag
                line = result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                    current_text_id, result.group("b"))
                out_fd.write(line)
                continue
            elif '</text' in line:
                continue
            result = compiled_page_pattern.match(line)
            if result:
                expect_page_id = True
                out_fd.write(line)
                continue
            result = compiled_revision_pattern.match(line)
            if result:
                expect_rev_id = True
                out_fd.write(line)
                continue
            if expect_page_id:
                result = compiled_id_pattern.match(line)
                if result:
                    outpage_id_fd.write("1:%s\n" % result.group("i"))
                    expect_page_id = False
                out_fd.write(line)
                continue
            if expect_rev_id:
                result = compiled_id_pattern.match(line)
                if result:
                    current_text_id = result.group("i")
                    expect_rev_id = False
                out_fd.write(line)
                continue
            out_fd.write(line)
    finally:
        in_fd.close()
        out_fd.close()
        outpage_id_fd.close()