import getopt
import os
import re
import shutil
import sys
import time

# NOTE: these snippets are excerpts from a larger module; File, Path,
# Retriever, Titles, Stubber, Converter, Filter, WikiContentErr, usage,
# process_step_option and process_file_option are defined elsewhere in it.


def write_sql(self):
    self.user_dict = {1: True}
    fd = File.open_input(self.xml_file)
    logout_fd = File.open_output(self.log_out_file)
    if self.user_out_file:
        userout_fd = File.open_output(self.user_out_file)
    else:
        userout_fd = None
    if not self.skip_header(fd):
        raise WikiContentErr(
            "failed to find end of mediawiki/siteinfo header in xml file\n")
    eof = False
    while not eof:
        eof = self.do_log_item(fd, logout_fd, userout_fd)
    fd.close()
    logout_fd.close()
    if self.user_out_file:
        userout_fd.close()
    return
def add_related_titles_from_file(self, filename, related_ns_list, ns_list):
    """Read a list of titles from a file; for titles in one of the
    specified talk namespaces, convert the title to one from its
    related subject namespace (i.e. if it was in Category talk,
    convert to Category; if it was in File talk, convert to File,
    etc.) and add it to the title list and dict.

    Arguments:
    filename        -- full path to list of titles
    related_ns_list -- list of (talk) namespaces to convert from,
                       e.g. ["5", "7", "13"]
    ns_list         -- list of namespaces to keep as-is, in the same
                       order as related_ns_list, e.g. ["4", "6", "12"]"""
    # don't pass templates in here, we do those separately
    # because it could be a huge list and we want the user
    # to be able to save and reuse it
    fd = File.open_input(filename)
    for line in fd:
        line = line.strip()
        sep = line.find(":")
        if sep != -1:
            prefix = line[:sep]
            if prefix in self.ns_dict_by_string:
                # main, file, category, project talk namespaces
                if self.ns_dict_by_string[prefix] in related_ns_list:
                    no_prefix_title = line[sep + 1:]
                    # convert to file, category, project namespace
                    related_ns = str(int(self.ns_dict_by_string[prefix]) - 1)
                    if self.ns_dict[related_ns]:
                        new_title = (self.ns_dict[related_ns] + ":" +
                                     no_prefix_title)
                    else:
                        new_title = no_prefix_title  # main namespace titles
                    self.list.append(new_title)
                    if no_prefix_title in self.dict:
                        self.dict[no_prefix_title][related_ns] = True
                    else:
                        self.dict[no_prefix_title] = {related_ns: True}
                # file, category, project namespaces
                elif self.ns_dict_by_string[prefix] in ns_list:
                    ns = self.ns_dict_by_string[prefix]
                    no_prefix_title = line[sep + 1:]
                    self.list.append(no_prefix_title)
                    if no_prefix_title in self.dict:
                        self.dict[no_prefix_title][ns] = True
                    else:
                        self.dict[no_prefix_title] = {ns: True}
        elif "0" in ns_list:
            # main namespace, won't be caught above
            self.list.append(line)
            if line in self.dict:
                self.dict[line]["0"] = True
            else:
                self.dict[line] = {"0": True}
    fd.close()
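# A worked example of the talk-to-subject conversion above, as a standalone
# sketch with a toy namespace table (real tables come from the wiki API via
# get_ns_dict; the sample title is made up):
toy_ns_dict = {"14": "Category", "15": "Category talk"}
line = "Category talk:Physics"
prefix, no_prefix_title = line.split(":", 1)
related_ns = str(int("15") - 1)  # MediaWiki: talk ns number = subject ns + 1
print(toy_ns_dict[related_ns] + ":" + no_prefix_title)  # Category:Physics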
def get_titles_dict(self, sql_file):
    """Arguments:
    sql_file -- file containing lines of the form
                pageid whitespace nsnum whitespace pagetitle
                where the title is expected to be sql escaped and may
                be enclosed in single quotes"""
    fd = File.open_input(sql_file)
    t = {}
    for line in fd:
        # maxsplit=2, so any further whitespace stays inside the title
        (pageid, ns, title) = line.rstrip("\n").split(' ', 2)
        ns = int(ns)
        if title in t:
            t[title][ns] = pageid
        else:
            t[title] = {ns: pageid}
    fd.close()
    return t
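# A quick sketch of the per-line parsing done by get_titles_dict above,
# on a single hypothetical input line:
sample = "12 4 'Main_Page'\n"
pageid, ns, title = sample.rstrip("\n").split(' ', 2)
print({title: {int(ns): pageid}})  # {"'Main_Page'": {4: '12'}}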
def add_titles_from_file(self, filename, ns):
    """Add titles from a file to the title list and dict. Note that
    template titles get added to a different title list than the rest,
    for separate processing.

    Arguments:
    filename -- full path to file containing page titles
    ns       -- number (string of digits) of namespace of page titles
                to grab from file"""
    fd = File.open_input(filename)
    prefix = self.ns_dict[ns] + ":"
    prefix_len = len(prefix)
    for line in fd:
        if line.startswith(prefix):
            if ns == "10":  # special case, bleah
                self.list_templates.append(line[:-1])  # lose newline
            else:
                self.list.append(line[:-1])  # lose newline
            no_prefix_title = line[prefix_len:-1]
            if no_prefix_title in self.dict:
                self.dict[no_prefix_title][ns] = True
            else:
                self.dict[no_prefix_title] = {ns: True}
    fd.close()
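# Sketch of the prefix matching in add_titles_from_file above, with a toy
# namespace table (the module title is just an example):
toy_ns_dict = {"828": "Module"}
line = "Module:Citation/CS1\n"
prefix = toy_ns_dict["828"] + ":"
if line.startswith(prefix):
    print(line[len(prefix):-1])  # Citation/CS1 -- prefix and newline stripped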
def do_main():
    o = {}  # stash all opt vars in here

    # init main opt vars
    for opt in ['template', 'sql_files', 'mw_version', 'output_dir',
                'username', 'password']:
        o[opt] = None
    o['project'] = "wikipedia"
    o['lang_code'] = "en"
    o['batch_size'] = 500
    cwd = Path(os.getcwd())
    o['sqlfilter'] = cwd.make_path("sqlfilter")
    o['wcr'] = cwd.make_path("wikiretriever.py")
    o['mwxml2sql'] = cwd.make_path("mwxml2sql")

    # init step opt vars
    for opt in ['retrieve_titles', 'convert_titles', 'retrieve_content',
                'make_stubs', 'convert_xml', 'filter_sql']:
        o[opt] = True

    # init file opt vars
    for opt in ['titles_path', 'mediawiki_titles_path', 'module_titles_path',
                'template_titles_path', 'main_titles_with_prefix_path',
                'tmpl_titles_with_prefix_path', 'main_content_path',
                'template_content_path', 'content_path', 'stubs_path',
                'page_ids_path']:
        o[opt] = None

    verbose = False

    # option handling
    main_options = ["template=", "sqlfiles=", "mwversion=", "lang=",
                    "project=", "batchsize=", "output=", "auth="]
    cmd_options = ["sqlfilter=", "mwxml2sql=", "wcr="]
    steps = ["retrievetitles", "converttitles", "retrievecontent",
             "makestubs", "convertxml", "filtersql"]
    skip_step_flags = ["no" + s for s in steps]
    convert_titles_options = ["titles=", "mwtitles=", "mdltitles=",
                              "tmpltitles="]
    retrieve_content_options = ["titleswithprefix=", "tmpltitleswithprefix="]
    make_stubs_options = ["maincontent=", "tmplcontent=", "content="]
    convert_xml_filter_sql_options = ["stubs=", "pageids="]
    files = [fopt[:-1] for fopt in convert_titles_options +
             retrieve_content_options + make_stubs_options +
             convert_xml_filter_sql_options]
    misc_flags = ["verbose", "help", "extendedhelp"]
    all_options = (main_options + cmd_options + skip_step_flags +
                   convert_titles_options + retrieve_content_options +
                   make_stubs_options + convert_xml_filter_sql_options +
                   misc_flags)
    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", all_options)
    except getopt.GetoptError as e:
        usage(e.msg)

    for (opt, val) in options:
        # main opts
        if opt == "--template":
            o['template'] = val
        elif opt == "--sqlfiles":
            o['sql_files'] = val
        elif opt == "--mwversion":
            o['mw_version'] = val
        elif opt == "--lang":
            o['lang_code'] = val
        elif opt == "--project":
            o['project'] = val
        elif opt == "--batchsize":
            if not val.isdigit():
                usage("batch size must be a number")
            o['batch_size'] = int(val)
        elif opt == "--output":
            o['output_dir'] = val
        elif opt == "--auth":
            if ':' in val:
                o['username'], o['password'] = val.split(':')
            else:
                o['username'] = val
        # command opts
        elif opt == "--sqlfilter":
            o['sqlfilter'] = val
        elif opt == "--mwxml2sql":
            o['mwxml2sql'] = val
        elif opt == "--wcr":
            o['wcr'] = val
        # step options
        elif opt.startswith("--no"):
            process_step_option(opt[4:], o)
        # file options
        elif opt[2:] in files:
            process_file_option(opt[2:], val, o)
        # misc flags
        elif opt == "--verbose":
            verbose = True
        elif opt == "--help":
            usage("Options help:\n")
        elif opt == "--extendedhelp":
            usage("Options help:\n", True)
        else:
            usage("Unknown option specified: %s" % opt)

    if len(remainder) > 0:
        usage("Unknown option specified: <%s>" % remainder[0])

    # output files will have this date in their names
    date = time.strftime("%Y-%m-%d-%H%M%S", time.gmtime(time.time()))
    out = Path(o['output_dir'], o['lang_code'], o['project'], date)

    # processing begins
    if o['retrieve_titles']:
        if not o['wcr']:
            usage("in retrieve_titles: Missing mandatory option wcr.")
        if not o['template']:
            usage("in retrieve_titles: Missing mandatory option template.")
        if ':' not in o['template']:
            usage("in retrieve_titles: template option should start with "
                  "'Template:' or the equivalent in the wiki's language")
        if not o['mw_version']:
            usage("in retrieve_titles: Missing mandatory option mwversion.")

        if verbose:
            sys.stderr.write("Retrieving page titles from wiki\n")
        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'], o['project'],
                      verbose)
        if not o['titles_path']:
            # get titles corresponding to the template
            o['titles_path'] = r.get_titles_embedded_in(
                o['template'], out.make_file("main-titles.gz"))
            if verbose:
                sys.stderr.write("main content titles file produced: <%s>\n"
                                 % o['titles_path'])
        if not o['mediawiki_titles_path']:
            # get the mediawiki page titles
            o['mediawiki_titles_path'] = r.get_titles_in_namespace(
                "8", out.make_file("mw-titles.gz"))
            if verbose:
                sys.stderr.write("mediawiki titles file produced: <%s>\n"
                                 % o['mediawiki_titles_path'])
        if not o['module_titles_path']:
            # get the module (lua) page titles
            o['module_titles_path'] = r.get_titles_in_namespace(
                "828", out.make_file("mod-titles.gz"))
            if verbose:
                sys.stderr.write("modules (lua) titles file produced: <%s>\n"
                                 % o['module_titles_path'])
        if not o['template_titles_path']:
            # get the template page titles
            o['template_titles_path'] = r.get_titles_in_namespace(
                "10", out.make_file("tmpl-titles.gz"))
            if verbose:
                sys.stderr.write("templates titles file produced: <%s>\n"
                                 % o['template_titles_path'])
        if verbose:
            sys.stderr.write("Done retrieving page titles from wiki, have "
                             "%s, %s, %s and %s\n" % (
                                 o['titles_path'],
                                 o['mediawiki_titles_path'],
                                 o['module_titles_path'],
                                 o['template_titles_path']))

    if o['convert_titles']:
        if (not o['titles_path'] or not o['mediawiki_titles_path'] or
                not o['module_titles_path'] or
                not o['template_titles_path']):
            usage("in convert_titles: Missing mandatory option for "
                  "skipping previous step.", True)
        if not o['wcr']:
            usage("in convert_titles: Missing mandatory option wcr.")

        if verbose:
            sys.stderr.write("Converting retrieved titles\n")
        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'], o['project'],
                      verbose)
        # get namespaces from the api
        ns_dict = r.get_ns_dict()
        ns_dict_by_string = {}
        for nsnum in ns_dict.keys():
            ns_dict_by_string[ns_dict[nsnum]] = nsnum
        if verbose:
            sys.stderr.write("namespace dicts assembled\n")

        # get list of titles with prefix, not the talk pages but the
        # actual ones (for use for download) - without dups
        # also create a hash with title, list of ns for this title (it
        # will have at least one entry in the list)
        t = Titles(ns_dict, ns_dict_by_string)
        # check main, file, category, project talk namespaces and convert
        # to main, file, category, project namespaces
        t.add_related_titles_from_file(o['titles_path'],
                                       ["1", "5", "7", "15"],
                                       ["0", "4", "6", "14"])
        if verbose:
            sys.stderr.write("page title hash assembled\n")
        t.add_titles_from_file(o['mediawiki_titles_path'], "8")
        if verbose:
            sys.stderr.write("mediawiki titles added to page title hash\n")
        t.add_titles_from_file(o['module_titles_path'], "828")
        if verbose:
            sys.stderr.write("module titles added to page title hash\n")
        t.add_titles_from_file(o['template_titles_path'], "10")
        if verbose:
            sys.stderr.write("template titles added to page title hash\n")
        t.uniq()

        o['main_titles_with_prefix_path'] = out.make_path(
            "main-titles-with-nsprefix.gz")
        out_fd = File.open_output(o['main_titles_with_prefix_path'])
        for line in t.list:
            out_fd.write(line + "\n")
        out_fd.close()

        o['tmpl_titles_with_prefix_path'] = out.make_path(
            "tmpl-titles-with-nsprefix.gz")
        out_fd = File.open_output(o['tmpl_titles_with_prefix_path'])
        for line in t.list_templates:
            out_fd.write(line + "\n")
        out_fd.close()
        if verbose:
            sys.stderr.write("Done converting retrieved titles, have %s "
                             "and %s\n" % (o['main_titles_with_prefix_path'],
                                           o['tmpl_titles_with_prefix_path']))

    if o['retrieve_content']:
        if (not o['main_titles_with_prefix_path'] or
                not o['tmpl_titles_with_prefix_path']):
            usage("in retrieve_content: Missing mandatory option for "
                  "skipping previous step.", True)

        if verbose:
            sys.stderr.write("Retrieving page content from wiki\n")
        # note: reuses the Retriever created in the convert_titles step
        if not o['template_content_path']:
            # filter out the template titles from the
            # main_titles_with_prefix_path file and just download the rest
            o['template_content_path'] = r.get_content(
                o['tmpl_titles_with_prefix_path'],
                out.make_file("template-content.gz"))
            if verbose:
                sys.stderr.write("content retrieved from template page "
                                 "titles\n")
        if not o['main_content_path']:
            o['main_content_path'] = r.get_content(
                o['main_titles_with_prefix_path'],
                out.make_file("rest-content.gz"))
            if verbose:
                sys.stderr.write("content retrieved from page titles\n")
        o['content_path'] = out.make_path("content.gz")
        File.combine_xml([o['template_content_path'], o['main_content_path']],
                         o['content_path'])
        if verbose:
            sys.stderr.write("Done retrieving page content from wiki, have "
                             "%s, %s and %s\n" % (o['template_content_path'],
                                                  o['main_content_path'],
                                                  o['content_path']))

    if o['make_stubs']:
        if not o['content_path']:
            usage("in make_stubs: Missing mandatory option for skipping "
                  "previous step.", True)

        if verbose:
            sys.stderr.write("Generating stub XML file and pageids file "
                             "from downloaded content\n")
        s = Stubber(o['output_dir'], verbose)
        # generate stub XML file for converting sql and list of page ids
        # for filtering sql
        o['stubs_path'] = out.make_path("stubs.gz")
        o['page_ids_path'] = out.make_path("pageids.gz")
        s.write_stub_and_page_ids(o['content_path'], o['stubs_path'],
                                  o['page_ids_path'])
        if verbose:
            sys.stderr.write("Done generating stub XML file and pageids "
                             "file from downloaded content, have %s and "
                             "%s\n" % (o['stubs_path'], o['page_ids_path']))

    if o['convert_xml']:
        if not o['content_path']:
            usage("in convert_xml: Missing mandatory option for skipping "
                  "previous step.", True)
        if not o['mwxml2sql']:
            usage("in convert_xml: Missing mandatory option mwxml2sql.")

        if verbose:
            sys.stderr.write("Converting content to page, revision, text "
                             "tables\n")
        c = Converter(o['mwxml2sql'], o['output_dir'], verbose)
        # convert the content file to page, revision and text tables
        c.convert_content(o['content_path'], o['stubs_path'],
                          o['mw_version'])
        if verbose:
            sys.stderr.write("Done converting content to page, revision, "
                             "text tables\n")

    if o['filter_sql']:
        if not o['page_ids_path']:
            usage("in filter_sql: Missing mandatory option for skipping "
                  "previous step.", True)
        if not o['sql_files']:
            usage("in filter_sql: Missing mandatory option sqlfiles.")
        if not o['sqlfilter']:
            usage("in filter_sql: Missing mandatory option sqlfilter.")

        if verbose:
            sys.stderr.write("Filtering sql tables against page ids for "
                             "import\n")
        f = Filter(o['sqlfilter'], o['output_dir'], verbose)
        # filter all the sql tables (which should be in some nice
        # directory) against the pageids in the page_ids_path file
        for table in ["categorylinks", "externallinks", "imagelinks",
                      "interwiki", "iwlinks", "langlinks", "page_props",
                      "page_restrictions", "pagelinks", "protected_titles",
                      "redirect", "templatelinks"]:
            sql_filename = o['sql_files'].format(t=table)
            filtered_filename = os.path.basename(sql_filename)
            f.filter(sql_filename, filtered_filename, o['page_ids_path'])
        if verbose:
            sys.stderr.write("Done filtering sql tables against page ids "
                             "for import\n")

        # the one file we can't filter: it's not by pageid, as categories
        # might not have pages, so we'll have to import it wholesale...
        # (or you can ignore them completely)
        sql_filename = o['sql_files'].format(t='category')
        new_filename = os.path.join(o['output_dir'],
                                    os.path.basename(sql_filename))
        if verbose:
            sys.stderr.write("about to copy %s to %s\n"
                             % (sql_filename, new_filename))
        shutil.copyfile(sql_filename, new_filename)

    if verbose:
        sys.stderr.write("Done!\n")
    sys.exit(0)
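# A hypothetical invocation of this script (the script name, template title
# and paths are examples only; {t} in --sqlfiles is filled in per table via
# str.format, as in the filter_sql step above):
#
#   python wikicontent.py --template 'Template:Coord' \
#       --sqlfiles '/dumps/enwiki-latest-{t}.sql.gz' \
#       --mwversion 1.21 --lang en --project wikipedia \
#       --output /tmp/wikiout --verbose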
def write_stub_and_page_ids(self, content_path, stubs_path, page_ids_path):
    """Write an XML stub file (omitting text content) and a list of
    page ids, from a MediaWiki XML page content file.

    Arguments:
    content_path  -- path to the XML page content file to read
    stubs_path    -- path to the stubs file to write
    page_ids_path -- path to the page ids file to write"""
    page_pattern = r"^\s*<page>"
    compiled_page_pattern = re.compile(page_pattern)
    revision_pattern = r"^\s*<revision>"
    compiled_revision_pattern = re.compile(revision_pattern)
    id_pattern = r"^\s*<id>(?P<i>.+)</id>\s*\n$"
    compiled_id_pattern = re.compile(id_pattern)
    text_pattern = r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"'
    compiled_text_pattern = re.compile(text_pattern)

    in_fd = File.open_input(content_path)
    out_fd = File.open_output(stubs_path)
    outpage_id_fd = File.open_output(page_ids_path)
    current_title = None
    current_text_id = None
    page_id = None
    expect_rev_id = False
    expect_page_id = False

    for line in in_fd:
        # FIXME we could just calculate text len if the output is missing
        # the bytes attr. (as in dumps not from Special:Export)
        # format in content file:
        #   <text xml:space="preserve" bytes="78">
        # format wanted for stubs file:
        #   <text id="11248" bytes="9" />
        if '<' in line:
            result = compiled_text_pattern.match(line)
            if result:
                line = result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                    current_text_id, result.group("b"))
                out_fd.write(line)
                continue
            elif '</text' in line:
                continue

            result = compiled_page_pattern.match(line)
            if result:
                expect_page_id = True
                out_fd.write(line)
                continue
            result = compiled_revision_pattern.match(line)
            if result:
                expect_rev_id = True
                out_fd.write(line)
                continue

            if expect_page_id:
                result = compiled_id_pattern.match(line)
                if result:
                    outpage_id_fd.write("1:%s\n" % result.group("i"))
                    expect_page_id = False
                out_fd.write(line)
                continue
            if expect_rev_id:
                result = compiled_id_pattern.match(line)
                if result:
                    current_text_id = result.group("i")
                    expect_rev_id = False
                out_fd.write(line)
                continue
            out_fd.write(line)
        else:
            continue  # these are lines of text, we can skip them

    in_fd.close()
    out_fd.close()
    outpage_id_fd.close()
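# Quick check of the <text> tag rewrite done above, run against one sample
# content line (the id and byte count are made-up values):
import re
text_re = re.compile(r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')
sample = '      <text xml:space="preserve" bytes="78">some wikitext'
m = text_re.match(sample)
print(m.group("s") + '<text id="%s" bytes="%s" />' % ("11248", m.group("b")))
# prints:       <text id="11248" bytes="78" />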