def getAllTitles(self):
    """Retrieve page titles from the wiki in batches, writing them to a file.

    Titles are retrieved in accordance with the arguments given to the
    constructor.  If self.startDate is set, the start and end dates are
    first converted to formatted strings and to seconds, and stored on
    the instance.  Batches are then fetched with getBatchOfTitles() and
    written with writeTitles() until a batch comes back empty or
    self.more is false.

    The output file (self.outFileName) is closed whether or not the
    retrieval succeeds.

    On error (failure to retrieve some titles), raises WikiRetrieveErr
    exception.  That exception is not raised directly in this body; it
    is expected to come from the batch helpers.
    """
    self.more = True

    if self.startDate:
        self.dateFormatter = Date()
        self.startDateString = self.dateFormatter.formatDate(self.startDate)
        self.endDateString = self.dateFormatter.formatDate(self.endDate)
        self.startDateSecs = self.dateFormatter.getSecs(self.startDateString)
        self.endDateSecs = self.dateFormatter.getSecs(self.endDateString)

    self.outputFd = File.openOutput(self.outFileName)
    try:
        while True:
            titles = self.getBatchOfTitles()
            self.writeTitles(titles)
            if not len(titles):
                # not always an error
                break
            # FIXME is there a possibility that there will be a continue
            # elt and we'll be served the same titles again?
            # NOTE(review): self.more is presumably cleared by
            # getBatchOfTitles() when the wiki reports no further
            # results -- confirm against that helper.
            if not self.more:
                break
    finally:
        # Close the output even if a batch fails, so the handle is not
        # leaked when the exception propagates to the caller.
        self.outputFd.close()
def writeSql(self):
    """Process the log items in self.xmlFile, writing output files.

    Opens self.xmlFile for reading and self.logOutFile for writing, and
    self.userOutFile for writing too when it is set.  After the header
    has been skipped with skipHeader(), doLogItem() is called repeatedly
    with the three handles until it reports end of file.

    All files that were opened are closed on the way out, including
    when an error is raised.

    Raises WikiContentErr if skipHeader() fails to find the end of the
    mediawiki/siteinfo header.
    """
    # NOTE(review): userDict is seeded with key 1 before any item is
    # processed; presumably doLogItem() uses it to track users already
    # seen -- confirm against that helper.
    self.userDict = {1: True}

    fd = None
    logOutFd = None
    userOutFd = None
    try:
        fd = File.openInput(self.xmlFile)
        logOutFd = File.openOutput(self.logOutFile)
        if self.userOutFile:
            userOutFd = File.openOutput(self.userOutFile)

        if not self.skipHeader(fd):
            raise WikiContentErr("failed to find end of mediawiki/siteinfo header in xml file\n")

        eof = False
        while not eof:
            eof = self.doLogItem(fd, logOutFd, userOutFd)
    finally:
        # Close whatever was successfully opened, in the same order as
        # before (input, log output, user output).
        for handle in (fd, logOutFd, userOutFd):
            if handle is not None:
                handle.close()
def getAllTitles(self):
    """Retrieve page content for all titles in batches, writing it to a file.

    Titles are read from self.titlesFile, one per line; blank lines are
    ignored.  They are grouped into batches of up to self.batchSize
    titles, and the content for each batch is fetched with
    getBatchOfPageContent() and appended to self.outFileName.

    The first batch is passed through stripSiteFooter() and every later
    batch through stripSiteHeaderAndFooter(); a single closing
    "</mediawiki>" line is written after the last batch.

    Both files are closed whether or not the retrieval succeeds.

    On error (failure to retrieve some content), raises WikiRetrieveErr
    exception; in particular this happens when a batch returns content
    of zero length.
    """
    self.outputFd = File.openOutput(self.outFileName)
    try:
        self.inputFd = File.openInput(self.titlesFile)
        try:
            first = True
            eof = False
            while not eof:
                # Collect the next batch of non-blank titles.
                titles = []
                while not eof:
                    line = self.inputFd.readline()
                    if line == "":
                        # readline() returns "" only at end of file
                        eof = True
                    line = line.strip()
                    if line:
                        titles.append(line)
                    if len(titles) >= self.batchSize:
                        break

                if not titles:
                    break

                content = self.getBatchOfPageContent(titles)
                if not len(content):
                    raise WikiRetrieveErr(
                        "content of zero length returned, uh oh.")

                if first:
                    first = False
                    content = self.stripSiteFooter(content)
                else:
                    content = self.stripSiteHeaderAndFooter(content)

                self.outputFd.write(content)

            # cheap hack
            # NOTE(review): presumably the strip helpers remove the
            # closing tag from every batch, so it is added back once
            # here -- confirm against those helpers.
            self.outputFd.write("</mediawiki>\n")
        finally:
            self.inputFd.close()
    finally:
        # Close the output even on failure so the handle is not leaked
        # when the exception propagates to the caller.
        self.outputFd.close()
def writeStubAndPageIds(self, contentPath, stubsPath, pageIdsPath):
    """Write an XML stub file (omitting text content) and a list of
    page ids, from a MediaWiki XML page content file.

    The content file is processed line by line:
      - an opening <text ... bytes="N"> line is replaced by a
        self-closing <text id="REVID" bytes="N" /> stub, where REVID is
        the most recently seen revision id;
      - lines containing '</text' and lines with no '<' at all (lines
        of page text) are dropped;
      - the first <id> line after each <page> line is also recorded in
        the page ids file as "1:ID";
      - every other line is copied to the stubs file unchanged.

    All three files are closed whether or not processing succeeds.

    Arguments:
    contentPath  -- path to the XML page content file to read
    stubsPath    -- path to the stubs file to write
    pageIdsPath  -- path to the page ids file to write
    """
    # Raw strings keep the regex escapes (\s, \n) out of Python's own
    # string-escape handling; the patterns match exactly as before.
    compiledPagePattern = re.compile(r"^\s*<page>")
    compiledRevisionPattern = re.compile(r"^\s*<revision>")
    compiledIdPattern = re.compile(r"^\s*<id>(?P<i>.+)</id>\s*\n$")
    compiledTextPattern = re.compile(
        r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')

    inFd = None
    outFd = None
    outPageIdFd = None
    try:
        inFd = File.openInput(contentPath)
        outFd = File.openOutput(stubsPath)
        outPageIdFd = File.openOutput(pageIdsPath)

        currentTextId = None
        expectRevId = False
        expectPageId = False

        for line in inFd:
            # FIXME we could just calculate text len if the output is
            # missing the bytes attr. (as in dumps not from
            # Special:Export)
            # format in content file:
            #   <text xml:space="preserve" bytes="78">
            # format wanted for stubs file:
            #   <text id="11248" bytes="9" />
            if '<' not in line:
                # these are lines of text, we can skip them
                continue

            result = compiledTextPattern.match(line)
            if result:
                line = result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                    currentTextId, result.group("b"))
                outFd.write(line)
                continue
            if '</text' in line:
                # tail of a multi-line text element; nothing to keep
                continue

            if compiledPagePattern.match(line):
                expectPageId = True
                outFd.write(line)
                continue

            if compiledRevisionPattern.match(line):
                expectRevId = True
                outFd.write(line)
                continue

            if expectPageId:
                result = compiledIdPattern.match(line)
                if result:
                    # The flag stays set until an <id> line is actually
                    # seen, since other elements may come between
                    # <page> and its <id>.
                    outPageIdFd.write("1:%s\n" % result.group("i"))
                    expectPageId = False
                    outFd.write(line)
                    continue

            if expectRevId:
                result = compiledIdPattern.match(line)
                if result:
                    # Remembered for the <text> stub that follows.
                    currentTextId = result.group("i")
                    expectRevId = False
                    outFd.write(line)
                    continue

            outFd.write(line)
    finally:
        # Close whatever was successfully opened, in the original order.
        for handle in (inFd, outFd, outPageIdFd):
            if handle is not None:
                handle.close()
# Excerpt from a larger routine: the statement that precedes this progress
# message lies outside the excerpt.
if verbose:
    sys.stderr.write("mediawiki titles added to page title hash\n")

# Add the module and template title files to `t`.
# NOTE(review): the second argument ("828", "10") is presumably the
# namespace number for those titles -- confirm against addTitlesFromFile.
t.addTitlesFromFile(o['moduleTitlesPath'], "828")
if verbose:
    sys.stderr.write("module titles added to page title hash\n")
t.addTitlesFromFile(o['templateTitlesPath'], "10")
if verbose:
    sys.stderr.write("template titles added to page title hash\n")

# NOTE(review): presumably removes duplicate titles -- confirm against
# the definition of uniq().
t.uniq()

# Write every entry of t.list, one per line, to the main titles file and
# record that file's path in `o`.
o['mainTitlesWithPrefixPath'] = out.makePath(
    "main-titles-with-nsprefix.gz")
outFd = File.openOutput(o['mainTitlesWithPrefixPath'])
for line in t.list:
    outFd.write(line + "\n")
outFd.close()

# Same for t.listTemplates and the template titles file.
o['tmplTitlesWithPrefixPath'] = out.makePath(
    "tmpl-titles-with-nsprefix.gz")
outFd = File.openOutput(o['tmplTitlesWithPrefixPath'])
for line in t.listTemplates:
    outFd.write(line + "\n")
outFd.close()

# Report the two output paths when running verbosely.
if (verbose):
    sys.stderr.write(
        "Done converting retrieved titles, have %s and %s\n"
        % (o['mainTitlesWithPrefixPath'], o['tmplTitlesWithPrefixPath']))
def writeStubAndPageIds(self, contentPath, stubsPath, pageIdsPath):
    """Write an XML stub file (omitting text content) and a list of
    page ids, from a MediaWiki XML page content file.

    The content file is processed line by line:
      - an opening <text ... bytes="N"> line is replaced by a
        self-closing <text id="REVID" bytes="N" /> stub, where REVID is
        the most recently seen revision id;
      - lines containing '</text' and lines with no '<' at all (lines
        of page text) are dropped;
      - the first <id> line after each <page> line is also recorded in
        the page ids file as "1:ID";
      - every other line is copied to the stubs file unchanged.

    All three files are closed whether or not processing succeeds.

    Arguments:
    contentPath  -- path to the XML page content file to read
    stubsPath    -- path to the stubs file to write
    pageIdsPath  -- path to the page ids file to write
    """
    # Raw strings keep the regex escapes (\s, \n) out of Python's own
    # string-escape handling; the patterns match exactly as before.
    compiledPagePattern = re.compile(r"^\s*<page>")
    compiledRevisionPattern = re.compile(r"^\s*<revision>")
    compiledIdPattern = re.compile(r"^\s*<id>(?P<i>.+)</id>\s*\n$")
    compiledTextPattern = re.compile(
        r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')

    inFd = None
    outFd = None
    outPageIdFd = None
    try:
        inFd = File.openInput(contentPath)
        outFd = File.openOutput(stubsPath)
        outPageIdFd = File.openOutput(pageIdsPath)

        currentTextId = None
        expectRevId = False
        expectPageId = False

        for line in inFd:
            # FIXME we could just calculate text len if the output is
            # missing the bytes attr. (as in dumps not from
            # Special:Export)
            # format in content file:
            #   <text xml:space="preserve" bytes="78">
            # format wanted for stubs file:
            #   <text id="11248" bytes="9" />
            if '<' not in line:
                # these are lines of text, we can skip them
                continue

            result = compiledTextPattern.match(line)
            if result:
                line = result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                    currentTextId, result.group("b"))
                outFd.write(line)
                continue
            if '</text' in line:
                # tail of a multi-line text element; nothing to keep
                continue

            if compiledPagePattern.match(line):
                expectPageId = True
                outFd.write(line)
                continue

            if compiledRevisionPattern.match(line):
                expectRevId = True
                outFd.write(line)
                continue

            if expectPageId:
                result = compiledIdPattern.match(line)
                if result:
                    # The flag stays set until an <id> line is actually
                    # seen, since other elements may come between
                    # <page> and its <id>.
                    outPageIdFd.write("1:%s\n" % result.group("i"))
                    expectPageId = False
                    outFd.write(line)
                    continue

            if expectRevId:
                result = compiledIdPattern.match(line)
                if result:
                    # Remembered for the <text> stub that follows.
                    currentTextId = result.group("i")
                    expectRevId = False
                    outFd.write(line)
                    continue

            outFd.write(line)
    finally:
        # Close whatever was successfully opened, in the original order.
        for handle in (inFd, outFd, outPageIdFd):
            if handle is not None:
                handle.close()
t.addTitlesFromFile(o['mediawikiTitlesPath'], "8") if verbose: sys.stderr.write("mediawiki titles added to page title hash\n") t.addTitlesFromFile(o['moduleTitlesPath'], "828") if verbose: sys.stderr.write("module titles added to page title hash\n") t.addTitlesFromFile(o['templateTitlesPath'], "10") if verbose: sys.stderr.write("template titles added to page title hash\n") t.uniq() o['mainTitlesWithPrefixPath'] = out.makePath("main-titles-with-nsprefix.gz") outFd = File.openOutput(o['mainTitlesWithPrefixPath']) for line in t.list: outFd.write(line + "\n") outFd.close() o['tmplTitlesWithPrefixPath'] = out.makePath("tmpl-titles-with-nsprefix.gz") outFd = File.openOutput(o['tmplTitlesWithPrefixPath']) for line in t.listTemplates: outFd.write(line + "\n") outFd.close() if (verbose): sys.stderr.write("Done converting retrieved titles, have %s and %s\n" % (o['mainTitlesWithPrefixPath'], o['tmplTitlesWithPrefixPath'])) if o['retrieveContent']: if not o['mainTitlesWithPrefixPath'] or not o['tmplTitlesWithPrefixPath']: