def addRelatedTitlesFromFile(self, filename, relatedNsList, nsList):
    """Read titles from a file and add them, or the titles they are
    related to, to the title list and dict.

    Each line is stripped of surrounding whitespace and then handled
    according to its namespace prefix (the text before the first colon,
    looked up in self.nsDictByString):

    - if the namespace number is in relatedNsList, the title is
      converted to the namespace numbered one lower (i.e. if it was in
      Category talk, convert to Category, if it was in File talk,
      convert to File, etc.) and the converted title is added;
    - otherwise, if the namespace number is in nsList, the title is
      added under its own namespace;
    - a line with no colon at all is added as a main namespace title,
      provided "0" is in nsList.

    Lines whose prefix is not a known namespace, or whose namespace is
    in neither list, are ignored.

    Don't pass templates in here, we do those separately because it
    could be a huge list and we want the user to be able to save and
    reuse it.

    Arguments:
    filename       -- full path to list of titles
    relatedNsList  -- list of namespace numbers (strings of digits)
                      whose titles are converted to the namespace
                      numbered one lower before being added
    nsList         -- list of namespace numbers (strings of digits)
                      whose titles are added without conversion"""
    fd = File.openInput(filename)
    try:
        for line in fd:
            line = line.strip()
            sep = line.find(":")
            if sep == -1:
                # no prefix at all: main namespace, which the prefix
                # lookup below would never catch
                if "0" in nsList:
                    self.list.append(line)
                    self.dict.setdefault(line, {})["0"] = True
                continue

            prefix = line[:sep]
            if prefix not in self.nsDictByString:
                continue
            ns = self.nsDictByString[prefix]
            noPrefixTitle = line[sep + 1:]

            if ns in relatedNsList:
                # the related namespace is the one numbered just below
                relatedNs = str(int(ns) - 1)
                relatedPrefix = self.nsDict[relatedNs]
                if relatedPrefix:
                    self.list.append(relatedPrefix + ":" + noPrefixTitle)
                else:
                    # empty namespace name: main namespace titles
                    self.list.append(noPrefixTitle)
                self.dict.setdefault(noPrefixTitle, {})[relatedNs] = True
            elif ns in nsList:
                # NOTE(review): unlike the converted titles above, these
                # go into self.list without their namespace prefix --
                # confirm that this is intended
                self.list.append(noPrefixTitle)
                self.dict.setdefault(noPrefixTitle, {})[ns] = True
    finally:
        # close the input even if a line blows up part way through
        fd.close()
def addRelatedTitlesFromFile(self, filename, relatedNsList, nsList):
    """Read titles from a file and add them, or the titles they are
    related to, to the title list and dict.

    Each line is stripped of surrounding whitespace and then handled
    according to its namespace prefix (the text before the first colon,
    looked up in self.nsDictByString):

    - if the namespace number is in relatedNsList, the title is
      converted to the namespace numbered one lower (i.e. if it was in
      Category talk, convert to Category, if it was in File talk,
      convert to File, etc.) and the converted title is added;
    - otherwise, if the namespace number is in nsList, the title is
      added under its own namespace;
    - a line with no colon at all is added as a main namespace title,
      provided "0" is in nsList.

    Lines whose prefix is not a known namespace, or whose namespace is
    in neither list, are ignored.

    Don't pass templates in here, we do those separately because it
    could be a huge list and we want the user to be able to save and
    reuse it.

    Arguments:
    filename       -- full path to list of titles
    relatedNsList  -- list of namespace numbers (strings of digits)
                      whose titles are converted to the namespace
                      numbered one lower before being added
    nsList         -- list of namespace numbers (strings of digits)
                      whose titles are added without conversion"""
    fd = File.openInput(filename)
    try:
        for line in fd:
            line = line.strip()
            sep = line.find(":")
            if sep == -1:
                # no prefix at all: main namespace, which the prefix
                # lookup below would never catch
                if "0" in nsList:
                    self.list.append(line)
                    self.dict.setdefault(line, {})["0"] = True
                continue

            prefix = line[:sep]
            if prefix not in self.nsDictByString:
                continue
            ns = self.nsDictByString[prefix]
            noPrefixTitle = line[sep + 1:]

            if ns in relatedNsList:
                # the related namespace is the one numbered just below
                relatedNs = str(int(ns) - 1)
                relatedPrefix = self.nsDict[relatedNs]
                if relatedPrefix:
                    self.list.append(relatedPrefix + ":" + noPrefixTitle)
                else:
                    # empty namespace name: main namespace titles
                    self.list.append(noPrefixTitle)
                self.dict.setdefault(noPrefixTitle, {})[relatedNs] = True
            elif ns in nsList:
                # NOTE(review): unlike the converted titles above, these
                # go into self.list without their namespace prefix --
                # confirm that this is intended
                self.list.append(noPrefixTitle)
                self.dict.setdefault(noPrefixTitle, {})[ns] = True
    finally:
        # close the input even if a line blows up part way through
        fd.close()
def getTitlesDict(self, sqlFile):
    """Read page ids, namespaces and titles from a file into a dict.

    Arguments:
    sqlFile  -- file containing, on each line,
                pageid space nsnum space pagetitle
                where the title is expected to be sql escaped and can
                be enclosed with single quotes

    Returns a dict of the form { title: { ns: pageid } } where ns has
    been converted to an int.  The title is everything after the second
    space, taken as read: nothing is stripped, so any line ending stays
    part of the key.

    A line with fewer than three fields, or a namespace field that is
    not an integer, raises ValueError."""
    titles = {}
    fd = File.openInput(sqlFile)
    try:
        for line in fd:
            # split on the first two spaces only, so that whatever
            # follows them is kept whole as the title
            (pageid, ns, title) = line.split(' ', 2)
            ns = int(ns)
            titles.setdefault(title, {})[ns] = pageid
    finally:
        fd.close()
    return titles
def getAllTitles(self):
    """Retrieve page content for all titles in accordance with arguments
    given to constructor, in batches, writing it out to a file.

    Titles are read from self.titlesFile, one per line (blank lines are
    skipped), and handed to self.getBatchOfPageContent in batches of
    self.batchSize titles.  The first batch of content is passed through
    self.stripSiteFooter and every later batch through
    self.stripSiteHeaderAndFooter before being written to
    self.outFileName; a single closing </mediawiki> tag is written once
    all batches are done.

    On error (failure to retrieve some content), raises WikiRetrieveErr
    exception.  The input and output files are closed whether or not
    an error occurs."""
    self.outputFd = File.openOutput(self.outFileName)
    try:
        self.inputFd = File.openInput(self.titlesFile)
        try:
            first = True
            eof = False
            while not eof:
                # gather the next batch of non-blank titles; at least
                # one line is read per pass
                titles = []
                while not eof:
                    line = self.inputFd.readline()
                    if line == "":
                        eof = True
                    line = line.strip()
                    if line:
                        titles.append(line)
                    if len(titles) >= self.batchSize:
                        break
                if not titles:
                    break

                content = self.getBatchOfPageContent(titles)
                if not len(content):
                    raise WikiRetrieveErr(
                        "content of zero length returned, uh oh.")

                if first:
                    first = False
                    content = self.stripSiteFooter(content)
                else:
                    content = self.stripSiteHeaderAndFooter(content)
                self.outputFd.write(content)

            # cheap hack: the footers were stripped from every batch,
            # so close the document by hand
            self.outputFd.write("</mediawiki>\n")
        finally:
            self.inputFd.close()
    finally:
        self.outputFd.close()
def writeSql(self):
    """Process the contents of self.xmlFile item by item, writing the
    results to self.logOutFile and, if one was specified, to
    self.userOutFile.

    The mediawiki/siteinfo header is skipped via self.skipHeader, then
    self.doLogItem is called with the input and output file objects
    (the user output is None if self.userOutFile is not set) over and
    over until it returns a true value.  What exactly gets written is
    decided by doLogItem.

    Raises WikiContentErr if the end of the header cannot be found.
    All files that were opened are closed whether or not an error
    occurs."""
    # NOTE(review): presumably keyed by user id, with id 1 marked as
    # already seen before any item is processed -- confirm against
    # doLogItem
    self.userDict = {1: True}

    fd = logOutFd = userOutFd = None
    try:
        fd = File.openInput(self.xmlFile)
        logOutFd = File.openOutput(self.logOutFile)
        if self.userOutFile:
            userOutFd = File.openOutput(self.userOutFile)

        if not self.skipHeader(fd):
            raise WikiContentErr("failed to find end of mediawiki/siteinfo header in xml file\n")

        eof = False
        while not eof:
            eof = self.doLogItem(fd, logOutFd, userOutFd)
    finally:
        # close whatever got opened, even if we bailed out early
        for openedFd in (fd, logOutFd, userOutFd):
            if openedFd is not None:
                openedFd.close()
def addTitlesFromFile(self, filename, ns):
    """add titles from a file to the title list and dict.

    Only lines starting with the name of the given namespace followed
    by a colon are used.  The title with its prefix goes into the title
    list, and the title without its prefix is recorded in the dict
    under the namespace number.  Note that template titles
    (namespace "10") get added to a different title list than the
    rest, for separate processing.

    Arguments:
    filename   -- full path to file containing page titles
    ns         -- number (string of digits) of namespace of page titles
                  to grab from file"""
    # NOTE(review): if self.nsDict[ns] is an empty string (presumably
    # the case for the main namespace) the prefix is just ":" and
    # ordinary unprefixed titles will never match -- confirm that
    # callers do not pass such a namespace
    prefix = self.nsDict[ns] + ":"
    prefixLen = len(prefix)

    # special case bleah: templates are kept in a list of their own
    if ns == "10":
        titleList = self.listTemplates
    else:
        titleList = self.list

    fd = File.openInput(filename)
    try:
        for line in fd:
            if not line.startswith(prefix):
                continue
            # lose the newline, but only if there is one: the last line
            # of the file may not end with a newline
            title = line.rstrip("\n")
            titleList.append(title)
            self.dict.setdefault(title[prefixLen:], {})[ns] = True
    finally:
        fd.close()
def addTitlesFromFile(self, filename, ns):
    """add titles from a file to the title list and dict.

    Only lines starting with the name of the given namespace followed
    by a colon are used.  The title with its prefix goes into the title
    list, and the title without its prefix is recorded in the dict
    under the namespace number.  Note that template titles
    (namespace "10") get added to a different title list than the
    rest, for separate processing.

    Arguments:
    filename   -- full path to file containing page titles
    ns         -- number (string of digits) of namespace of page titles
                  to grab from file"""
    # NOTE(review): if self.nsDict[ns] is an empty string (presumably
    # the case for the main namespace) the prefix is just ":" and
    # ordinary unprefixed titles will never match -- confirm that
    # callers do not pass such a namespace
    prefix = self.nsDict[ns] + ":"
    prefixLen = len(prefix)

    # special case bleah: templates are kept in a list of their own
    if ns == "10":
        titleList = self.listTemplates
    else:
        titleList = self.list

    fd = File.openInput(filename)
    try:
        for line in fd:
            if not line.startswith(prefix):
                continue
            # lose the newline, but only if there is one: the last line
            # of the file may not end with a newline
            title = line.rstrip("\n")
            titleList.append(title)
            self.dict.setdefault(title[prefixLen:], {})[ns] = True
    finally:
        fd.close()
def writeStubAndPageIds(self, contentPath, stubsPath, pageIdsPath):
    """Write an XML stub file (omitting text content) and a list of
    page ids, from a MediaWiki XML page content file.

    The content file is processed line by line:
    - lines without any '<' are treated as page text and dropped;
    - an opening <text ...> line carrying a bytes attribute is replaced
      by an empty <text id="..." bytes="..." /> element, where the id
      is the most recently seen revision id;
    - any other line containing '</text' is dropped;
    - the first <id> line after <page> has its value written to the
      page ids file as "1:<id>";
    - the first <id> line after <revision> is remembered as the id to
      use for the next text element;
    - every remaining line is copied to the stubs file unchanged.

    Arguments:
    contentPath  -- path to the XML page content file to read
    stubsPath    -- path to the stubs file to write
    pageIdsPath  -- path to the page ids file to write"""
    # raw strings, so that the regex escapes reach the re module as is
    compiledPagePattern = re.compile(r"^\s*<page>")
    compiledRevisionPattern = re.compile(r"^\s*<revision>")
    compiledIdPattern = re.compile(r"^\s*<id>(?P<i>.+)</id>\s*\n$")
    compiledTextPattern = re.compile(
        r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')

    inFd = outFd = outPageIdFd = None
    try:
        inFd = File.openInput(contentPath)
        outFd = File.openOutput(stubsPath)
        outPageIdFd = File.openOutput(pageIdsPath)

        currentTextId = None
        expectRevId = False
        expectPageId = False

        for line in inFd:
            if '<' not in line:
                # these are lines of text, we can skip them
                continue

            # FIXME we could just calculate text len if the output is
            # missing the bytes attr. (as in dumps not from
            # Special:Export); for now such a line fails to match and
            # is handled like any other markup line below
            # format in content file:
            #   <text xml:space="preserve" bytes="78">
            # format wanted for stubs file:
            #   <text id="11248" bytes="9" />
            result = compiledTextPattern.match(line)
            if result:
                outFd.write(
                    result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                        currentTextId, result.group("b")))
                continue
            if '</text' in line:
                # tail end of a multi-line text element
                continue

            if compiledPagePattern.match(line):
                expectPageId = True
            elif compiledRevisionPattern.match(line):
                expectRevId = True
            elif expectPageId:
                result = compiledIdPattern.match(line)
                if result:
                    outPageIdFd.write("1:%s\n" % result.group("i"))
                    expectPageId = False
            elif expectRevId:
                result = compiledIdPattern.match(line)
                if result:
                    currentTextId = result.group("i")
                    expectRevId = False
            outFd.write(line)
    finally:
        # close whatever got opened, even if we bailed out early
        for openedFd in (inFd, outFd, outPageIdFd):
            if openedFd is not None:
                openedFd.close()
def writeStubAndPageIds(self, contentPath, stubsPath, pageIdsPath):
    """Write an XML stub file (omitting text content) and a list of
    page ids, from a MediaWiki XML page content file.

    The content file is processed line by line:
    - lines without any '<' are treated as page text and dropped;
    - an opening <text ...> line carrying a bytes attribute is replaced
      by an empty <text id="..." bytes="..." /> element, where the id
      is the most recently seen revision id;
    - any other line containing '</text' is dropped;
    - the first <id> line after <page> has its value written to the
      page ids file as "1:<id>";
    - the first <id> line after <revision> is remembered as the id to
      use for the next text element;
    - every remaining line is copied to the stubs file unchanged.

    Arguments:
    contentPath  -- path to the XML page content file to read
    stubsPath    -- path to the stubs file to write
    pageIdsPath  -- path to the page ids file to write"""
    # raw strings, so that the regex escapes reach the re module as is
    compiledPagePattern = re.compile(r"^\s*<page>")
    compiledRevisionPattern = re.compile(r"^\s*<revision>")
    compiledIdPattern = re.compile(r"^\s*<id>(?P<i>.+)</id>\s*\n$")
    compiledTextPattern = re.compile(
        r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')

    inFd = outFd = outPageIdFd = None
    try:
        inFd = File.openInput(contentPath)
        outFd = File.openOutput(stubsPath)
        outPageIdFd = File.openOutput(pageIdsPath)

        currentTextId = None
        expectRevId = False
        expectPageId = False

        for line in inFd:
            if '<' not in line:
                # these are lines of text, we can skip them
                continue

            # FIXME we could just calculate text len if the output is
            # missing the bytes attr. (as in dumps not from
            # Special:Export); for now such a line fails to match and
            # is handled like any other markup line below
            # format in content file:
            #   <text xml:space="preserve" bytes="78">
            # format wanted for stubs file:
            #   <text id="11248" bytes="9" />
            result = compiledTextPattern.match(line)
            if result:
                outFd.write(
                    result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                        currentTextId, result.group("b")))
                continue
            if '</text' in line:
                # tail end of a multi-line text element
                continue

            if compiledPagePattern.match(line):
                expectPageId = True
            elif compiledRevisionPattern.match(line):
                expectRevId = True
            elif expectPageId:
                result = compiledIdPattern.match(line)
                if result:
                    outPageIdFd.write("1:%s\n" % result.group("i"))
                    expectPageId = False
            elif expectRevId:
                result = compiledIdPattern.match(line)
                if result:
                    currentTextId = result.group("i")
                    expectRevId = False
            outFd.write(line)
    finally:
        # close whatever got opened, even if we bailed out early
        for openedFd in (inFd, outFd, outPageIdFd):
            if openedFd is not None:
                openedFd.close()