import getopt
import os
import re
import shutil
import sys
import time

# NOTE: these snippets are excerpts from a larger module; File, Path,
# Retriever, Titles, Stubber, Converter, Filter, WikiContentErr, usage,
# process_step_option and process_file_option are defined elsewhere in it.


def write_sql(self):
    self.user_dict = {1: True}
    fd = File.open_input(self.xml_file)
    logout_fd = File.open_output(self.log_out_file)
    if self.user_out_file:
        userout_fd = File.open_output(self.user_out_file)
    else:
        userout_fd = None
    if not self.skip_header(fd):
        raise WikiContentErr(
            "failed to find end of mediawiki/siteinfo header in xml file\n")
    eof = False
    while not eof:
        eof = self.do_log_item(fd, logout_fd, userout_fd)
    fd.close()
    logout_fd.close()
    if self.user_out_file:
        userout_fd.close()
    return
def add_related_titles_from_file(self, filename, related_ns_list, ns_list):
    """Read a list of titles from a file; for titles in one of the
    specified talk namespaces, convert the title to one from its
    related subject namespace (i.e. if it was in Category talk,
    convert to Category; if it was in File talk, convert to File,
    etc.) and add it to the title list and dict.

    Arguments:
    filename        -- full path to list of titles
    related_ns_list -- list of (talk) namespaces to convert from,
                       e.g. ["5", "7", "13"]
    ns_list         -- list of namespaces to keep as-is, in the same
                       order as related_ns_list, e.g. ["4", "6", "12"]"""
    # don't pass templates in here, we do those separately
    # because it could be a huge list and we want the user
    # to be able to save and reuse it
    fd = File.open_input(filename)
    for line in fd:
        line = line.strip()
        sep = line.find(":")
        if sep != -1:
            prefix = line[:sep]
            if prefix in self.ns_dict_by_string:
                # main, file, category, project talk namespaces
                if self.ns_dict_by_string[prefix] in related_ns_list:
                    no_prefix_title = line[sep + 1:]
                    # convert to file, category, project namespace
                    related_ns = str(int(self.ns_dict_by_string[prefix]) - 1)
                    if self.ns_dict[related_ns]:
                        new_title = (self.ns_dict[related_ns] + ":" +
                                     no_prefix_title)
                    else:
                        new_title = no_prefix_title  # main namespace titles
                    self.list.append(new_title)
                    if no_prefix_title in self.dict:
                        self.dict[no_prefix_title][related_ns] = True
                    else:
                        self.dict[no_prefix_title] = {related_ns: True}
                # file, category, project namespaces
                elif self.ns_dict_by_string[prefix] in ns_list:
                    ns = self.ns_dict_by_string[prefix]
                    no_prefix_title = line[sep + 1:]
                    self.list.append(no_prefix_title)
                    if no_prefix_title in self.dict:
                        self.dict[no_prefix_title][ns] = True
                    else:
                        self.dict[no_prefix_title] = {ns: True}
        elif "0" in ns_list:
            # main namespace, won't be caught above
            self.list.append(line)
            if line in self.dict:
                self.dict[line]["0"] = True
            else:
                self.dict[line] = {"0": True}
    fd.close()
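# A worked example of the talk-to-subject conversion above, as a standalone
# sketch with a toy namespace table (real tables come from the wiki API via
# get_ns_dict; the sample title is made up):
toy_ns_dict = {"14": "Category", "15": "Category talk"}
line = "Category talk:Physics"
prefix, no_prefix_title = line.split(":", 1)
related_ns = str(int("15") - 1)  # MediaWiki: talk ns number = subject ns + 1
print(toy_ns_dict[related_ns] + ":" + no_prefix_title)  # Category:Physics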
def get_titles_dict(self, sql_file):
    """Arguments:
    sql_file -- file containing lines of the form
                pageid whitespace nsnum whitespace pagetitle
                where the title is expected to be sql escaped and may
                be enclosed in single quotes"""
    fd = File.open_input(sql_file)
    t = {}
    for line in fd:
        # maxsplit=2, so any further whitespace stays inside the title
        (pageid, ns, title) = line.rstrip("\n").split(' ', 2)
        ns = int(ns)
        if title in t:
            t[title][ns] = pageid
        else:
            t[title] = {ns: pageid}
    fd.close()
    return t
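# A quick sketch of the per-line parsing done by get_titles_dict above,
# on a single hypothetical input line:
sample = "12 4 'Main_Page'\n"
pageid, ns, title = sample.rstrip("\n").split(' ', 2)
print({title: {int(ns): pageid}})  # {"'Main_Page'": {4: '12'}}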
def add_titles_from_file(self, filename, ns):
    """Add titles from a file to the title list and dict. Note that
    template titles get added to a different title list than the rest,
    for separate processing.

    Arguments:
    filename -- full path to file containing page titles
    ns       -- number (string of digits) of namespace of page titles
                to grab from file"""
    fd = File.open_input(filename)
    prefix = self.ns_dict[ns] + ":"
    prefix_len = len(prefix)
    for line in fd:
        if line.startswith(prefix):
            if ns == "10":  # special case, bleah
                self.list_templates.append(line[:-1])  # lose newline
            else:
                self.list.append(line[:-1])  # lose newline
            no_prefix_title = line[prefix_len:-1]
            if no_prefix_title in self.dict:
                self.dict[no_prefix_title][ns] = True
            else:
                self.dict[no_prefix_title] = {ns: True}
    fd.close()
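# Sketch of the prefix matching in add_titles_from_file above, with a toy
# namespace table (the module title is just an example):
toy_ns_dict = {"828": "Module"}
line = "Module:Citation/CS1\n"
prefix = toy_ns_dict["828"] + ":"
if line.startswith(prefix):
    print(line[len(prefix):-1])  # Citation/CS1 -- prefix and newline stripped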
def do_main():
    o = {}  # stash all opt vars in here

    # init main opt vars
    for opt in ['template', 'sql_files', 'mw_version', 'output_dir',
                'username', 'password']:
        o[opt] = None
    o['project'] = "wikipedia"
    o['lang_code'] = "en"
    o['batch_size'] = 500
    cwd = Path(os.getcwd())
    o['sqlfilter'] = cwd.make_path("sqlfilter")
    o['wcr'] = cwd.make_path("wikiretriever.py")
    o['mwxml2sql'] = cwd.make_path("mwxml2sql")

    # init step opt vars
    for opt in ['retrieve_titles', 'convert_titles', 'retrieve_content',
                'make_stubs', 'convert_xml', 'filter_sql']:
        o[opt] = True

    # init file opt vars
    for opt in ['titles_path', 'mediawiki_titles_path', 'module_titles_path',
                'template_titles_path', 'main_titles_with_prefix_path',
                'tmpl_titles_with_prefix_path', 'main_content_path',
                'template_content_path', 'content_path', 'stubs_path',
                'page_ids_path']:
        o[opt] = None

    verbose = False

    # option handling
    main_options = ["template=", "sqlfiles=", "mwversion=", "lang=",
                    "project=", "batchsize=", "output=", "auth="]
    cmd_options = ["sqlfilter=", "mwxml2sql=", "wcr="]
    steps = ["retrievetitles", "converttitles", "retrievecontent",
             "makestubs", "convertxml", "filtersql"]
    skip_step_flags = ["no" + s for s in steps]
    convert_titles_options = ["titles=", "mwtitles=", "mdltitles=",
                              "tmpltitles="]
    retrieve_content_options = ["titleswithprefix=", "tmpltitleswithprefix="]
    make_stubs_options = ["maincontent=", "tmplcontent=", "content="]
    convert_xml_filter_sql_options = ["stubs=", "pageids="]
    files = [fopt[:-1] for fopt in convert_titles_options +
             retrieve_content_options + make_stubs_options +
             convert_xml_filter_sql_options]
    misc_flags = ["verbose", "help", "extendedhelp"]
    all_options = (main_options + cmd_options + skip_step_flags +
                   convert_titles_options + retrieve_content_options +
                   make_stubs_options + convert_xml_filter_sql_options +
                   misc_flags)
    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", all_options)
    except getopt.GetoptError as e:
        usage(e.msg)

    for (opt, val) in options:
        # main opts
        if opt == "--template":
            o['template'] = val
        elif opt == "--sqlfiles":
            o['sql_files'] = val
        elif opt == "--mwversion":
            o['mw_version'] = val
        elif opt == "--lang":
            o['lang_code'] = val
        elif opt == "--project":
            o['project'] = val
        elif opt == "--batchsize":
            if not val.isdigit():
                usage("batch size must be a number")
            o['batch_size'] = int(val)
        elif opt == "--output":
            o['output_dir'] = val
        elif opt == "--auth":
            if ':' in val:
                o['username'], o['password'] = val.split(':')
            else:
                o['username'] = val
        # command opts
        elif opt == "--sqlfilter":
            o['sqlfilter'] = val
        elif opt == "--mwxml2sql":
            o['mwxml2sql'] = val
        elif opt == "--wcr":
            o['wcr'] = val
        # step options
        elif opt.startswith("--no"):
            process_step_option(opt[4:], o)
        # file options
        elif opt[2:] in files:
            process_file_option(opt[2:], val, o)
        # misc flags
        elif opt == "--verbose":
            verbose = True
        elif opt == "--help":
            usage("Options help:\n")
        elif opt == "--extendedhelp":
            usage("Options help:\n", True)
        else:
            usage("Unknown option specified: %s" % opt)

    if len(remainder) > 0:
        usage("Unknown option specified: <%s>" % remainder[0])

    # output files will have this date in their names
    date = time.strftime("%Y-%m-%d-%H%M%S", time.gmtime(time.time()))
    out = Path(o['output_dir'], o['lang_code'], o['project'], date)

    # processing begins
    if o['retrieve_titles']:
        if not o['wcr']:
            usage("in retrieve_titles: Missing mandatory option wcr.")
        if not o['template']:
            usage("in retrieve_titles: Missing mandatory option template.")
        if ':' not in o['template']:
            usage("in retrieve_titles: template option should start with "
                  "'Template:' or the equivalent in the wiki's language")
        if not o['mw_version']:
            usage("in retrieve_titles: Missing mandatory option mwversion.")

        if verbose:
            sys.stderr.write("Retrieving page titles from wiki\n")
        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'], o['project'],
                      verbose)
        if not o['titles_path']:
            # get titles corresponding to the template
            o['titles_path'] = r.get_titles_embedded_in(
                o['template'], out.make_file("main-titles.gz"))
            if verbose:
                sys.stderr.write("main content titles file produced: <%s>\n"
                                 % o['titles_path'])
        if not o['mediawiki_titles_path']:
            # get the mediawiki page titles
            o['mediawiki_titles_path'] = r.get_titles_in_namespace(
                "8", out.make_file("mw-titles.gz"))
            if verbose:
                sys.stderr.write("mediawiki titles file produced: <%s>\n"
                                 % o['mediawiki_titles_path'])
        if not o['module_titles_path']:
            # get the module (lua) page titles
            o['module_titles_path'] = r.get_titles_in_namespace(
                "828", out.make_file("mod-titles.gz"))
            if verbose:
                sys.stderr.write("modules (lua) titles file produced: <%s>\n"
                                 % o['module_titles_path'])
        if not o['template_titles_path']:
            # get the template page titles
            o['template_titles_path'] = r.get_titles_in_namespace(
                "10", out.make_file("tmpl-titles.gz"))
            if verbose:
                sys.stderr.write("templates titles file produced: <%s>\n"
                                 % o['template_titles_path'])
        if verbose:
            sys.stderr.write("Done retrieving page titles from wiki, have "
                             "%s, %s, %s and %s\n" % (
                                 o['titles_path'],
                                 o['mediawiki_titles_path'],
                                 o['module_titles_path'],
                                 o['template_titles_path']))

    if o['convert_titles']:
        if (not o['titles_path'] or not o['mediawiki_titles_path'] or
                not o['module_titles_path'] or
                not o['template_titles_path']):
            usage("in convert_titles: Missing mandatory option for "
                  "skipping previous step.", True)
        if not o['wcr']:
            usage("in convert_titles: Missing mandatory option wcr.")

        if verbose:
            sys.stderr.write("Converting retrieved titles\n")
        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'], o['project'],
                      verbose)
        # get namespaces from the api
        ns_dict = r.get_ns_dict()
        ns_dict_by_string = {}
        for nsnum in ns_dict.keys():
            ns_dict_by_string[ns_dict[nsnum]] = nsnum
        if verbose:
            sys.stderr.write("namespace dicts assembled\n")

        # get list of titles with prefix, not the talk pages but the
        # actual ones (for use for download) - without dups
        # also create a hash with title, list of ns for this title (it
        # will have at least one entry in the list)
        t = Titles(ns_dict, ns_dict_by_string)
        # check main, file, category, project talk namespaces and convert
        # to main, file, category, project namespaces
        t.add_related_titles_from_file(o['titles_path'],
                                       ["1", "5", "7", "15"],
                                       ["0", "4", "6", "14"])
        if verbose:
            sys.stderr.write("page title hash assembled\n")
        t.add_titles_from_file(o['mediawiki_titles_path'], "8")
        if verbose:
            sys.stderr.write("mediawiki titles added to page title hash\n")
        t.add_titles_from_file(o['module_titles_path'], "828")
        if verbose:
            sys.stderr.write("module titles added to page title hash\n")
        t.add_titles_from_file(o['template_titles_path'], "10")
        if verbose:
            sys.stderr.write("template titles added to page title hash\n")
        t.uniq()

        o['main_titles_with_prefix_path'] = out.make_path(
            "main-titles-with-nsprefix.gz")
        out_fd = File.open_output(o['main_titles_with_prefix_path'])
        for line in t.list:
            out_fd.write(line + "\n")
        out_fd.close()

        o['tmpl_titles_with_prefix_path'] = out.make_path(
            "tmpl-titles-with-nsprefix.gz")
        out_fd = File.open_output(o['tmpl_titles_with_prefix_path'])
        for line in t.list_templates:
            out_fd.write(line + "\n")
        out_fd.close()
        if verbose:
            sys.stderr.write("Done converting retrieved titles, have %s "
                             "and %s\n" % (o['main_titles_with_prefix_path'],
                                           o['tmpl_titles_with_prefix_path']))

    if o['retrieve_content']:
        if (not o['main_titles_with_prefix_path'] or
                not o['tmpl_titles_with_prefix_path']):
            usage("in retrieve_content: Missing mandatory option for "
                  "skipping previous step.", True)

        if verbose:
            sys.stderr.write("Retrieving page content from wiki\n")
        # note: reuses the Retriever created in the convert_titles step
        if not o['template_content_path']:
            # filter out the template titles from the
            # main_titles_with_prefix_path file and just download the rest
            o['template_content_path'] = r.get_content(
                o['tmpl_titles_with_prefix_path'],
                out.make_file("template-content.gz"))
            if verbose:
                sys.stderr.write("content retrieved from template page "
                                 "titles\n")
        if not o['main_content_path']:
            o['main_content_path'] = r.get_content(
                o['main_titles_with_prefix_path'],
                out.make_file("rest-content.gz"))
            if verbose:
                sys.stderr.write("content retrieved from page titles\n")
        o['content_path'] = out.make_path("content.gz")
        File.combine_xml([o['template_content_path'], o['main_content_path']],
                         o['content_path'])
        if verbose:
            sys.stderr.write("Done retrieving page content from wiki, have "
                             "%s, %s and %s\n" % (o['template_content_path'],
                                                  o['main_content_path'],
                                                  o['content_path']))

    if o['make_stubs']:
        if not o['content_path']:
            usage("in make_stubs: Missing mandatory option for skipping "
                  "previous step.", True)

        if verbose:
            sys.stderr.write("Generating stub XML file and pageids file "
                             "from downloaded content\n")
        s = Stubber(o['output_dir'], verbose)
        # generate stub XML file for converting sql and list of page ids
        # for filtering sql
        o['stubs_path'] = out.make_path("stubs.gz")
        o['page_ids_path'] = out.make_path("pageids.gz")
        s.write_stub_and_page_ids(o['content_path'], o['stubs_path'],
                                  o['page_ids_path'])
        if verbose:
            sys.stderr.write("Done generating stub XML file and pageids "
                             "file from downloaded content, have %s and "
                             "%s\n" % (o['stubs_path'], o['page_ids_path']))

    if o['convert_xml']:
        if not o['content_path']:
            usage("in convert_xml: Missing mandatory option for skipping "
                  "previous step.", True)
        if not o['mwxml2sql']:
            usage("in convert_xml: Missing mandatory option mwxml2sql.")

        if verbose:
            sys.stderr.write("Converting content to page, revision, text "
                             "tables\n")
        c = Converter(o['mwxml2sql'], o['output_dir'], verbose)
        # convert the content file to page, revision and text tables
        c.convert_content(o['content_path'], o['stubs_path'],
                          o['mw_version'])
        if verbose:
            sys.stderr.write("Done converting content to page, revision, "
                             "text tables\n")

    if o['filter_sql']:
        if not o['page_ids_path']:
            usage("in filter_sql: Missing mandatory option for skipping "
                  "previous step.", True)
        if not o['sql_files']:
            usage("in filter_sql: Missing mandatory option sqlfiles.")
        if not o['sqlfilter']:
            usage("in filter_sql: Missing mandatory option sqlfilter.")

        if verbose:
            sys.stderr.write("Filtering sql tables against page ids for "
                             "import\n")
        f = Filter(o['sqlfilter'], o['output_dir'], verbose)
        # filter all the sql tables (which should be in some nice
        # directory) against the pageids in the page_ids_path file
        for table in ["categorylinks", "externallinks", "imagelinks",
                      "interwiki", "iwlinks", "langlinks", "page_props",
                      "page_restrictions", "pagelinks", "protected_titles",
                      "redirect", "templatelinks"]:
            sql_filename = o['sql_files'].format(t=table)
            filtered_filename = os.path.basename(sql_filename)
            f.filter(sql_filename, filtered_filename, o['page_ids_path'])
        if verbose:
            sys.stderr.write("Done filtering sql tables against page ids "
                             "for import\n")

        # the one file we can't filter: it's not by pageid, as categories
        # might not have pages, so we'll have to import it wholesale...
        # (or you can ignore them completely)
        sql_filename = o['sql_files'].format(t='category')
        new_filename = os.path.join(o['output_dir'],
                                    os.path.basename(sql_filename))
        if verbose:
            sys.stderr.write("about to copy %s to %s\n"
                             % (sql_filename, new_filename))
        shutil.copyfile(sql_filename, new_filename)

    if verbose:
        sys.stderr.write("Done!\n")
    sys.exit(0)
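# A hypothetical invocation of this script (the script name, template title
# and paths are examples only; {t} in --sqlfiles is filled in per table via
# str.format, as in the filter_sql step above):
#
#   python wikicontent.py --template 'Template:Coord' \
#       --sqlfiles '/dumps/enwiki-latest-{t}.sql.gz' \
#       --mwversion 1.21 --lang en --project wikipedia \
#       --output /tmp/wikiout --verbose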
def write_stub_and_page_ids(self, content_path, stubs_path, page_ids_path):
    """Write an XML stub file (omitting text content) and a list of
    page ids, from a MediaWiki XML page content file.

    Arguments:
    content_path  -- path to the XML page content file to read
    stubs_path    -- path to the stubs file to write
    page_ids_path -- path to the page ids file to write"""
    page_pattern = r"^\s*<page>"
    compiled_page_pattern = re.compile(page_pattern)
    revision_pattern = r"^\s*<revision>"
    compiled_revision_pattern = re.compile(revision_pattern)
    id_pattern = r"^\s*<id>(?P<i>.+)</id>\s*\n$"
    compiled_id_pattern = re.compile(id_pattern)
    text_pattern = r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"'
    compiled_text_pattern = re.compile(text_pattern)

    in_fd = File.open_input(content_path)
    out_fd = File.open_output(stubs_path)
    outpage_id_fd = File.open_output(page_ids_path)
    current_title = None
    current_text_id = None
    page_id = None
    expect_rev_id = False
    expect_page_id = False

    for line in in_fd:
        # FIXME we could just calculate text len if the output is missing
        # the bytes attr. (as in dumps not from Special:Export)
        # format in content file:
        #   <text xml:space="preserve" bytes="78">
        # format wanted for stubs file:
        #   <text id="11248" bytes="9" />
        if '<' in line:
            result = compiled_text_pattern.match(line)
            if result:
                line = result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                    current_text_id, result.group("b"))
                out_fd.write(line)
                continue
            elif '</text' in line:
                continue

            result = compiled_page_pattern.match(line)
            if result:
                expect_page_id = True
                out_fd.write(line)
                continue
            result = compiled_revision_pattern.match(line)
            if result:
                expect_rev_id = True
                out_fd.write(line)
                continue

            if expect_page_id:
                result = compiled_id_pattern.match(line)
                if result:
                    outpage_id_fd.write("1:%s\n" % result.group("i"))
                    expect_page_id = False
                out_fd.write(line)
                continue
            if expect_rev_id:
                result = compiled_id_pattern.match(line)
                if result:
                    current_text_id = result.group("i")
                    expect_rev_id = False
                out_fd.write(line)
                continue
            out_fd.write(line)
        else:
            continue  # these are lines of text, we can skip them

    in_fd.close()
    out_fd.close()
    outpage_id_fd.close()
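# Quick check of the <text> tag rewrite done above, run against one sample
# content line (the id and byte count are made-up values):
import re
text_re = re.compile(r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')
sample = '      <text xml:space="preserve" bytes="78">some wikitext'
m = text_re.match(sample)
print(m.group("s") + '<text id="%s" bytes="%s" />' % ("11248", m.group("b")))
# prints:       <text id="11248" bytes="78" />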