import os
import re
import sys
import time
import getopt
import shutil

# File, Path, Retriever, Titles, Stubber, Converter, Filter, WikiContentErr,
# usage, process_step_option and process_file_option are project-local
# helpers defined elsewhere in this codebase.


def write_sql(self):
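    """Convert the XML input file to SQL: skip the mediawiki/siteinfo
    header, then process log items one at a time until EOF, writing log
    output and, if configured, user output."""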
    self.user_dict = {1: True}
    fd = File.open_input(self.xml_file)
    logout_fd = File.open_output(self.log_out_file)
    if self.user_out_file:
        userout_fd = File.open_output(self.user_out_file)
    else:
        userout_fd = None
    if not self.skip_header(fd):
        raise WikiContentErr(
            "failed to find end of mediawiki/siteinfo header in xml file\n"
        )
    eof = False
    while not eof:
        eof = self.do_log_item(fd, logout_fd, userout_fd)
    fd.close()
    logout_fd.close()
    if self.user_out_file:
        userout_fd.close()
    return


def do_main():
    o = {}  # stash all opt vars in here

    # init main opt vars
    for opt in ['template', 'sql_files', 'mw_version', 'output_dir', 'username', 'password']:
        o[opt] = None

    o['project'] = "wikipedia"
    o['lang_code'] = "en"
    o['batch_size'] = 500

    cwd = Path(os.getcwd())
    o['sqlfilter'] = cwd.make_path("sqlfilter")
    o['wcr'] = cwd.make_path("wikiretriever.py")
    o['mwxml2sql'] = cwd.make_path("mwxml2sql")
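    # by default, look for the sqlfilter and mwxml2sql binaries and the
    # wikiretriever.py script in the current working directory; these can
    # be overridden with the --sqlfilter, --mwxml2sql and --wcr options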

    # init step opt vars
    for opt in ['retrieve_titles', 'convert_titles', 'retrieve_content', 'make_stubs',
                'convert_xml', 'filter_sql']:
        o[opt] = True

    # init file opt vars
    for opt in ['titles_path', 'mediawiki_titles_path', 'module_titles_path', 'template_titles_path',
                'main_titles_with_prefix_path', 'tmpl_titles_with_prefix_path', 'main_content_path',
                'template_content_path', 'content_path', 'stubs_path', 'page_ids_path']:
        o[opt] = None

    verbose = False

    # option handling
    main_options = ["template=", "sqlfiles=", "mwversion=", "lang=",
                    "project=", "batchsize=", "output=", "auth="]
    cmd_options = ["sqlfilter=", "mwxml2sql=", "wcr="]

    steps = ["retrievetitles", "converttitles", "retrievecontent", "makestubs",
             "convertxml", "filtersql"]
    skip_step_flags = ["no" + s for s in steps]
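    # each pipeline step can be skipped with its --no<step> flag,
    # e.g. --noretrievetitles or --nofiltersql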

    convert_titles_options = ["titles=", "mwtitles=", "mdltitles=", "tmpltitles="]
    retrieve_content_options = ["titleswithprefix=", "tmpltitleswithprefix="]
    make_stubs_options = ["maincontent=", "tmplcontent=", "content="]
    convert_xml_filter_sql_options = ["stubs=", "pageids="]

    files = [fopt[:-1] for fopt in convert_titles_options + retrieve_content_options +
             make_stubs_options + convert_xml_filter_sql_options]

    misc_flags = ["verbose", "help", "extendedhelp"]

    all_options = (main_options + cmd_options + skip_step_flags + convert_titles_options +
                   retrieve_content_options + make_stubs_options +
                   convert_xml_filter_sql_options + misc_flags)
    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", all_options)
    except getopt.GetoptError as e:
        usage(e.msg)

    for (opt, val) in options:

        # main opts
        if opt == "--template":
            o['template'] = val
        elif opt == "--sqlfiles":
            o['sql_files'] = val
        elif opt == "--mwversion":
            o['mw_version'] = val
        elif opt == "--lang":
            o['lang_code'] = val
        elif opt == "--project":
            o['project'] = val
        elif opt == "--batchsize":
            if not val.isdigit():
                usage("batch size must be a number")
            o['batch_size'] = int(val)
        elif opt == "--output":
            o['output_dir'] = val
        elif opt == "--auth":
            if ':' in val:
                # split on the first colon only, so passwords may contain ':'
                o['username'], o['password'] = val.split(':', 1)
            else:
                o['username'] = val

        # command opts
        elif opt == "--sqlfilter":
            o['sqlfilter'] = val
        elif opt == "--mwxml2sql":
            o['mwxml2sql'] = val
        elif opt == "--wcr":
            o['wcr'] = val

        # step options
        elif opt.startswith("--no"):
            process_step_option(opt[4:], o)

        # file options
        elif opt[2:] in files:
            process_file_option(opt[2:], val, o)

        # misc flags
        elif opt == "--verbose":
            verbose = True
        elif opt == "--help":
            usage("Options help:\n")
        elif opt == "--extendedhelp":
            usage("Options help:\n", True)
        else:
            usage("Unknown option specified: %s" % opt)

    if len(remainder) > 0:
        usage("Unknown argument specified: <%s>" % remainder[0])

    # output files will have this date in their names
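    # (a UTC timestamp of the form YYYY-MM-DD-HHMMSS, e.g. 2024-05-01-123456)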
    date = time.strftime("%Y-%m-%d-%H%M%S", time.gmtime(time.time()))
    out = Path(o['output_dir'], o['lang_code'], o['project'], date)

    # processing begins
    if o['retrieve_titles']:
        if not o['wcr']:
            usage("in retrieve_titles: Missing mandatory option wcr.")
        if not o['template']:
            usage("in retrieve_titles: Missing mandatory option template.")
        if ':' not in o['template']:
            usage("in retrieve_titles: template option should start with 'Template:' " +
                  "or the equivalent in the wiki's language")
        if not o['mw_version']:
            usage("in retrieve_titles: Missing mandatory option mwversion.")

        if verbose:
            sys.stderr.write("Retrieving page titles from wiki\n")

        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'], o['project'], verbose)
        if not o['titles_path']:
            # get titles corresponding to the template
            o['titles_path'] = r.get_titles_embedded_in(o['template'], out.make_file("main-titles.gz"))
            if verbose:
                sys.stderr.write("main content titles file produced: <%s>\n" % o['titles_path'])

        if not o['mediawiki_titles_path']:
            # get the mediawiki page titles
            o['mediawiki_titles_path'] = r.get_titles_in_namespace("8", out.make_file("mw-titles.gz"))
            if verbose:
                sys.stderr.write("mediawiki titles file produced: <%s>\n" % o['mediawiki_titles_path'])

        if not o['module_titles_path']:
            # get the module (lua) page titles
            o['module_titles_path'] = r.get_titles_in_namespace("828", out.make_file("mod-titles.gz"))
            if verbose:
                sys.stderr.write("modules (lua) titles file produced: <%s>\n" % o['module_titles_path'])

        if not o['template_titles_path']:
            # get the template page titles
            o['template_titles_path'] = r.get_titles_in_namespace("10", out.make_file("tmpl-titles.gz"))
            if verbose:
                sys.stderr.write("templates titles file produced: <%s>\n" % o['template_titles_path'])

        if verbose:
            sys.stderr.write("Done retrieving page titles from wiki, have " +
                             "%s, %s, %s and %s\n" % (
                                 o['titles_path'], o['mediawiki_titles_path'],
                                 o['module_titles_path'], o['template_titles_path']))

    if o['convert_titles']:
        if (not o['titles_path'] or not o['mediawiki_titles_path'] or not o['module_titles_path'] or
                not o['template_titles_path']):
            usage("in convert_titles: Missing mandatory option for skipping previous step.", True)
        if not o['wcr']:
            usage("in convert_titles: Missing mandatory option wcr.")

        if verbose:
            sys.stderr.write("Converting retrieved titles \n")

        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'], o['project'], verbose)

        # get namespaces from the api
        ns_dict = r.get_ns_dict()

        # invert the namespace dict so we can also look up by name
        ns_dict_by_string = {ns_name: ns_num for ns_num, ns_name in ns_dict.items()}

        if verbose:
            sys.stderr.write("namespace dicts assembled\n")

        # build the list of titles with namespace prefix for download (the
        # subject pages, not the talk pages), without duplicates; also build
        # a hash mapping each title to the list of namespaces in which it
        # appears (the list has at least one entry per title)
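        # e.g. (illustrative) the hash might map "Foo" -> ["0", "1"] if Foo
        # has both an article and a talk page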
        t = Titles(ns_dict, ns_dict_by_string)

        # convert titles in the talk, project talk, file talk and category
        # talk namespaces to their subject namespaces: main, project, file
        # and category
        t.add_related_titles_from_file(o['titles_path'], ["1", "5", "7", "15"], ["0", "4", "6", "14"])

        if verbose:
            sys.stderr.write("page title hash assembled\n")

        t.add_titles_from_file(o['mediawiki_titles_path'], "8")
        if verbose:
            sys.stderr.write("mediawiki titles added to page title hash\n")

        t.add_titles_from_file(o['module_titles_path'], "828")
        if verbose:
            sys.stderr.write("module titles added to page title hash\n")

        t.add_titles_from_file(o['template_titles_path'], "10")
        if verbose:
            sys.stderr.write("template titles added to page title hash\n")

        t.uniq()

        o['main_titles_with_prefix_path'] = out.make_path("main-titles-with-nsprefix.gz")
        out_fd = File.open_output(o['main_titles_with_prefix_path'])
        for line in t.list:
            out_fd.write(line + "\n")
        out_fd.close()

        o['tmpl_titles_with_prefix_path'] = out.make_path("tmpl-titles-with-nsprefix.gz")
        out_fd = File.open_output(o['tmpl_titles_with_prefix_path'])
        for line in t.list_templates:
            out_fd.write(line + "\n")
        out_fd.close()

        if verbose:
            sys.stderr.write("Done converting retrieved titles, have %s and %s\n"
                             % (o['main_titles_with_prefix_path'], o['tmpl_titles_with_prefix_path']))

    if o['retrieve_content']:
        if not o['main_titles_with_prefix_path'] or not o['tmpl_titles_with_prefix_path']:
            usage("in retrieve_content: Missing mandatory option for skipping previous step.", True)

        # (re)create the retriever here so this step still works when the
        # earlier title steps were skipped
        r = Retriever(o['wcr'], o['output_dir'], o['lang_code'], o['project'], verbose)

        if verbose:
            sys.stderr.write("Retrieving page content from wiki\n")

        if not o['template_content_path']:
            # download the content of the template pages; these titles were
            # split out from the main title list in the previous step
            o['template_content_path'] = r.get_content(o['tmpl_titles_with_prefix_path'],
                                                       out.make_file("template-content.gz"))
            if verbose:
                sys.stderr.write("content retrieved from template page titles\n")

        if not o['main_content_path']:
            o['main_content_path'] = r.get_content(o['main_titles_with_prefix_path'],
                                                   out.make_file("rest-content.gz"))
            if verbose:
                sys.stderr.write("content retrieved from page titles\n")

        o['content_path'] = out.make_path("content.gz")
        File.combine_xml([o['template_content_path'], o['main_content_path']], o['content_path'])

        if verbose:
            sys.stderr.write("Done retrieving page content from wiki, have %s, %s and %s\n"
                             % (o['template_content_path'], o['main_content_path'], o['content_path']))

    if o['make_stubs']:
        if not o['content_path']:
            usage("in make_stubs: Missing mandatory option for skipping previous step.", True)

        if verbose:
            sys.stderr.write("Generating stub XML file and pageids file from downloaded content\n")
        s = Stubber(o['output_dir'], verbose)
        # generate stub XML file for converting sql and list of page ids for filtering sql
        o['stubs_path'] = out.make_path("stubs.gz")
        o['page_ids_path'] = out.make_path("pageids.gz")
        s.write_stub_and_page_ids(o['content_path'], o['stubs_path'], o['page_ids_path'])
        if verbose:
            sys.stderr.write("Done generating stub XML file and pageids file from " +
                             "downloaded content, have %s and %s\n" % (
                                 o['stubs_path'], o['page_ids_path']))

    if o['convert_xml']:
        if not o['content_path'] or not o['stubs_path']:
            usage("in convert_xml: Missing mandatory option for skipping previous step.", True)
        if not o['mwxml2sql']:
            usage("in convert_xml: Missing mandatory option mwxml2sql.")

        if verbose:
            sys.stderr.write("Converting content to page, revision, text tables\n")
        c = Converter(o['mwxml2sql'], o['output_dir'], verbose)
        # convert the content file to page, revision and text tables
        c.convert_content(o['content_path'], o['stubs_path'], o['mw_version'])
        if verbose:
            sys.stderr.write("Done converting content to page, revision, text tables\n")

    if o['filter_sql']:
        if not o['page_ids_path']:
            usage("in filter_sql: Missing mandatory option for skipping previous step.", True)
        if not o['sql_files']:
            usage("in filter_sql: Missing mandatory option sqlfiles.")
        if not o['sqlfilter']:
            usage("in filter_sql: Missing mandatory option sqlfilter.")

        if verbose:
            sys.stderr.write("Filtering sql tables against page ids for import\n")

        f = Filter(o['sqlfilter'], o['output_dir'], verbose)
        # filter all the sql tables (which should be in some nice directory)
        # against the pageids in page_ids_path file
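        # o['sql_files'] is a str.format() template in which {t} stands for
        # the table name, e.g. (illustrative) 'enwiki-20240101-{t}.sql.gz'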
        for table in ["categorylinks", "externallinks", "imagelinks", "interwiki",
                      "iwlinks", "langlinks", "page_props", "page_restrictions",
                      "pagelinks", "protected_titles", "redirect", "templatelinks"]:
            sql_filename = o['sql_files'].format(t=table)
            filtered_filename = os.path.basename(sql_filename)
            f.filter(sql_filename,
                     filtered_filename,
                     o['page_ids_path'])
        if verbose:
            sys.stderr.write("Done filtering sql tables against page ids for import\n")

        # the category table is the one file we can't filter by page id,
        # since categories need not have corresponding pages; import it
        # wholesale (or you can ignore it completely)
        sql_filename = o['sql_files'].format(t='category')
        new_filename = os.path.join(o['output_dir'], os.path.basename(sql_filename))
        if verbose:
            sys.stderr.write("about to copy %s to %s\n" % (sql_filename, new_filename))
        shutil.copyfile(sql_filename, new_filename)

    if verbose:
        sys.stderr.write("Done!\n")
    sys.exit(0)

def write_stub_and_page_ids(self, content_path, stubs_path, page_ids_path):
    """Write an XML stub file (omitting text content) and a
    list of page ids, from a MediaWiki XML page content file.
    Arguments:
    content_path  -- path to the XML page content file to read
    stubs_path    -- path to the stubs file to write
    page_ids_path -- path to the page ids file to write"""

    page_pattern = r"^\s*<page>"
    compiled_page_pattern = re.compile(page_pattern)
    revision_pattern = r"^\s*<revision>"
    compiled_revision_pattern = re.compile(revision_pattern)
    id_pattern = r"^\s*<id>(?P<i>.+)</id>\s*\n$"
    compiled_id_pattern = re.compile(id_pattern)
    text_pattern = r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"'
    compiled_text_pattern = re.compile(text_pattern)

    in_fd = File.open_input(content_path)
    out_fd = File.open_output(stubs_path)
    outpage_id_fd = File.open_output(page_ids_path)
    current_text_id = None

    expect_rev_id = False
    expect_page_id = False

    for line in in_fd:
        # FIXME we could just calculate the text length ourselves if the
        # input is missing the bytes attr (as in dumps not from Special:Export)
        # format in content file:
        #   <text xml:space="preserve" bytes="78">
        # format wanted for stubs file:
        #   <text id="11248" bytes="9" />
        if '<' in line:
            result = compiled_text_pattern.match(line)
            if result:
                # replace the text element with an empty stub carrying the
                # revision (text) id and byte count
                line = result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                    current_text_id, result.group("b"))
                out_fd.write(line)
                continue
            elif '</text' in line:
                continue

            result = compiled_page_pattern.match(line)
            if result:
                expect_page_id = True
                out_fd.write(line)
                continue
            result = compiled_revision_pattern.match(line)
            if result:
                expect_rev_id = True
                out_fd.write(line)
                continue
            if expect_page_id:
                result = compiled_id_pattern.match(line)
                if result:
                    # page ids are written one per line as "1:<pageid>"
                    outpage_id_fd.write("1:%s\n" % result.group("i"))
                    expect_page_id = False
                out_fd.write(line)
                continue
            if expect_rev_id:
                result = compiled_id_pattern.match(line)
                if result:
                    current_text_id = result.group("i")
                    expect_rev_id = False
                out_fd.write(line)
                continue
            out_fd.write(line)
        else:
            continue  # these are lines of text content, which we skip
    in_fd.close()
    out_fd.close()
    outpage_id_fd.close()
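
# A minimal sketch of how the pieces above fit together when the module is
# run as a script (illustrative; the guard below and the option values in
# the example are assumptions, not taken from the original source):
#
#   python this_script.py --template 'Template:Coord' --mwversion 1.42 \
#       --sqlfiles 'dumps/enwiki-{t}.sql.gz' --output /some/dir --verbose
#
if __name__ == "__main__":
    do_main()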