def _wiki_dump_to_huge_math_pages_one(env_dict, wiki_xml_file):
    """
        Grab one huge wiki page and have fun with it while
        creating one huge math page file.
    """
    wiki_xml_math_output = env_dict["wiki"][
        "xml_math_output"] % os.path.basename(wiki_xml_file)
    logger.warning(u"Started extracting math pages from [%s] to [%s]",
                   wiki_xml_file, wiki_xml_math_output)

    # load wiki dump
    #
    wiki_page_dumper = dump.pager(wiki_xml_file,
                                  env_dict["pager"]["delimiter"],
                                  env_dict["pager"]["buffer"])

    # for all pages and for all wiki maths
    # - try to find must_exist
    # -- if true output
    #
    must_exist = env_dict["pager"]["identify_by"]
    with codecs.open(wiki_xml_math_output, encoding='utf-8',
                     mode='wb') as huge_math_output:
        math_pages = 0
        for pages_done, page in enumerate(wiki_page_dumper.pages()):
            if page and must_exist in page:
                math_pages += 1
                #logger.info( u"Pages done:[%d] Math:[%d]", pages_done, math_pages )
                huge_math_output.write(page)
            else:
                if not page:
                    logger_suspicious.warning(u"Page is null - [%d]", pages_done)
    logger.info(u"Stopped extracting math pages from [%s] to [%s], total [%s]",
                wiki_xml_file, wiki_xml_math_output, math_pages)
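The internals of dump.pager are not shown here. A minimal sketch of how such a delimiter-based pager could work, assuming pages are terminated by a fixed delimiter string such as u"</page>" and the dump is read in large buffered chunks; the class name simple_pager and the exact splitting behaviour are assumptions, not the real dump.pager API.

import codecs

class simple_pager(object):
    # Hypothetical stand-in for dump.pager: streams a huge UTF-8 file
    # and yields one page per delimiter occurrence.
    def __init__(self, file_name, delimiter, buffer_size):
        self._file_name = file_name
        self._delimiter = delimiter
        self._buffer_size = buffer_size

    def pages(self):
        """ Yield one page at a time, delimiter included. """
        rest = u""
        with codecs.open(self._file_name, encoding='utf-8') as fin:
            while True:
                chunk = fin.read(self._buffer_size)
                if not chunk:
                    break
                rest += chunk
                # emit every complete page accumulated so far
                while self._delimiter in rest:
                    page, rest = rest.split(self._delimiter, 1)
                    yield page + self._delimiter
        # whatever trails the last delimiter
        if rest.strip():
            yield rest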
def _huge_math_page_texhtml(env_dict):
    """
        Grab one huge wiki page and have fun with it while
        creating all pages.
    """
    wiki_xml_math_output = env_dict["wiki"]["big_xml"]
    #wiki_xml_math_output = env_dict["wiki"]["xml_math_output_big"]
    wiki_xml_math_output = env_dict["wiki"]["xml_math_output_test"]

    # load wiki dump
    #
    wiki_page_dumper = dump.pager(wiki_xml_math_output,
                                  env_dict["pager"]["delimiter"],
                                  env_dict["pager"]["buffer"])
    from HTMLParser import HTMLParser
    ht = HTMLParser()
    titles = []
    title_pattern = re.compile(env_dict["pager"]["re_title"], re.DOTALL)
    uniq = set()

    # def do_texhtml( page ):
    #     total = 0
    #     for r in re.compile(u"<.*?texhtml.*?>(.*?)</.*?>").finditer(page):
    #         found = True
    #         total += 1
    #         #html = r.group()
    #         #msg = u"%s\n\t%s\n\t%s" % (ht.unescape(html), html, r.group(1))
    #         norm = converters.latex.normalise(r.group(1).strip())
    #         if not norm in uniq:
    #             uniq.add(norm)
    #             msg = ht.unescape(r.group(1)).replace(u"\xa0", u" "). \
    #                 replace(u"<sub>", u"_"). \
    #                 replace(u"<sup>", u"^"). \
    #                 replace(u"<var >", u" ")
    #             logger.info(msg)
    #     return total

    def do_title(page):
        try:
            title = title_pattern.search(page).group(1)
            titles.append(title)
        except Exception:
            logger.warning(u"Could not parse title [%s]", page[:500])

    # try to load pickled mathml (ok/fail)
    # <span class="texhtml">?</span>
    total = 0
    total_pages = 0
    pages_done = 0
    for pages_done, page in enumerate(
            wiki_page_dumper.pages(templates.htmltemplate)):
        if pages_done % 100000 == 0:
            logger.info(
                u"Total formulas: %s, On pages: %s, Unique: %s, Done [%s]" %
                (total, total_pages, len(uniq), pages_done))
        do_title(page)
        # found = do_texhtml( page )
        # if found > 0:
        #     total_pages += 1

    if len(titles) > 0:
        with codecs.open("all.titles", mode="w+", encoding="utf-8") as fout:
            for title in titles:
                fout.write(title + "\n")

    print "Pages done: %s, Total formulas: %s, On pages: %s, Unique: %s" % \
        (pages_done, total, total_pages, len(uniq))
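The title pattern is read from configuration, so its exact form is not visible here. A minimal, self-contained sketch of the do_title step, assuming env_dict["pager"]["re_title"] is a MediaWiki-style pattern like u"<title>(.*?)</title>"; the real configured pattern may differ.

import re

# assumed pattern; the configured re_title may not be identical
title_pattern = re.compile(u"<title>(.*?)</title>", re.DOTALL)
page = u"<page><title>Euler's formula</title><text>...</text></page>"
match = title_pattern.search(page)
if match is not None:
    print match.group(1)  # -> Euler's formula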
def _huge_math_page_to_pages(env_dict):
    """
        Grab one huge wiki page and have fun with it while
        creating all pages.
    """
    import _math

    wiki_xml_math_output = env_dict["wiki"]["xml_math_output_big"]
    #wiki_xml_math_output = env_dict["wiki"]["xml_math_output_test"]

    from indexer.egomath.interface import egomath_inst
    egomath_inst.reset_logging()

    wiki_pages_output = env_dict["wiki"]["pages_output"]
    pickle_mathml_ok = env_dict["converters"]["latexml"]["pickle_ok"]
    pickle_mathml_fail = env_dict["converters"]["latexml"]["pickle_fail"]
    logger.info(u"Started separating pages from [%s] to [%s]",
                wiki_xml_math_output, wiki_pages_output)

    # load wiki dump
    #
    wiki_page_dumper = dump.pager(wiki_xml_math_output,
                                  env_dict["pager"]["delimiter"],
                                  env_dict["pager"]["buffer"])

    # try to load pickled mathml (ok/fail)
    #
    converted_mathml = None
    if env_dict["mathml"]["convert"] == "pickle":
        buffering = 100 * 1024 * 1024
        converted_mathml = _math.mathpickles(pickle_mathml_ok,
                                             pickle_mathml_fail,
                                             buffering=buffering)
    elif env_dict["mathml"]["convert"] == "db":
        converted_mathml = _math.mathdb(env_dict)

    latex_pattern = env_dict["pager"]["re_math"]
    title_pattern = re.compile(env_dict["pager"]["re_title"], re.DOTALL)
    total_formula_count = 0
    formula_unique = set() if env_dict["wiki"]["collect_stats"] else None
    pages_done = 0
    converted_mathml_cnt = 0
    from collections import defaultdict
    pages_formula = defaultdict(int)

    # for all pages and for all wiki maths
    #
    for pages_done, page in enumerate(
            wiki_page_dumper.pages(templates.htmltemplate)):
        logger.info(u'Done %d pages', pages_done)

        # if title already exists do not write
        try:
            title = title_pattern.search(page).group(1).replace(" ", "_")
            url = u"http://en.wikipedia.org/wiki/%s" % title
            assert u"$title" not in title
            page_store = _math.page_to_store(wiki_pages_output, title + ".html")
            if not env_dict["pager"]["overwrite"] and page_store.exists():
                logger.warning(u"Page exists [%s] [%d]", title, pages_done)
                continue
        except Exception, e:
            logger.error(u"Could not store page because of %s", repr(e))
            continue

        from _parser import parser
        page = parser.preprocess_page_math(env_dict, page)

        # the page we got should be wiki tag free; however, it will still
        # contain basic math like <math> B \gt </math>, which can contain
        # non-LaTeX characters such as > instead of \gt
        # - we must fix this
        #
        page_replacements = []
        page_formula_count = 0
        for wiki_math_iter in latex_pattern.finditer(page):
            page_formula_count += 1
            total_formula_count += 1
            page_replacements += \
                _math.convert_wikimath_to_realmath(env_dict,
                                                   wiki_math_iter,
                                                   converted_mathml,
                                                   url,
                                                   title,
                                                   total_formula_count,
                                                   formula_unique)
        pages_formula[page_formula_count] += 1

        info_msg = u"# of formulae on page [%s] is [%d], total [%d]" % (
            utils.ascii(title), page_formula_count, total_formula_count)
        if page_formula_count == 0:
            logger_suspicious.warning(info_msg + u" -> skipping 0.")
            logger.warning(info_msg)
            continue
        else:
            logger.warning(info_msg)

        # create the page - splice the (start, end, replacement) spans
        # back into the page text
        #
        tmp = ""
        last = 0
        for (s, e, r) in page_replacements:
            tmp += page[last:s] + r
            last = e
        # keep the tail after the last replaced span (also correct when
        # page_replacements is empty)
        tmp += page[last:]
        page = tmp

        # store the page
        try:
            page_store.store(page)
        except IOError, e:
            logger.error(u"Could not store [%s] page because of %s",
                         title, repr(e))
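The "create the page" step above is a span-splicing pass: non-overlapping (start, end, replacement) tuples are applied left to right to rebuild the string. A minimal, self-contained sketch of that technique; the sample page and span values here are hypothetical.

def splice(text, replacements):
    # apply (start, end, replacement) spans, sorted by start offset
    out = ""
    last = 0
    for (s, e, r) in sorted(replacements):
        out += text[last:s] + r  # keep text up to the span, then substitute
        last = e
    return out + text[last:]     # keep the tail after the last span

page = u"x <math>B \\gt C</math> y"
spans = [(2, 22, u"<math>B > C</math>")]   # hypothetical converted formula
print splice(page, spans)  # -> x <math>B > C</math> y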
import re

import dump
import utils


class elements(object):
    logger = utils.logger("wiki.statistics.elements")

    def __init__(self, re_expression):
        elements.logger.info(u"Initializing with re_expression=%s",
                             re_expression)
        self._re = re.compile(re_expression, re.DOTALL)

    def __call__(self, pages):
        size = 0
        for page in pages:
            els = self._re.findall(page)
            print utils.ascii(u"\n".join(els))
            size += len(els)
        print u"Total size: %d" % size


if __name__ == "__main__":
    import logging
    logging.basicConfig(level=logging.DEBUG)

    MB = 1024 * 1024
    wikier = dump.pager(r"../output_math/math.pages", 50 * MB)
    elements(u"<title>(.*?)</title>")(wikier.pages())
    print "Finished importing %s" % __file__
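The counter is regex-agnostic, so pointing it at other elements is a one-line change. A hypothetical follow-up run that counts <math> elements instead of titles, assuming the same dump path as the __main__ example above; the math regex is an assumption.

# hypothetical: count <math> elements in the same pages file
MB = 1024 * 1024
wikier = dump.pager(r"../output_math/math.pages", 50 * MB)
elements(u"<math.*?>(.*?)</math>")(wikier.pages())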