def wiki2sentences(wiki, sent_detector, withTags=True):
    # get rid of (nested) template calls
    oldLen = 1E10
    while len(wiki) < oldLen:
        oldLen = len(wiki)
        wiki = re.sub('{[^{}]*}', ' ', wiki)
    tree = parse_txt(wiki)
    text = tree2string(tree)
    lines = cleanup(text).split('\n')
    sentences = []
    tags = []
    for line in lines:
        if line.startswith('<s>'):
            sentences.append(line[3:].strip())
            tags.append('Section')
        elif line.startswith('<i>'):
            sentences.append(line[3:].strip())
            tags.append('Item')
        else:
            newSentences = sent_detector(line.strip())
            sentences += newSentences
            tags += ['Sentence'] * (len(newSentences) - 1)
            tags.append('LastSentence')
    if withTags:
        return sentences, tags
    else:
        return sentences

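# --- Hypothetical usage sketch (not part of the original snippet): any callable
# that splits a string into a list of sentences can serve as sent_detector; one
# common choice is NLTK's punkt tokenizer, assuming nltk and its punkt data are
# installed and the module-level helpers used above (re, parse_txt, tree2string,
# cleanup) are importable.
import nltk.data

punkt = nltk.data.load('tokenizers/punkt/english.pickle')
raw_markup = u"'''Anarchism''' is a political philosophy. It questions authority."
sentences, tags = wiki2sentences(raw_markup, punkt.tokenize)
for sent, tag in zip(sentences, tags):
    print tag, sent
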
def _process_page(self, elem):
    """
    Gather frequency distribution of a page,
    category names, and linked page names.
    """
    # Get the text we need.
    #id = int(self._find(elem, 'id').text)
    title = self._find(elem, 'title').text
    #datetime = self._find(elem, 'revision', 'timestamp').text
    text = self._find(elem, 'revision', 'text').text
    redirect = self._find(elem, 'redirect')

    # `title` should be the canonical title, i.e. the 'official'
    # title of a page. If the page redirects to another (the canonical
    # page), the <redirect> elem contains the canonical title to which
    # the page redirects.
    if redirect is not None:
        title = redirect.attrib.get('title')

    # Extract certain elements.
    # pagelinks is a list of linked page titles.
    # categories is a list of this page's category titles.
    result = parse_txt(text)
    pagelinks = [pagelink.full_target for pagelink in result.find(parser.ArticleLink)]

    # Have to trim the beginning of Category links.
    # So 'Category:Anarchism' becomes just 'Anarchism'.
    cstart = len('Category:')
    categories = [category.full_target[cstart:] for category in result.find(parser.CategoryLink)]

    # Build the bag of words representation of the document.
    clean_text = self._clean(text)

    return clean_text

def simpleparse(raw, lang=None):
    # !!! USE FOR DEBUGGING ONLY !!! does not use post processors
    a = compat.parse_txt(raw, lang=lang)
    core.show(a)
    return a

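# --- Hypothetical usage sketch (not part of the original snippet): dump the
# parse tree of a small piece of wikitext to stdout while debugging. Assumes
# the surrounding module's imports (compat, core) are available.
tree = simpleparse(u"== Heading ==\nSome ''italic'' text with a [[Link]].")
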
def parseString(title=None, raw=None, wikidb=None, revision=None,
                lang=None, magicwords=None, expandTemplates=True):
    """parse article with title from raw mediawiki text"""
    uniquifier = None
    siteinfo = None
    assert title is not None, 'no title given'
    if raw is None:
        page = wikidb.normalize_and_get_page(title, 0)
        if page:
            raw = page.rawtext
        else:
            raw = None
    assert raw is not None, "cannot get article %r" % (title,)
    input_raw = raw
    te = None
    if wikidb:
        if expandTemplates:
            te = expander.Expander(raw, pagename=title, wikidb=wikidb)
            input_raw = te.expandTemplates(True)
            uniquifier = te.uniquifier
        if hasattr(wikidb, 'get_siteinfo'):
            siteinfo = wikidb.get_siteinfo()
        src = None
        if hasattr(wikidb, 'getSource'):
            src = wikidb.getSource(title, revision=revision)
            assert not isinstance(src, dict)
        if not src:
            src = metabook.source()
        if lang is None:
            lang = src.language
        if magicwords is None:
            if siteinfo is not None and 'magicwords' in siteinfo:
                magicwords = siteinfo['magicwords']
            else:
                magicwords = src.get('magicwords')
    if siteinfo is None:
        nshandler = nshandling.get_nshandler_for_lang(lang)
    else:
        nshandler = nshandling.nshandler(siteinfo)

    a = compat.parse_txt(input_raw, title=title, wikidb=wikidb, nshandler=nshandler,
                         lang=lang, magicwords=magicwords, uniquifier=uniquifier,
                         expander=te)

    a.caption = title
    if te and te.magic_displaytitle:
        a.caption = te.magic_displaytitle

    from mwlib.old_uparser import postprocessors
    for x in postprocessors:
        x(a, title=title, revision=revision, wikidb=wikidb, lang=lang)

    return a

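# --- Hypothetical usage sketch (not part of the original snippet): when raw
# wikitext is supplied and no wikidb is given, the template-expansion branch is
# skipped and only parsing and postprocessing run. lang is passed explicitly so
# the namespace handler does not have to guess a default.
article = parseString(title=u"Example",
                      raw=u"'''Example''' is a [[test page]].",
                      lang="en")
print article.caption
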
def get_media_for_article(self, topic):
    media = []
    try:
        client = Client()
        markup = self.get_markup_for_topic(topic)
        article = compat.parse_txt(markup)
        self.depth_find_media(article, topic, media)
        #print "toplc: " + topic
        #print media
    except:
        pass
    return media

def get_article(self, topic):
    global usejsoncache
    global threaded
    cacheDir = "./json_cache/{}".format(self.wiki_version)
    try:
        os.mkdir(cacheDir)
    except:
        pass
    filename = "{}/{}".format(cacheDir, topic)
    if usejsoncache is True and os.path.isfile(filename):
        print "cache hit for " + topic
        js = open(filename).read()
    else:
        markup = self.get_markup_for_topic(topic)
        templParser = mwlib.templ.parser.Parser(markup)
        templates = templParser.parse()
        print "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
        markup = ""
        for t in templates:
            # print "==>{} {}<==".format(type(t), t)
            if isinstance(t, unicode):
                markup += t
            elif isinstance(t, mwlib.templ.nodes.Template):
                print "==>{}<==".format(t[0])
                if t[0] == "Wide image":
                    print " -->{}<--".format(t[1][0])
                    markup += " [[File:{}]] ".format(t[1][0])
        # print article
        print "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
        markup = expandstr(markup)
        # print markup
        article = compat.parse_txt(markup)
        self.reset()
        if threaded:
            self.build_related_media(article)
        else:
            print "Not performing threaded processing"
            self.depth_first(article)
        obj = {"content": self.content,
               "title": topic,
               "url": "https://{}.wikipedia.org/wiki/{}".format(self.wiki_version, topic)}
        js = json.dumps(obj, indent=2)
        open(filename, "w").write(js)
    return js

def wiki2sentences(wiki, sent_detector, withTags=True):
    # save the dates (made obsolete by introduction of wikitemplate2text)
    #wiki = re.sub('{{date\|([^{}]*?)}}', r'\1', wiki)

    # repeat since everything can be nested
    oldLen = 1E10
    while len(wiki) < oldLen:
        oldLen = len(wiki)
        # eliminate html comments. e.g. <!--This is a comment-->
        wiki = re.sub('<!--.*?-->', '', wiki)
        # eliminate wiki tables
        # commented out because mwlib takes care of them
        #wiki = re.sub('{\|[^{}]*?\|}', wikitable2text, wiki)
        # eliminate wiki templates. e.g. {{date}}
        wiki = re.sub('{{[^{}]*}}', wikitemplate2text, wiki)
        # eliminate reference tags. e.g. <ref>text</ref>
        wiki = re.sub('<ref[^/>]*?>[^<]*?</ref>', '', wiki)
        # eliminate html tags. e.g. <ref name="My Life"/>
        wiki = re.sub('</?[A-Za-z][^>]*?>', '', wiki)

    # remove angle brackets
    # mwlib restores the changes, so do nothing.
    #wiki = re.sub('<', '<', wiki)
    #wiki = re.sub('>', '>', wiki)

    #print wiki.encode('utf-8')
    tree = parse_txt(wiki)
    text = tree2string(tree)
    lines = cleanup(text).split('\n')
    sentences = []
    tags = []
    for line in lines:
        if line.startswith('<s>'):
            sentences.append(line[3:].strip())
            tags.append('Section')
        elif line.startswith('<i>'):
            newSentences = sent_detector(line[3:].strip())
            sentences += newSentences
            tags += ['Item-Sentence'] * (len(newSentences) - 1)
            tags.append('Item-LastSentence')
        else:
            newSentences = sent_detector(line.strip())
            sentences += newSentences
            tags += ['Sentence'] * (len(newSentences) - 1)
            tags.append('LastSentence')
    if withTags:
        return sentences, tags
    else:
        return sentences

def extractPage(wiki_text_n, path, wiki_text, verbose=False):
    try:
        page = extractText(parse_txt(wiki_text))
        music_info = music_re.search(page)
        if music_info:
            music_info_end = infobox_end(page, music_info.end())
            if music_info_end > 0:
                infobox = page[music_info.start() + 2:music_info_end]
                if verbose:
                    print "Page:", wiki_text_n
                    print "Infobox:", infobox
                infobox = "".join(infobox.split("\n"))
                infobox = "".join(infobox.split("\r"))
                infobox = stripmacros(infobox, " ")
                if verbose:
                    print "Infobox 1:", infobox
                infobox = infobox.split("|")
                if verbose:
                    for ib in infobox:
                        print "Line: [%s]\n" % (ib)
                attrs = "<Metadata MyRadioType=\"" + music_info.group(2) + "\" "
                ibattrs = {}
                for info in infobox:
                    item = info.split("=", 1)
                    if len(item) == 2:
                        key = nopunct(item[0]).strip().replace(" ", "_")
                        value = item[1].strip()
                        if value.endswith("}}"):
                            value = value[:-2]
                        ibattrs[key] = saxutils.quoteattr(striptags(saxutils.unescape(value), ","))
                for (key, value) in ibattrs.items():
                    attrs += " %s=%s" % (key, value)
                attrs += "/>\n"
                if verbose:
                    print "Extracting:", attrs
                page = page[:music_info.start()] + page[music_info_end:]
                page = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n<Document>\n" + attrs + page + "</Document>\n"
                page = stripmacros(page)
                codecs.open(os.path.join(path, "%08d.xml" % (wiki_text_n)), "w", "utf-8", "replace").write(page)
    except Exception, e:
        if verbose:
            print "Error at %d was %s" % (wiki_text_n, str(e))
        codecs.open(os.path.join(path, "%08d.raw" % (wiki_text_n)), "w", "utf-8", "replace").write(wiki_text)

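# --- Hypothetical usage sketch (not part of the original snippet): wiki_pages
# stands in for an iterable of raw page texts. extractPage writes numbered .xml
# output for pages whose infobox is extracted, and a numbered .raw dump when
# extraction raises.
for n, text in enumerate(wiki_pages):
    extractPage(n, "output_dir", text, verbose=True)
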
def _parse(txt):
    """parse text....and try to return a 'better' (some inner) node"""
    from mwlib.refine.compat import parse_txt
    from mwlib import parser

    res = parse_txt(txt)

    # res is an parser.Article.
    if len(res.children) != 1:
        res.__class__ = parser.Node
        return res

    res = res.children[0]
    if res.__class__ == parser.Paragraph:
        res.__class__ = parser.Node

    return res

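# --- Hypothetical usage sketch (not part of the original snippet): for a single
# paragraph of wikitext, _parse unwraps the Article/Paragraph layers and hands
# back the inner node so callers can work on its children directly.
node = _parse(u"Some ''formatted'' text with a [[Link]].")
for child in node.children:
    print child.__class__.__name__
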
def get_random_image(self):
    global random_image_updated
    global random_image
    now = time.time()
    d = now - random_image_updated
    if d > 60:  # Refresh every minute.
        i = 0
        media = []
        while len(media) == 0 and i < 100:
            url = "https://{}.wikipedia.org/wiki/Special:Random?action=raw".format(self.wiki_version)
            response = requests.get(url, timeout=2.0)
            markup = response.text
            topic = urlparse.parse_qs(urlparse.urlparse(response.url).query)["title"]
            # print markup
            article = compat.parse_txt(markup)
            self.depth_find_media(article, topic, media)
            i += 1
        if len(media) > 0:
            random_image = media[0]
            random_image_updated = now
    return random_image

    print "Fail!"
    sys.exit(2)

raw = wc.read()
rdata = json.loads(raw)
wc.close()

page = rdata['query']['pages'].itervalues().next()
if not page:
    print "NO page found"
    sys.exit(3)

revision = page['revisions'][0]
if not revision:
    print "NO revision found"
    sys.exit(4)

content = revision[str(revision.keys()[0])]
parsed = compat.parse_txt(content)
table = parsed.find(parser.Table)[0]
if not table:
    print "Table not found"
    sys.exit(5)

for row in table.children:
    cells = row.find(parser.Cell)
    print cells[0].asText().replace("}}", "").replace("{{", "").strip() + \
        " || " + cells[1].asText().strip() + " || " + cells[2].asText().strip() + \
        " || " + cells[3].asText().strip()

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import time

s = unicode(open(sys.argv[1], "rb").read(), "utf-8")

from mwlib import uparser, advtree, treecleaner
from mwlib.refine import compat

stime = time.time()
r = compat.parse_txt(s)
print "parse:", time.time() - stime

stime = time.time()
advtree.buildAdvancedTree(r)
print "tree", time.time() - stime

stime = time.time()
tc = treecleaner.TreeCleaner(r)
tc.cleanAll()
print "clean:", time.time() - stime
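
# Hypothetical invocation of the timing script above (the file name is
# illustrative): it expects a UTF-8 wikitext file as its only argument and
# prints wall-clock seconds for parsing, advanced-tree building, and cleaning.
#
#   python time_parse.py article.wikitext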