Example #1
def wiki2sentences(wiki, sent_detector, withTags=True):
    # get rid of (nested) template calls 
    oldLen = 1E10
    while len(wiki)<oldLen:
        oldLen = len(wiki)
        wiki = re.sub('{[^{}]*}',' ',wiki)

    tree = parse_txt(wiki)
    text = tree2string(tree)
    lines = cleanup(text).split('\n')
    sentences = []
    tags = []
    for line in lines:
        if line.startswith('<s>'):
            sentences.append(line[3:].strip())
            tags.append('Section')
        elif line.startswith('<i>'):
            sentences.append(line[3:].strip())
            tags.append('Item')
        else:
            newSentences = sent_detector(line.strip())
            sentences += newSentences
            tags += ['Sentence']*(len(newSentences)-1)
            tags.append('LastSentence')
    if withTags:
        return sentences,tags
    else:
        return sentences
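A minimal usage sketch for the function above, assuming parse_txt, tree2string and cleanup come from the surrounding module; naive_sent_detector is a hypothetical stand-in for whatever sentence splitter (e.g. an NLTK punkt tokenizer) the caller normally passes in.

import re

def naive_sent_detector(text):
    # hypothetical splitter: break on sentence-final punctuation plus whitespace
    return [s for s in re.split(r'(?<=[.!?])\s+', text) if s]

wikitext = u"{{Infobox city|name=Paris}} '''Paris''' is the capital of France. It lies on the Seine."
sentences, tags = wiki2sentences(wikitext, naive_sent_detector)
# tags labels each sentence as 'Section', 'Item', 'Sentence' or 'LastSentence'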
Example #2
    def _process_page(self, elem):
        """
        Gather frequency distribution of a page,
        category names, and linked page names.
        """

        # Get the text we need.
        #id          = int(self._find(elem, 'id').text)
        title       = self._find(elem, 'title').text
        #datetime    = self._find(elem, 'revision', 'timestamp').text
        text        = self._find(elem, 'revision', 'text').text
        redirect    = self._find(elem, 'redirect')

        # `title` should be the canonical title, i.e. the 'official'
        # title of a page. If the page redirects to another (the canonical
        # page), the <redirect> elem contains the canonical title to which
        # the page redirects.
        if redirect is not None:
            title = redirect.attrib.get('title')

        # Extract certain elements.
        # pagelinks is a list of linked page titles.
        # categories is a list of this page's category titles.
        result = parse_txt(text)
        pagelinks = [pagelink.full_target for pagelink in result.find(parser.ArticleLink)]

        # Have to trim the beginning of Category links.
        # So 'Category:Anarchism' becomes just 'Anarchism'.
        cstart = len('Category:')
        categories = [category.full_target[cstart:] for category in result.find(parser.CategoryLink)]

        # Build the bag of words representation of the document.
        clean_text = self._clean(text)
        return clean_text
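A short sketch of the link and category extraction used above, assuming parse_txt is mwlib.refine.compat.parse_txt and parser is mwlib.parser, with default (English) namespace handling.

from mwlib.refine.compat import parse_txt
from mwlib import parser

sample = u"Anarchism is a [[political philosophy]]. [[Category:Anarchism]]"
tree = parse_txt(sample)
pagelinks = [link.full_target for link in tree.find(parser.ArticleLink)]
categories = [cat.full_target[len('Category:'):]
              for cat in tree.find(parser.CategoryLink)]
# pagelinks holds the linked page titles, categories the category names
# with the 'Category:' prefix trimmed off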
Example #3
def simpleparse(
        raw,
        lang=None
):  # !!! USE FOR DEBUGGING ONLY !!! does not use post processors
    a = compat.parse_txt(raw, lang=lang)
    core.show(a)
    return a
Example #5
def parseString(title=None, raw=None, wikidb=None, revision=None,
                lang=None, magicwords=None, expandTemplates=True):
    """parse article with title from raw mediawiki text"""

    uniquifier = None
    siteinfo = None
    assert title is not None, 'no title given'

    if raw is None:
        page = wikidb.normalize_and_get_page(title, 0)
        if page:
            raw = page.rawtext
        else:
            raw = None

        assert raw is not None, "cannot get article %r" % (title,)
    input_raw = raw
    te = None
    if wikidb:
        if expandTemplates:
            te = expander.Expander(raw, pagename=title, wikidb=wikidb)
            input_raw = te.expandTemplates(True)
            uniquifier = te.uniquifier
        if hasattr(wikidb, 'get_siteinfo'):
            siteinfo = wikidb.get_siteinfo()

        src = None
        if hasattr(wikidb, 'getSource'):
            src = wikidb.getSource(title, revision=revision)
            assert not isinstance(src, dict)

        if not src:
            src = metabook.source()

        if lang is None:
            lang = src.language

        if magicwords is None:
            if siteinfo is not None and 'magicwords' in siteinfo:
                magicwords = siteinfo['magicwords']
            else:
                magicwords = src.get('magicwords')

    if siteinfo is None:
        nshandler = nshandling.get_nshandler_for_lang(lang)
    else:
        nshandler = nshandling.nshandler(siteinfo)
    a = compat.parse_txt(input_raw, title=title, wikidb=wikidb, nshandler=nshandler,
                         lang=lang, magicwords=magicwords, uniquifier=uniquifier,
                         expander=te)

    a.caption = title
    if te and te.magic_displaytitle:
        a.caption = te.magic_displaytitle

    from mwlib.old_uparser import postprocessors
    for x in postprocessors:
        x(a, title=title, revision=revision, wikidb=wikidb, lang=lang)

    return a
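A minimal call sketch for parseString on raw wikitext only (no wikidb), assuming the module-level imports used above (expander, metabook, nshandling, compat) are in place; lang is given explicitly since there is no source object to read it from.

article = parseString(title=u"Example", raw=u"'''Hello''' [[world]]", lang="en")
# article is the parsed tree; article.caption is set from the title (u'Example' here)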
Example #6
    def get_media_for_article(self, topic):
        media = []
        try:
            client = Client()
            markup = self.get_markup_for_topic(topic)
            article = compat.parse_txt(markup)
            self.depth_find_media(article, topic, media)
            #print "toplc: " + topic
            #print media
        except:
            pass

        return media
Example #7
    def get_article(self, topic):
        global usejsoncache
        global threaded
        
        cacheDir = "./json_cache/{}".format(self.wiki_version)
        try:
            os.mkdir(cacheDir)
        except:
            pass
            
        filename = "{}/{}".format(cacheDir, topic)
        if usejsoncache is True and os.path.isfile(filename):
            print "cache hit for " + topic
            js = open(filename).read()
        else:
            markup = self.get_markup_for_topic(topic)

            templParser = mwlib.templ.parser.Parser(markup)
            templates = templParser.parse()
            print "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
            markup = ""
            for t in templates:
                # print "==>{} {}<==".format(type(t), t)
                if isinstance(t, unicode):
                    markup+= t

                elif isinstance(t, mwlib.templ.nodes.Template):
                    print "==>{}<==".format(t[0])
                    if t[0] == "Wide image":
                        print "  -->{}<--".format(t[1][0])
                        markup+= " [[File:{}]] ".format(t[1][0])

            # print article
            print "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"

            markup = expandstr(markup)
            # print markup
            article = compat.parse_txt(markup)

            self.reset()
            if threaded:
                self.build_related_media(article)
            else:
                print "Not performing threaded processing"
            self.depth_first(article)
        
            obj = {"content":self.content, "title":topic, "url":"https://{}.wikipedia.org/wiki/{}".format(self.wiki_version, topic), "url":"https://{}.wikipedia.org/wiki/{}".format(self.wiki_version, topic)}
            js = json.dumps(obj, indent=2)
            
            open(filename, "w").write(js)
        return js
Example #8
def wiki2sentences(wiki, sent_detector, withTags=True):
	# save the dates (made obsolete by introduction of wikitemplate2text)
	#wiki = re.sub('{{date\|([^{}]*?)}}', r'\1',  wiki)

	# repeat since everything can be nested
	oldLen = 1E10
	while len(wiki) < oldLen:
		oldLen = len(wiki)
		# eliminates html comments, e.g. <!--This is a comment-->
		wiki = re.sub('<!--.*?-->', '', wiki)
		# eliminate wiki tables
		# commented out because mwlib takes care of them
		#wiki = re.sub('{\|[^{}]*?\|}', wikitable2text, wiki)
		# eliminate wiki templates. e.g. {{date}}
		wiki = re.sub('{{[^{}]*}}', wikitemplate2text, wiki)
		# eliminate reference tags, e.g. <ref>text</ref>
		wiki = re.sub('<ref[^/>]*?>[^<]*?</ref>', '', wiki)
		# eliminate html tags. e.g. <ref name="My Life"/>
		wiki = re.sub('</?[A-Za-z][^>]*?>', '', wiki)
	
	# remove angle brackets
	# mwlib restores the changes, so do nothing.
	#wiki = re.sub('<', '&lt;', wiki)
	#wiki = re.sub('>', '&gt;', wiki)
	#print wiki.encode('utf-8')

	tree = parse_txt(wiki)
	text = tree2string(tree)
	lines = cleanup(text).split('\n')
	sentences = []
	tags = []
	for line in lines:
		if line.startswith('<s>'):
			sentences.append(line[3:].strip())
			tags.append('Section')
		elif line.startswith('<i>'):
			newSentences = sent_detector(line[3:].strip())
			sentences += newSentences
			tags += ['Item-Sentence']*(len(newSentences)-1)
			tags.append('Item-LastSentence')
		else:
			newSentences = sent_detector(line.strip())
			sentences += newSentences
			tags += ['Sentence']*(len(newSentences)-1)
			tags.append('LastSentence')
	if withTags:
		return sentences,tags
	else:
		return sentences
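A standalone sketch of the stripping loop above: the substitutions repeat until the text stops shrinking, so inner templates disappear before the templates that enclose them. Templates are simply dropped here, whereas the example routes them through the wikitemplate2text helper.

import re

wiki = u"Born {{date|{{circa}} 1900}}<!--unsourced--> in [[Paris]].<ref>Bio</ref>"
oldLen = 1E10
while len(wiki) < oldLen:
    oldLen = len(wiki)
    wiki = re.sub('<!--.*?-->', '', wiki)
    wiki = re.sub('{{[^{}]*}}', '', wiki)  # only innermost templates match
    wiki = re.sub('<ref[^/>]*?>[^<]*?</ref>', '', wiki)
    wiki = re.sub('</?[A-Za-z][^>]*?>', '', wiki)
# the loop leaves u"Born  in [[Paris]]."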
Example #9
def extractPage(wiki_text_n, path, wiki_text, verbose=False):
    try:
        page = extractText(parse_txt(wiki_text))
        music_info = music_re.search(page)
        if music_info:
            music_info_end = infobox_end(page, music_info.end())
            if music_info_end > 0:
                infobox = page[music_info.start() + 2:music_info_end]
                if verbose:
                    print "Page:", wiki_text_n
                    print "Infobox:", infobox
                infobox = "".join(infobox.split("\n"))
                infobox = "".join(infobox.split("\r"))
                infobox = stripmacros(infobox, " ")
                if verbose:
                    print "Infobox 1:", infobox
                infobox = infobox.split("|")
                if verbose:
                    for ib in infobox:
                        print "Line: [%s]\n" % (ib)
                attrs = "<Metadata MyRadioType=\"" + music_info.group(
                    2) + "\" "
                ibattrs = {}
                for info in infobox:
                    item = info.split("=", 1)
                    if len(item) == 2:
                        key = nopunct(item[0]).strip().replace(" ", "_")
                        value = item[1].strip()
                        if value.endswith("}}"):
                            value = value[:-2]
                        ibattrs[key] = saxutils.quoteattr(
                            striptags(saxutils.unescape(value), ","))
                for (key, value) in ibattrs.items():
                    attrs += " %s=%s" % (key, value)
                attrs += "/>\n"
                if verbose:
                    print "Extracting:", attrs
                page = page[:music_info.start()] + page[music_info_end:]
                page = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n<Document>\n" + attrs + page + "</Document>\n"
                page = stripmacros(page)
                codecs.open(os.path.join(path, "%08d.xml" % (wiki_text_n)),
                            "w", "utf-8", "replace").write(page)
    except Exception, e:
        if verbose:
            print "Error at %d was %s" % (wiki_text_n, str(e))
        codecs.open(os.path.join(path, "%08d.raw" % (wiki_text_n)), "w",
                    "utf-8", "replace").write(wiki_text)
Example #10
def _parse(txt):
    """parse text....and try to return a 'better' (some inner) node"""
    from mwlib.refine.compat import parse_txt
    from mwlib import parser

    res = parse_txt(txt)

    # res is a parser.Article.
    if len(res.children) != 1:
        res.__class__ = parser.Node
        return res

    res = res.children[0]

    if res.__class__ == parser.Paragraph:
        res.__class__ = parser.Node

    return res
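A quick usage sketch for _parse: for a single paragraph of input it returns the inner node (the former Paragraph, recast as parser.Node) rather than the enclosing Article.

node = _parse(u"plain text with a [[link]]")
# node is the former Paragraph; its children are the text and the [[link]] node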
Example #12
def extractPage( wiki_text_n, path, wiki_text, verbose = False ):
    try:
        page = extractText(parse_txt(wiki_text))
        music_info = music_re.search(page)
        if music_info:
            music_info_end = infobox_end(page,music_info.end())
            if music_info_end > 0:
                infobox = page[music_info.start()+2:music_info_end]
                if verbose:
                    print "Page:",wiki_text_n
                    print "Infobox:",infobox
                infobox = "".join(infobox.split("\n"))
                infobox = "".join(infobox.split("\r"))
                infobox = stripmacros(infobox," ")
                if verbose:
                    print "Infobox 1:",infobox
                infobox = infobox.split("|")
                if verbose:
                    for ib in infobox:
                        print "Line: [%s]\n"%(ib)
                attrs = "<Metadata MyRadioType=\"" + music_info.group(2) + "\" "
                ibattrs = {}
                for info in infobox:
                    item = info.split("=",1)
                    if len(item) == 2:
                        key = nopunct(item[0]).strip().replace(" ","_")
                        value = item[1].strip()
                        if value.endswith("}}"):
                            value = value[:-2]
                        ibattrs[key] = saxutils.quoteattr(striptags(saxutils.unescape(value),","))
                for (key,value) in ibattrs.items():
                    attrs += " %s=%s"%(key,value)
                attrs += "/>\n"
                if verbose:
                    print "Extracting:",attrs
                page = page[:music_info.start()] + page[music_info_end:]
                page = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n<Document>\n" + attrs + page + "</Document>\n"
                page = stripmacros(page)
                codecs.open(os.path.join(path,"%08d.xml"%(wiki_text_n)),"w","utf-8","replace").write(page)
    except Exception, e:  
        if verbose:
            print "Error at %d was %s"%(wiki_text_n,str(e))
        codecs.open(os.path.join(path,"%08d.raw"%(wiki_text_n)),"w","utf-8","replace").write(wiki_text)
Example #13
    def _process_page(self, elem):
        """
        Gather frequency distribution of a page,
        category names, and linked page names.
        """

        # Get the text we need.
        #id          = int(self._find(elem, 'id').text)
        title = self._find(elem, 'title').text
        #datetime    = self._find(elem, 'revision', 'timestamp').text
        text = self._find(elem, 'revision', 'text').text
        redirect = self._find(elem, 'redirect')

        # `title` should be the canonical title, i.e. the 'official'
        # title of a page. If the page redirects to another (the canonical
        # page), the <redirect> elem contains the canonical title to which
        # the page redirects.
        if redirect is not None:
            title = redirect.attrib.get('title')

        # Extract certain elements.
        # pagelinks is a list of linked page titles.
        # categories is a list of this page's category titles.
        result = parse_txt(text)
        pagelinks = [
            pagelink.full_target
            for pagelink in result.find(parser.ArticleLink)
        ]

        # Have to trim the beginning of Category links.
        # So 'Category:Anarchism' becomes just 'Anarchism'.
        cstart = len('Category:')
        categories = [
            category.full_target[cstart:]
            for category in result.find(parser.CategoryLink)
        ]

        # Build the bag of words representation of the document.
        clean_text = self._clean(text)
        return clean_text
Example #14
    def get_random_image(self):
        global random_image_updated
        global random_image
        now = time.time()
        d = now - random_image_updated
        if d > 60: # Refresh every minute.
            i = 0
            media = []
            while len(media) == 0 and i<100:
                url = "https://{}.wikipedia.org/wiki/Special:Random?action=raw".format(self.wiki_version)
                response = requests.get(url, timeout=2.0)
                markup = response.text
                topic = urlparse.parse_qs(urlparse.urlparse(response.url).query)["title"]
                # print markup
                article = compat.parse_txt(markup)
                self.depth_find_media(article, topic, media)
                i+=1
        
            if len(media) > 0:
                random_image = media[0]
                random_image_updated = now

        return random_image
Example #15
        print "Fail!"
        sys.exit(2)

    raw = wc.read()
    rdata = json.loads(raw)
    wc.close()

    page = rdata['query']['pages'].itervalues().next()
    if not page:
        print "NO page found"
        sys.exit(3)

    revision = page['revisions'][0]
    if not revision:
        print "NO revision found"
        sys.exit(4)

    content = revision[str(revision.keys()[0])]
    parsed = compat.parse_txt(content)
    tables = parsed.find(parser.Table)

    if not tables:
        print "Table not found"
        sys.exit(5)
    table = tables[0]

    for row in table.children:
        cells = row.find(parser.Cell)
        print cells[0].asText().replace("}}", "").replace("{{", "").strip() + \
        " || " + cells[1].asText().strip() + " || " + cells[2].asText().strip() \
        + " || " + cells[3].asText().strip()
Example #16
def parseString(title=None,
                raw=None,
                wikidb=None,
                revision=None,
                lang=None,
                magicwords=None,
                expandTemplates=True):
    """parse article with title from raw mediawiki text"""

    uniquifier = None
    siteinfo = None
    assert title is not None, 'no title given'
    if raw is None:
        page = wikidb.normalize_and_get_page(title, 0)
        if page:
            raw = page.rawtext
        else:
            raw = None

        assert raw is not None, "cannot get article %r" % (title, )
    input = raw
    te = None
    if wikidb:
        if expandTemplates:
            te = expander.Expander(raw, pagename=title, wikidb=wikidb)
            input = te.expandTemplates(True)
            uniquifier = te.uniquifier
        if hasattr(wikidb, 'get_siteinfo'):
            siteinfo = wikidb.get_siteinfo()

        src = None
        if hasattr(wikidb, 'getSource'):
            src = wikidb.getSource(title, revision=revision)
            assert not isinstance(src, dict)

        if not src:
            src = metabook.source()

        if lang is None:
            lang = src.language
        if magicwords is None:
            if siteinfo is not None and 'magicwords' in siteinfo:
                magicwords = siteinfo['magicwords']
            else:
                magicwords = src.get('magicwords')

    if siteinfo is None:
        nshandler = nshandling.get_nshandler_for_lang(lang)
    else:
        nshandler = nshandling.nshandler(siteinfo)
    a = compat.parse_txt(input,
                         title=title,
                         wikidb=wikidb,
                         nshandler=nshandler,
                         lang=lang,
                         magicwords=magicwords,
                         uniquifier=uniquifier,
                         expander=te)

    a.caption = title
    if te and te.magic_displaytitle:
        a.caption = te.magic_displaytitle

    from mwlib.old_uparser import postprocessors
    for x in postprocessors:
        x(a, title=title, revision=revision, wikidb=wikidb, lang=lang)

    return a
Example #17
def simpleparse(raw,lang=None):    # !!! USE FOR DEBUGGING ONLY !!! does not use post processors
    a=compat.parse_txt(raw,lang=lang)
    core.show(a)
    return a
Example #18
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import time

s = unicode(open(sys.argv[1], "rb").read(), "utf-8")

from mwlib import uparser, advtree, treecleaner
from mwlib.refine import compat

stime = time.time()
r = compat.parse_txt(s)
print "parse:", time.time() - stime

stime = time.time()
advtree.buildAdvancedTree(r)
print "tree", time.time() - stime

stime = time.time()
tc = treecleaner.TreeCleaner(r)
tc.cleanAll()
print "clean:", time.time() - stime