Example #1
0
    def getImageTemplates(self, name, wikidb=None):
        """Return the templates used on the description page of image *name*.

        The description page is obtained from
        ``self.get_image_description_page(name)``.  When that returns ``None``
        a notice is printed and an empty list is returned; otherwise the
        result of ``get_templates`` on the page's raw text is returned.

        ``wikidb`` is accepted for interface compatibility but is not used.
        """
        page = self.get_image_description_page(name)
        if page is None:
            # The parenthesised single-argument form prints exactly the same
            # text under Python 2 and is also valid syntax under Python 3.
            # Wrapping ``name`` in a tuple keeps the formatting safe even if
            # a tuple is ever passed in.
            print('no such image: %r' % (name,))
            return []

        # Imported only once a page exists; the miss path does not need the
        # template expander.
        from mwlib.expander import get_templates
        return get_templates(page.rawtext)
Example #2
0
    def getImageTemplates(self, name, wikidb=None):
        """Look up the templates referenced by an image's description page.

        Returns whatever ``get_templates`` yields for the raw text of the page
        returned by ``self.get_image_description_page(name)``.  If there is no
        such page, a notice is printed and ``[]`` is returned.

        ``wikidb`` is part of the signature but is not used here.
        """
        page = self.get_image_description_page(name)
        if page is None:
            # A parenthesised expression keeps the output identical on
            # Python 2 while no longer being a syntax error on Python 3; the
            # one-element tuple guards against ``name`` itself being a tuple.
            print("no such image: %r" % (name,))
            return []

        # Deferred so that a missing page never triggers the expander import.
        from mwlib.expander import get_templates
        return get_templates(page.rawtext)
Example #3
0
    def process_templates(self, actual_title, s):
        """Write the template section for one page and return its templates.

        A header line built from ``WikipediaParser.page_separator`` and
        *actual_title* is written (UTF-8 encoded) to ``self.templates_f``.
        ``WikitextToConll.write_templates_into_f`` is then called with *s* and
        that file; if it returns a false value, a message naming the page is
        written to standard error.

        Returns the result of ``get_templates(s)``.
        """
        from mwlib.expander import get_templates

        header = u"{0} {1}\n".format(WikipediaParser.page_separator, actual_title)
        self.templates_f.write(header.encode("utf-8"))

        succeeded = WikitextToConll.write_templates_into_f(s, self.templates_f)
        if not succeeded:
            complaint = u"Problems with templates and mwparser: {0}\n".format(actual_title)
            sys.stderr.write(complaint.encode("utf-8"))

        return get_templates(s)
Example #4
0
    def process_templates(self, actual_title, s):
        """Emit the templates of a page into the templates output file.

        First writes a separator line (``WikipediaParser.page_separator``, a
        space, then *actual_title*) to ``self.templates_f``, encoded as UTF-8.
        Then hands *s* and the file to
        ``WikitextToConll.write_templates_into_f``; a false return value is
        reported on standard error together with the page title.

        The return value is ``get_templates(s)``.
        """
        from mwlib.expander import get_templates

        separator_line = u"{0} {1}\n".format(
            WikipediaParser.page_separator, actual_title)
        self.templates_f.write(separator_line.encode("utf-8"))

        ok = WikitextToConll.write_templates_into_f(s, self.templates_f)
        if not ok:
            message = u"Problems with templates and mwparser: {0}\n".format(
                actual_title)
            sys.stderr.write(message.encode("utf-8"))

        return get_templates(s)
Example #5
0
def getContributorsFromInformationTemplate(raw, title, wikidb):
    """Return a list of contributors extracted from the wikitext *raw*.

    Sources are tried in this order, and the first non-empty result wins:

    1. the ``Information`` template, if ``find_template`` locates one;
    2. every template named by ``get_templates(raw)``, results concatenated;
    3. user-namespace links found in *raw* itself.

    For a single template, a non-empty ``Author`` argument is preferred:
    user links inside it, or failing that its stripped display text.  If
    that yields nothing, user links in the positional arguments are used.
    """
    from mwlib.expander import find_template, get_templates, get_template_args, Expander
    from mwlib import uparser, parser, advtree

    expander = Expander(u'', title, wikidb)

    def is_user_link(node):
        return isinstance(node, parser.NamespaceLink) and node.namespace == 2 # NS_USER

    def user_links_in(text):
        # Parse under the article's own title; de-duplicate and sort targets.
        tree = uparser.parseString(title, raw=text, wikidb=wikidb)
        return sorted(set(link.target for link in tree.filter(is_user_link)))

    def authors_of(template):
        args = get_template_args(template, expander)

        author = args.get('Author', None)
        if author:
            links = user_links_in(author)
            if links:
                return links
            # No user links in the Author argument: use its display text.
            tree = uparser.parseString('', raw=args['Author'], wikidb=wikidb)
            advtree.extendClasses(tree)
            text = tree.getAllDisplayText().strip()
            if text:
                return [text]

        if not args.args:
            return []
        positional = [args.get(i, u'') for i in range(len(args.args))]
        return user_links_in('\n'.join(positional))

    info = find_template(raw, 'Information')
    if info is not None:
        found = authors_of(info)
        if found:
            return found

    collected = []
    for template_name in get_templates(raw):
        candidate = find_template(raw, template_name)
        if candidate is not None:
            collected.extend(authors_of(candidate))

    return collected or user_links_in(raw)
Example #6
0
def getContributorsFromInformationTemplate(raw, title, wikidb):
    """Return a list of contributors extracted from the wikitext *raw*.

    The wikitext is parsed once up front and every ``find_template`` call
    receives a copy of that parsed list.  Sources are tried in order, and the
    first non-empty result is returned:

    1. the ``Information`` template;
    2. all templates named by ``get_templates(raw)``, results concatenated;
    3. user-namespace links found in *raw* itself.
    """
    from mwlib.expander import find_template, get_templates, get_template_args, Expander
    from mwlib import uparser, parser, advtree
    from mwlib.templ.parser import parse

    def is_user_link(node):
        return isinstance(node, parser.NamespaceLink) and node.namespace == 2 # NS_USER

    def collect_user_links(text):
        # Unique link targets, sorted, parsed under the article's title.
        tree = uparser.parseString(title, raw=text, wikidb=wikidb)
        return sorted(set(link.target for link in tree.filter(is_user_link)))

    def extract_authors(template):
        args = get_template_args(template, expander)

        if args.get('Author', None):
            # NOTE: user links inside the Author argument are not looked up
            # here; only the argument's display text is used.
            tree = uparser.parseString('', raw=args['Author'], wikidb=wikidb)
            advtree.extendClasses(tree)
            text = tree.getAllDisplayText().strip()
            if text:
                return [text]

        if not args.args:
            return []
        positional = [args.get(i, u'') for i in range(len(args.args))]
        return collect_user_links('\n'.join(positional))

    expander = Expander(u'', title, wikidb)
    parsed_raw = [parse(raw, replace_tags=expander.replace_tags)]

    def lookup(template_name):
        # Each lookup gets its own copy of the pre-parsed list.
        return find_template(None, template_name, parsed_raw[:])

    info = lookup('Information')
    if info is not None:
        found = extract_authors(info)
        if found:
            return found

    collected = []
    for template_name in get_templates(raw):
        candidate = lookup(template_name)
        if candidate is not None:
            collected.extend(extract_authors(candidate))

    return collected or collect_user_links(raw)
Example #7
0
def getContributorsFromInformationTemplate(raw, title, wikidb):
    """Work out who contributed to a page from its wikitext *raw*.

    Returns a list.  The ``Information`` template is consulted first; if it
    gives no authors, the authors of all templates listed by
    ``get_templates(raw)`` are gathered; if that is empty too, the sorted,
    de-duplicated user-namespace link targets in *raw* are returned.
    """
    from mwlib.expander import find_template, get_templates, get_template_args, Expander
    from mwlib import uparser, parser, advtree
    from mwlib.templ.parser import parse

    def points_to_user(node):
        return isinstance(node, parser.NamespaceLink) and node.namespace == 2  # NS_USER

    def user_targets(wikitext):
        parsed = uparser.parseString(title, raw=wikitext, wikidb=wikidb)
        targets = set(node.target for node in parsed.filter(points_to_user))
        return sorted(targets)

    def template_authors(template):
        args = get_template_args(template, expander)

        author_value = args.get("Author", None)
        if author_value:
            # NOTE: user links in the Author argument are not extracted at
            # this point; the display text of the argument is used instead.
            author_tree = uparser.parseString("", raw=args["Author"], wikidb=wikidb)
            advtree.extendClasses(author_tree)
            display = author_tree.getAllDisplayText().strip()
            if display:
                return [display]

        if args.args:
            joined = "\n".join([args.get(pos, u"") for pos in range(len(args.args))])
            return user_targets(joined)

        return []

    expander = Expander(u"", title, wikidb)
    # Parsed once; find_template is always given a fresh copy of this list.
    parsed_raw = [parse(raw, replace_tags=expander.replace_tags)]

    information = find_template(None, "Information", list(parsed_raw))
    if information is not None:
        from_information = template_authors(information)
        if from_information:
            return from_information

    gathered = []
    for name in get_templates(raw):
        match = find_template(None, name, list(parsed_raw))
        if match is None:
            continue
        gathered.extend(template_authors(match))

    if gathered:
        return gathered
    return user_targets(raw)
Example #8
0
 def getImageTemplatesAndArgs(self, name, wikidb=None):
     """Return template names of an image description page plus some of
     their arguments.

     The page comes from ``self.get_image_description_page(name)``; when it
     is ``None`` an empty list is returned.  Otherwise the collection
     returned by ``get_templates`` (presumably a set, since ``update`` is
     called on it) is extended with every template argument that is a
     string longer than three characters and contains no space, and that
     collection is returned.

     ``wikidb`` is accepted but not used; an empty ``DictDB`` backs the
     expander.
     """
     page = self.get_image_description_page(name)
     if page is None:
         return []

     from mwlib.expander import find_template, get_templates
     from mwlib.templ.evaluate import Expander
     from mwlib.templ.misc import DictDB
     from mwlib.templ.parser import parse

     templates = get_templates(page.rawtext)
     e = Expander('', wikidb=DictDB())
     # avoid parsing with every call to find_template
     parsed_raw = [parse(page.rawtext, replace_tags=e.replace_tags)]

     args = set()
     for t in templates:
         tmpl = find_template(None, t, parsed_raw[:])
         if tmpl is None:
             # find_template may not locate the template; skip it instead
             # of failing on the subscript below.
             continue
         for arg in tmpl[1]:
             if isinstance(arg, basestring) and len(arg) > 3 and ' ' not in arg:
                 args.add(arg)

     templates.update(args)
     return templates
Example #9
0
 def doit(source, expected):
     """Assert that ``expander.get_templates(source, u'')`` equals *expected*."""
     found = expander.get_templates(source, u'')
     assert found == expected, "expected %r, got %r" % (expected, found)
    def join(self):
        """Finish ZIP file by writing the actual content

        Runs the phases in a fixed order, calling ``self.jobsched.join()``
        after each fetch phase:

        1. fetch every article listed in ``self.article_jobs``;
        2. collect the template names used by the fetched articles and fetch
           each (name, wikidb) pair once;
        3. parse every fetched article;
        4. add every image listed in ``self.image_infos``;
        5. store articles, templates, sources and images as ``content.json``.

        When ``self.status`` is set it is called with progress messages, and
        sub-ranges of it are handed out per phase.
        """
        
        if self.status:
            self.status(status=u'fetching articles')
            # Split the overall progress range between the four phases.
            self.fetcharticle_status = self.status.getSubRange(0, 20)
            self.fetchtemplate_status = self.status.getSubRange(21, 40)
            self.parse_status = self.status.getSubRange(41, 60)
            self.fetchimages_status = self.status.getSubRange(61, 100)
        else:
            # No status reporting: all per-phase reporters are disabled.
            self.fetcharticle_status = self.fetchtemplate_status = self.parse_status = self.fetchimages_status = None
        # Phase 1: request every article.
        for info in self.article_jobs:
            self.fetchArticle(
                title=info['title'],
                revision=info['revision'],
                wikidb=info['wikidb'],
            )
        # presumably blocks until the scheduled fetch jobs are done -- TODO confirm
        self.jobsched.join()
        if self.status:
            self.status(status=u'fetching templates', article='')
        # Phase 2: gather (template name, wikidb) pairs; the set removes
        # duplicates across articles.
        templates = set()
        for info in self.article_jobs:
            try:
                raw = self.articles[info['title']]['content']
            except KeyError:
                # No stored content for this article: skip it.
                continue

            for name in expander.get_templates(raw, info['title']):
                templates.add((name, info['wikidb']))
                            
        self.num_templates = len(templates)
        self.template_count = 0 
        for title, wikidb in templates:
            self.fetchTemplate(title, wikidb)
        self.jobsched.join()
        
        # Phase 3: parse each article that has content.
        if self.status:
            self.status(status=u'parsing articles')
        n = len(self.article_jobs)
        for i, info in enumerate(self.article_jobs):
            try:
                raw = self.articles[info['title']]['content']
            except KeyError:
                # Same skip rule as in the template-collection loop.
                continue
            if self.parse_status:
                self.parse_status(article=info['title'])
            self.parseArticle(
                title=info['title'],
                revision=info['revision'],
                raw=raw,
                wikidb=info['wikidb'],
                imagedb=info['imagedb'],
            )
            if self.parse_status:
                # Percentage based on the article's index in article_jobs.
                self.parse_status(progress=i*100/n)
        # Phase 4: add the images.
        # NOTE(review): image_infos is presumably filled while parsing -- confirm.
        if self.status:
            self.status(status=u'fetching images', article='')
        self.num_images = len(self.image_infos)
        self.image_count = 0
        for i in self.image_infos:
            self.addImage(*i)
        self.jobsched.join()
      
        # Phase 5: serialise the collected data into the archive.
        self.addObject('content.json', json.dumps(dict(
            articles=self.articles,
            templates=self.templates,
            sources=self.sources,
            images=self.images,
        )))