def get_authors_from_template_args(template):
    """Derive a list of author names from the arguments of a template.

    The template arguments are obtained with ``get_template_args`` using
    the ``expander`` found in the surrounding scope.

    Lookup order:

    1. If the ``Author`` argument is non-empty, it is parsed and its
       stripped display text is returned as a one-element list, provided
       that text is not empty.
    2. Otherwise, if ``args.args`` is non-empty, the values looked up by
       index (presumably the positional template arguments) are joined
       with newlines and passed to ``getUserLinks``.
    3. If neither applies, an empty list is returned.

    @param template: template as accepted by ``get_template_args``
    @returns: list of author names (possibly empty)
    """
    args = get_template_args(template, expander)

    author_arg = args.get("Author", None)
    if author_arg:
        # NOTE: the getUserLinks() lookup on the Author argument is disabled
        # in this variant; only the rendered display text is used.
        node = uparser.parseString("", raw=args["Author"], wikidb=wikidb)
        advtree.extendClasses(node)
        txt = node.getAllDisplayText().strip()
        if txt:
            return [txt]

    if not args.args:
        return []

    # One value per entry in args.args, fetched by index with an empty
    # unicode string as default.
    indexed_values = [args.get(idx, u"") for idx in range(len(args.args))]
    return getUserLinks("\n".join(indexed_values))
def get_authors_from_template_args(template):
    """Return author names found in the arguments of ``template``.

    Uses ``expander``, ``wikidb`` and ``getUserLinks`` from the
    surrounding scope.

    A non-empty ``Author`` argument is parsed and, if its stripped display
    text is non-empty, that text is returned as a single-element list.
    Failing that, the values of ``args.args`` fetched by index (presumably
    the positional arguments) are joined with newlines and scanned with
    ``getUserLinks``. When there are no such arguments the result is ``[]``.

    @param template: template as accepted by ``get_template_args``
    @returns: list of author names (possibly empty)
    """
    template_args = get_template_args(template, expander)

    if template_args.get('Author', None):
        # The user-link lookup on the Author argument is currently disabled;
        # the rendered display text is used instead.
        parsed = uparser.parseString('', raw=template_args['Author'], wikidb=wikidb)
        advtree.extendClasses(parsed)
        display_text = parsed.getAllDisplayText().strip()
        if display_text:
            return [display_text]

    if template_args.args:
        values = []
        for position in range(len(template_args.args)):
            values.append(template_args.get(position, u''))
        return getUserLinks('\n'.join(values))

    return []
logging.basicConfig()

# The script takes exactly one command-line argument: the wikipedia page
# title of the date whose events should be extracted.
if len(sys.argv) != 2:
    print("Please provide exactly one argument - the title page of the date to extract events from in wikipedia (e.g. January_1)")
    sys.exit(1)

# Download the raw wikitext of the page and parse it.
title = sys.argv[1]
url = "http://en.wikipedia.org/w/index.php?action=raw&title=%s" % title
page = urllib2.urlopen(url).read().decode("utf-8")
tree = parseString(title, page)

# Attach the advtree helper methods to the parsed tree.
advtree.extendClasses(tree)

# Walk the tree and collect the entries of the sections of interest,
# keyed by the lower-cased, UTF-8 encoded section heading.
results = {}
for node in tree.allchildren():
    if isinstance(node, Section):
        heading = node.children[0].asText().strip()
        if heading in (u"Events", u"Births", u"Deaths"):
            section = heading.lower().encode("utf-8")
            results[section] = []
            # First child of every Item found below the section body.
            items = [x.children[0]
                     for x in node.children[1].allchildren()
                     if isinstance(x, Item)]
            for item in items:
                # Display text of the entry, from which year and text are
                # extracted.
                raw = item.getAllDisplayText().strip()
def getContributors(self, name, wikidb=None):
    """Return list of image contributors

    The contributors are determined in this order:

    1. user-namespace links inside the ``Author`` field of the image
       page's ``Information`` template,
    2. the stripped display text of that ``Author`` field (if non-empty),
    3. user-namespace links anywhere in the image description page,
    4. ``wikidb.getAuthors()`` for the image page.

    @param name: image name without namespace (e.g. without "Image:")
    @type name: unicode

    @param wikidb: WikiDB instance (optional); only used when no API
        helper can be derived from the description URL
    @type wikidb: object

    @returns: list of contributors, or None if the description URL, a
        usable wikidb or the raw image page is unavailable
    @rtype: [unicode] or None
    """
    desc_url = self.getDescriptionURL(name)
    if desc_url is None:
        return None

    # Note: We're always guessing the API helper b/c we'll get problems when
    # fetching from en.wp if we should've used commons.wikimedia.org instead.
    # A passed wikidb is only used as a fallback here.
    api_helper = get_api_helper(desc_url)
    if api_helper is not None:
        wikidb = WikiDB(api_helper=api_helper)
    elif wikidb is None:
        return None

    title = 'Image:%s' % name
    raw = wikidb.getRawArticle(title)
    if not raw:
        return None

    expander = Expander(u'', title, wikidb)

    def getUserLinks(raw):
        """Return the sorted, de-duplicated targets of user links in raw."""
        def isUserLink(node):
            return (isinstance(node, parser.NamespaceLink)
                    and node.namespace == namespace.NS_USER)

        tree = uparser.parseString(title, raw=raw, wikidb=wikidb)
        return sorted(set(u.target for u in tree.filter(isUserLink)))

    template = find_template(raw, 'Information')
    if template is not None:
        author = get_template_args(template, expander).get('Author', '').strip()
        if author:
            # getUserLinks() already returns a sorted list without
            # duplicates, so the result can be returned as is.
            users = getUserLinks(author)
            if users:
                return users

            node = uparser.parseString('', raw=author, wikidb=wikidb)
            advtree.extendClasses(node)
            # Only use the display text if something is left after
            # stripping; otherwise fall through to the page-level lookup
            # instead of reporting an empty contributor name.
            txt = node.getAllDisplayText().strip()
            if txt:
                return [txt]

    users = getUserLinks(raw)
    if users:
        return users

    return wikidb.getAuthors(title)
# Check that a date has been specified if not len(sys.argv) == 2: print 'Please provide exactly one argument - the title page of the date to extract events from in wikipedia (e.g. January_1)' sys.exit(1) # Gather content from wikipedia title = sys.argv[1] page = urllib2.urlopen( 'http://en.wikipedia.org/w/index.php?action=raw&title=%s' % title).read().decode('utf-8') tree = parseString(title, page) # Add some utility methods to the item advtree.extendClasses(tree) # Iterate through the items listed in the wiki page results = {} for node in tree.allchildren(): if isinstance(node, Section) and node.children[0].asText().strip() in ( u'Events', u'Births', u'Deaths'): section = node.children[0].asText().strip().lower().encode('utf-8') results[section] = [] for item in [ x.children[0] for x in node.children[1].allchildren() if isinstance(x, Item)