def autodiscovery(document, url):
    """If the given url refers an actual feed, it returns the given url
    without any change.

    If the given url is a url of an ordinary web page
    (i.e. :mimetype:`text/html`), it finds the urls of the corresponding
    feed.  It returns feed urls in feed types' lexicographical order.

    If autodiscovery failed, it raises :exc:`FeedUrlNotFoundError`.

    :param document: html, or xml strings
    :type document: :class:`str`
    :param url: the url used to retrieve the ``document``.
                if feed url is in html and represented in relative url,
                it will be rebuilt on top of the ``url``
    :type url: :class:`str`
    :returns: list of :class:`FeedLink` objects
    :rtype: :class:`collections.MutableSequence`

    """
    document = text(document)
    document_type = get_format(document)
    if document_type is not None:
        # The document itself is already a feed; return its url as-is.
        return [FeedLink(TYPE_TABLE[document_type], url)]
    feed_links = AutoDiscovery().find_feed_url(document)
    if not feed_links:
        raise FeedUrlNotFoundError('Cannot find feed url')
    # Resolve every link against the page url.  The previous code only
    # handled urls starting with '/', leaving other relative forms
    # (e.g. 'atom.xml', '../feed') unresolved; urljoin covers all of
    # them and returns absolute urls unchanged.  Building a new list
    # also avoids the old index()-based in-place replacement, which
    # mutated the wrong element when duplicate links were present.
    return [FeedLink(link.type, urlparse.urljoin(url, link.url))
            for link in feed_links]
def string_chunks(consume_log, *chunks):
    """Iterate the given chunks of a text with logging consumed offsets
    to test laziness of the parser.  If an argument is a list (that
    consists of a string) it's treated as offset tagging.

    """
    last = len(chunks) - 1
    for index, piece in enumerate(chunks):
        # Offset tags are logged alongside the chunk that precedes
        # them, never yielded on their own.
        if type(piece) is list:
            continue
        piece = text(piece)
        if index < last and type(chunks[index + 1]) is list:
            consume_log.append(chunks[index + 1][0])
        if not isinstance(piece, binary_type):
            # In IronPython str.encode() returns str instead of bytes,
            # and bytes(str, encoding) returns bytes.
            piece = binary_type(piece, 'utf-8')
        yield piece