def _parse(self, input):
    """Parse ``input`` into a cleaned document with its base href resolved.

    ``build_doc`` is expected to hand back a ``(document, encoding)`` pair;
    the encoding is stored on ``self.encoding``. The document is passed
    through ``html_cleaner.clean_html`` before any link handling.

    :param input: raw page content handed straight to ``build_doc``
    :return: the cleaned document
    """
    parsed, self.encoding = build_doc(input)
    cleaned = html_cleaner.clean_html(parsed)

    page_url = self.options.get("url", None)
    if not page_url:
        # No url option configured: only resolve the document's own base href.
        cleaned.resolve_base_href()
        return cleaned

    # A url option is available, so links are made absolute against it.
    cleaned.make_links_absolute(page_url, resolve_base_href=True)
    return cleaned
def parse(input, url):
    """Build and clean a document from ``input``, then fix up its links.

    :param input: raw page content passed to ``build_doc``
    :param url: base URL for making links absolute; when falsy only the
        document's own base href is resolved
    :return: the cleaned document
    """
    logging.debug('parse url: %s', url)

    cleaned = html_cleaner.clean_html(build_doc(input))

    if not url:
        # Nothing to resolve against except the document itself.
        cleaned.resolve_base_href()
        return cleaned

    cleaned.make_links_absolute(url, resolve_base_href=True)
    return cleaned
def _parse(self, input):
    """Turn ``input`` into a cleaned document and normalise its links.

    The ``(document, encoding)`` pair returned by ``build_doc`` is split:
    the encoding is kept on ``self.encoding`` and the document is run
    through ``html_cleaner.clean_html``.

    :param input: raw page content handed to ``build_doc``
    :return: the cleaned document
    """
    tree, encoding = build_doc(input)
    self.encoding = encoding

    tree = html_cleaner.clean_html(tree)

    # The optional 'url' option decides how links are resolved.
    link_base = self.options.get('url', None)
    if link_base:
        tree.make_links_absolute(link_base, resolve_base_href=True)
    else:
        tree.resolve_base_href()

    return tree
def _parse(self, input):
    """Build a cleaned document from ``input`` and resolve its links.

    In this variant ``build_doc`` returns the document directly, and the
    base URL is read with ``self.options['url']`` (so the ``'url'`` key is
    looked up by subscript rather than with a default).

    :param input: raw page content passed to ``build_doc``
    :return: the cleaned document
    """
    cleaned = html_cleaner.clean_html(build_doc(input))

    link_base = self.options['url']
    if not link_base:
        # Falsy url option: fall back to the document's own base href.
        cleaned.resolve_base_href()
        return cleaned

    cleaned.make_links_absolute(link_base, resolve_base_href=True)
    return cleaned
def __init__(self, url, text=None, page=1, min_article_length=250, min_article_percentage=0.075):
    """
    Store the article settings, obtain the page text, and parse it.

    :param url: the url of the document
    :param text: the page content as a string; when falsy the page is
        downloaded from ``url`` with ``requests.get``
    :param page: set this when the document is one of a series of pages
        making up an article
    :param min_article_length: anything shorter than this many characters
        is not treated as an article
    :param min_article_percentage: an article must make up at least this
        share of the text on the page
    """
    self.url = url
    self.page = page
    self._article = None
    self.min_article_length = min_article_length
    self.min_article_percentage = min_article_percentage

    # Only hit the network when the caller did not supply usable text.
    self.text = text if text else requests.get(url).text

    # Parse the HTML, then clean out the elements the cleaner is configured
    # to drop (e.g., head, script, form) and make every link absolute.
    parsed, self.encoding = build_doc(self.text)
    cleaned = html_cleaner.clean_html(parsed)
    cleaned.make_links_absolute(self.url, resolve_base_href=True)
    self.html = cleaned