Beispiel #1
0
 def _parse(self, input):
     """Build, clean and link-normalise a document from ``input``.

     ``build_doc`` is expected to return a pair; the first item is the
     parsed document and the second is stored on ``self.encoding``.
     The document is then passed through ``html_cleaner.clean_html``.

     If ``self.options`` carries a truthy ``"url"`` entry, links are made
     absolute against it; otherwise only the document's own base href is
     resolved.

     :param input: raw markup handed straight to ``build_doc``
     :return: the cleaned document
     """
     parsed, self.encoding = build_doc(input)
     cleaned = html_cleaner.clean_html(parsed)

     page_url = self.options.get("url", None)
     if not page_url:
         # No external URL known: fall back to the document's own <base>.
         cleaned.resolve_base_href()
         return cleaned

     cleaned.make_links_absolute(page_url, resolve_base_href=True)
     return cleaned
Beispiel #2
0
def parse(input, url):
    """Parse ``input`` into a cleaned document with normalised links.

    The value returned by ``build_doc(input)`` is passed directly to
    ``html_cleaner.clean_html``. When ``url`` is truthy, links are made
    absolute against it; otherwise only the document's own base href is
    resolved.

    :param input: raw markup handed straight to ``build_doc``
    :param url: URL used to absolutise links; may be falsy
    :return: the cleaned document
    """
    logging.debug('parse url: %s', url)
    cleaned = html_cleaner.clean_html(build_doc(input))

    if not url:
        # No URL supplied: only honour the document's own <base>.
        cleaned.resolve_base_href()
    else:
        cleaned.make_links_absolute(url, resolve_base_href=True)
    return cleaned
Beispiel #3
0
 def _parse(self, input):
     """Turn ``input`` into a cleaned document and fix up its links.

     The pair returned by ``build_doc`` is unpacked into the parsed tree
     and ``self.encoding``. The tree is cleaned with
     ``html_cleaner.clean_html`` before link handling.

     :param input: raw markup handed straight to ``build_doc``
     :return: the cleaned document, with links made absolute when
         ``self.options`` has a truthy ``'url'``, else with only its base
         href resolved
     """
     tree, self.encoding = build_doc(input)
     tree = html_cleaner.clean_html(tree)

     target_url = self.options.get('url', None)
     if not target_url:
         # Without a URL we can only apply the document's own <base>.
         tree.resolve_base_href()
     else:
         tree.make_links_absolute(target_url, resolve_base_href=True)
     return tree
	def _parse(self, input):
		"""Build and clean a document from ``input``, then normalise links.

		Unlike tuple-returning variants, the result of ``build_doc`` is used
		here directly as the document.

		:param input: raw markup handed straight to ``build_doc``
		:return: the cleaned document
		"""
		cleaned = html_cleaner.clean_html(build_doc(input))

		# NOTE(review): subscript access raises KeyError on a plain dict
		# without 'url'; presumably options supplies a default - confirm.
		page_url = self.options['url']
		if not page_url:
			# No URL configured: only resolve the document's own <base>.
			cleaned.resolve_base_href()
			return cleaned

		cleaned.make_links_absolute(page_url, resolve_base_href=True)
		return cleaned
Beispiel #5
0
def parse(input, url):
    """Return a cleaned document built from ``input``.

    Logs the URL at debug level, builds the document with ``build_doc``,
    cleans it with ``html_cleaner.clean_html`` and then normalises links:
    absolute against ``url`` when it is truthy, otherwise only the
    document's own base href is resolved.

    :param input: raw markup handed straight to ``build_doc``
    :param url: URL used to absolutise links; may be falsy
    :return: the cleaned document
    """
    logging.debug('parse url: %s', url)

    built = build_doc(input)
    result = html_cleaner.clean_html(built)

    if not url:
        result.resolve_base_href()
        return result

    result.make_links_absolute(url, resolve_base_href=True)
    return result
    def __init__(self, url, text=None, page=1, min_article_length=250, min_article_percentage=0.075):
        """
        Store the settings, obtain the page text and parse it into ``self.html``.

        :param url: the url of the document
        :param text: optionally the string value of the page may be passed in;
            any falsy value (``None``, ``""``) causes the page to be downloaded
        :param page: if this is one in a series of documents in an article this should be set
        :param min_article_length: if an article is less than this number of characters it's not an article
        :param min_article_percentage: an article must be this % of the text on the page
        """
        self.url = url
        self.page = page
        self._article = None
        self.min_article_length = min_article_length
        self.min_article_percentage = min_article_percentage

        # Use the caller-supplied text when given, otherwise fetch it.
        # NOTE(review): the request is made without a timeout or status
        # check - confirm whether that is intended.
        self.text = text if text else requests.get(url).text

        # Parse the HTML, run it through html_cleaner (per the original
        # comment this drops elements such as head, script, form), then make
        # every link absolute relative to the page URL.
        parsed, self.encoding = build_doc(self.text)
        cleaned = html_cleaner.clean_html(parsed)
        cleaned.make_links_absolute(self.url, resolve_base_href=True)
        self.html = cleaned