def __init__(self, url=None, positive_keywords=None, negative_keywords=None,
             summary_sentences_qty=5, headers=None):
    """
    Initialization of the class. If url is set, it gets performed.

    :param url: web-page url of the document
    :param positive_keywords: list of keywords, which are likely to be seen in classes or ids of tags
    :param negative_keywords: list of keywords, which are unlikely to be seen in classes or ids of tags
    :param summary_sentences_qty: maximum quantity of summary sentences
    :param headers: custom headers for GET request to obtain web page of the article
    """
    # TODO: customizable redirects limit?
    self._article_extractor = ArticleExtractor(
        positive_keywords=positive_keywords,
        negative_keywords=negative_keywords)

    self.url = None            # source web-page url
    self.canonical_url = None  # canonical web-page url if present, otherwise same as url
    self.title = None          # document's title
    self.image_url = None      # document's image url
    self.language = None       # document's article language
    self.clean_html = None     # cleaned html of the article
    self.description = None    # summarized description (text only)
    self.error_msg = None      # error message

    self._source_html = None   # source html of the document (lxml doc)
    self._charset = None       # source html encoding
    # isinstance() is the idiomatic check and also accepts dict subclasses
    self._headers = headers if isinstance(headers, dict) else {}  # custom headers for GET request

    # summarized text sentences quantity; fall back to the default of 5 on
    # invalid input.  int() is required here: a bare assignment never raises,
    # which made the original except clause unreachable dead code.
    try:
        self._summary_sentences_qty = int(summary_sentences_qty)
    except (TypeError, ValueError):
        self._summary_sentences_qty = 5

    # perform the url if defined
    if url:
        self.perform_url(url)
def __init__(self, url=None, positive_keywords=None, negative_keywords=None,
             summary_sentences_qty=5, headers=None):
    """
    Initialization of the class. If url is set, it gets performed.

    :param url: web-page url of the document
    :param positive_keywords: list of keywords, which are likely to be seen in classes or ids of tags
    :param negative_keywords: list of keywords, which are unlikely to be seen in classes or ids of tags
    :param summary_sentences_qty: maximum quantity of summary sentences
    :param headers: custom headers for GET request to obtain web page of the article
    """
    # TODO: customizable redirects limit?
    self._article_extractor = ArticleExtractor(positive_keywords=positive_keywords,
                                               negative_keywords=negative_keywords)

    self.url = None            # source web-page url
    self.canonical_url = None  # canonical web-page url if present, otherwise same as url
    self.title = None          # document's title
    self.image_url = None      # document's image url
    self.language = None       # document's article language
    self.clean_html = None     # cleaned html of the article
    self.description = None    # summarized description (text only)
    self.error_msg = None      # error message

    self._source_html = None   # source html of the document (lxml doc)
    self._charset = None       # source html encoding
    # isinstance() instead of type() == dict: idiomatic, accepts subclasses
    self._headers = headers if isinstance(headers, dict) else {}  # custom headers for GET request

    # summarized text sentences quantity; coerce to int so the fallback is
    # reachable — the original bare assignment could never raise, making the
    # except clause dead code.
    try:
        self._summary_sentences_qty = int(summary_sentences_qty)
    except (TypeError, ValueError):
        self._summary_sentences_qty = 5

    # perform the url if defined
    if url:
        self.perform_url(url)
class Wanish():
    """Fetches a web page and extracts its article: title, image, language,
    cleaned HTML and a summarized description."""

    def __init__(self, url=None, positive_keywords=None, negative_keywords=None,
                 summary_sentences_qty=5, headers=None):
        """
        Initialization of the class. If url is set, it gets performed.

        :param url: web-page url of the document
        :param positive_keywords: list of keywords, which are likely to be seen in classes or ids of tags
        :param negative_keywords: list of keywords, which are unlikely to be seen in classes or ids of tags
        :param summary_sentences_qty: maximum quantity of summary sentences
        :param headers: custom headers for GET request to obtain web page of the article
        """
        self._article_extractor = ArticleExtractor(
            positive_keywords=positive_keywords,
            negative_keywords=negative_keywords)

        self.url = None          # source web-page url
        self.title = None        # document's title
        self.image_url = None    # document's image url
        self.language = None     # document's article language
        self.clean_html = None   # cleaned html of the article
        self.description = None  # summarized description (text only)
        self.error_msg = None    # error message

        self._source_html = None  # source html of the document (lxml doc)
        self._charset = None      # source html encoding
        # isinstance() instead of type() ==: idiomatic, accepts dict subclasses
        self._headers = headers if isinstance(headers, dict) else {}  # custom headers for GET request

        # summarized text sentences quantity; int() makes the except clause
        # reachable (a bare assignment never raises TypeError/ValueError)
        try:
            self._summary_sentences_qty = int(summary_sentences_qty)
        except (TypeError, ValueError):
            self._summary_sentences_qty = 5

        # perform the url if defined
        if url:
            self.perform_url(url)

    def perform_url(self, url):
        """
        Perform an article document by designated url

        :param url: web-page url of the document
        """
        self.url = url
        # reset all per-document state before a new fetch
        self.title = self.image_url = self.language = self.description = \
            self.clean_html = self.error_msg = self._charset = None

        if not self.url:
            self.error_msg = 'Empty or null URL to perform'
            return

        # get the page (bytecode)
        try:
            web_page = requests.get(self.url, headers=self._headers)

            # perform http status codes
            if web_page.status_code not in [200, 301, 302]:
                self.error_msg = 'HTTP Error. Status: %s' % web_page.status_code
                return

            self.url = web_page.url
            raw_html = web_page.content
            self._charset = get_encoding(raw_html)
            raw_html_str = raw_html.decode(self._charset)

            # getting and cleaning the document
            self._source_html = document_fromstring(raw_html_str)
            self._source_html = html_cleaner.clean_html(self._source_html)

            # making links absolute
            self._source_html.make_links_absolute(self.url, resolve_base_href=True)
        # Exception already covers ConnectionError/Timeout/TypeError, so the
        # original four-way tuple was redundant; behavior is unchanged
        except Exception as e:
            self.error_msg = str(e)
        finally:
            if self.error_msg:
                return

        if self._source_html is not None:
            # obtaining title
            self.title = shorten_title(self._source_html)

            # obtaining image url
            self.image_url = get_image_url(self._source_html, self.url)
            if self.image_url is not None:
                image_url_node = "<meta itemprop=\"image\" content=\"%s\">" % self.image_url
                image_url_img = "<img src=\"%s\" />" % self.image_url
            else:
                image_url_node = image_url_img = ""

            # clean html
            self.clean_html = self._article_extractor.get_clean_html(
                source_html=self._source_html)

            # summarized description, requires clean_html
            if self.clean_html:
                self.description, self.language = get_plain_text(
                    etree.XML(self.clean_html), self._summary_sentences_qty)
                # BUG FIX: interpolate the description into the meta tag — the
                # original left the bare "%s" placeholder in the output
                description_node = ("<meta name=\"description\" content=\"%s\">" % self.description
                                    if self.description else "")

                # filling the template
                self.clean_html = ARTICLE_TEMPLATE % {
                    'language': self.language,
                    'title': self.title,
                    'image_url_node': image_url_node,
                    'image_url_img': image_url_img,
                    'description_node': description_node,
                    'clean_html': self.clean_html
                }
class Wanish(object):
    """Fetches a web page and extracts its article: title, canonical url,
    image, language, cleaned HTML and a summarized description."""

    def __init__(self, url=None, positive_keywords=None, negative_keywords=None,
                 summary_sentences_qty=5, headers=None):
        """
        Initialization of the class. If url is set, it gets performed.

        :param url: web-page url of the document
        :param positive_keywords: list of keywords, which are likely to be seen in classes or ids of tags
        :param negative_keywords: list of keywords, which are unlikely to be seen in classes or ids of tags
        :param summary_sentences_qty: maximum quantity of summary sentences
        :param headers: custom headers for GET request to obtain web page of the article
        """
        self._article_extractor = ArticleExtractor(positive_keywords=positive_keywords,
                                                   negative_keywords=negative_keywords)

        self.url = None            # source web-page url
        self.canonical_url = None  # canonical web-page url if present, otherwise same as url
        self.title = None          # document's title
        self.image_url = None      # document's image url
        self.language = None       # document's article language
        self.clean_html = None     # cleaned html of the article
        self.description = None    # summarized description (text only)
        self.error_msg = None      # error message

        self._source_html = None   # source html of the document (lxml doc)
        self._charset = None       # source html encoding
        # isinstance() instead of type() ==: idiomatic, accepts dict subclasses
        self._headers = headers if isinstance(headers, dict) else {}  # custom headers for GET request

        # summarized text sentences quantity; int() makes the except clause
        # reachable (a bare assignment never raises TypeError/ValueError)
        try:
            self._summary_sentences_qty = int(summary_sentences_qty)
        except (TypeError, ValueError):
            self._summary_sentences_qty = 5

        # perform the url if defined
        if url:
            self.perform_url(url)

    def perform_url(self, url):
        """
        Perform an article document by designated url

        :param url: web-page url of the document
        """
        self.url = url
        # reset all per-document state before a new fetch
        self.title = self.image_url = self.language = self.description = \
            self.clean_html = self.error_msg = self._charset = None

        if not self.url:
            self.error_msg = 'Empty or null URL to perform'
            return

        # get the page (bytecode)
        try:
            web_page = requests.get(self.url, headers=self._headers)

            # perform http status codes
            if web_page.status_code not in [200, 301, 302]:
                self.error_msg = 'HTTP Error. Status: %s' % web_page.status_code
                return

            self.url = web_page.url
            raw_html = web_page.content
            self._charset = get_encoding(raw_html)
            our_parser = XMLParser(encoding=self._charset, recover=True)

            # getting and cleaning the document
            self._source_html = parse(BytesIO(raw_html), parser=our_parser)
            self._source_html = fromstring(tostring(self._source_html))

            # searching for canonical url
            link_canonicals = self._source_html.xpath(
                "//link[normalize-space(@rel)='canonical']/@href")
            self.canonical_url = link_canonicals[0] if len(link_canonicals) > 0 else self.url

            self._source_html = html_cleaner.clean_html(self._source_html)

            # making links absolute
            self._source_html.make_links_absolute(self.url, resolve_base_href=True)
            # drop structural elements that never belong to the summary
            strip_elements(self._source_html, 'blockquote', 'code', 'table', 'ol', 'ul',
                           'embedded', 'input', 'address', 'iframe', 'textarea', 'dl')
        # Exception already covers ConnectionError/Timeout/TypeError, so the
        # original four-way tuple was redundant; behavior is unchanged
        except Exception as e:
            self.error_msg = str(e)
        finally:
            if self.error_msg:
                return

        if self._source_html is not None:
            # obtaining title
            self.title = shorten_title(self._source_html)

            # obtaining image url
            self.image_url = get_image_url(self._source_html, self.url)
            if self.image_url is not None:
                image_url_node = "<meta itemprop=\"image\" content=\"%s\">" % self.image_url
                image_url_img = "<img src=\"%s\" />" % self.image_url
            else:
                image_url_node = image_url_img = ""

            # clean html
            self.clean_html = self._article_extractor.get_clean_html(source_html=self._source_html)

            # summarized description, requires clean_html
            if self.clean_html:
                self.description, self.language = get_plain_text(etree.XML(self.clean_html),
                                                                 self._summary_sentences_qty)
                description_node = ""
                if self.description:
                    # Replacing \xc2\xa0 and \xa0 in result with space
                    self.description = self.description.replace(u'\xc2\xa0', u' ').replace(u'\xa0', u' ')
                    # BUG FIX: interpolate the description into the meta tag —
                    # the original left the bare "%s" placeholder in the output
                    description_node = "<meta name=\"description\" content=\"%s\">" % self.description

                # filling the template
                self.clean_html = ARTICLE_TEMPLATE % {
                    'language': self.language,
                    'title': self.title,
                    'image_url_node': image_url_node,
                    'image_url_img': image_url_img,
                    'description_node': description_node,
                    'clean_html': self.clean_html
                }
class Wanish():
    """Fetches a web page and extracts its article: title, image, language,
    cleaned HTML and a summarized description."""

    def __init__(self, url=None, positive_keywords=None, negative_keywords=None,
                 summary_sentences_qty=5, headers=None):
        """
        Initialization of the class. If url is set, it gets performed.

        :param url: web-page url of the document
        :param positive_keywords: list of keywords, which are likely to be seen in classes or ids of tags
        :param negative_keywords: list of keywords, which are unlikely to be seen in classes or ids of tags
        :param summary_sentences_qty: maximum quantity of summary sentences
        :param headers: custom headers for GET request to obtain web page of the article
        """
        self._article_extractor = ArticleExtractor(positive_keywords=positive_keywords,
                                                   negative_keywords=negative_keywords)

        self.url = None          # source web-page url
        self.title = None        # document's title
        self.image_url = None    # document's image url
        self.language = None     # document's article language
        self.clean_html = None   # cleaned html of the article
        self.description = None  # summarized description (text only)
        self.error_msg = None    # error message

        self._source_html = None  # source html of the document (lxml doc)
        self._charset = None      # source html encoding
        # isinstance() instead of type() ==: idiomatic, accepts dict subclasses
        self._headers = headers if isinstance(headers, dict) else {}  # custom headers for GET request

        # summarized text sentences quantity; int() makes the except clause
        # reachable (a bare assignment never raises TypeError/ValueError)
        try:
            self._summary_sentences_qty = int(summary_sentences_qty)
        except (TypeError, ValueError):
            self._summary_sentences_qty = 5

        # perform the url if defined
        if url:
            self.perform_url(url)

    def perform_url(self, url):
        """
        Perform an article document by designated url

        :param url: web-page url of the document
        """
        self.url = url
        # reset all per-document state before a new fetch
        self.title = self.image_url = self.language = self.description = \
            self.clean_html = self.error_msg = self._charset = None

        if not self.url:
            self.error_msg = 'Empty or null URL to perform'
            return

        # get the page (bytecode)
        try:
            web_page = requests.get(self.url, headers=self._headers)

            # perform http status codes
            if web_page.status_code not in [200, 301, 302]:
                self.error_msg = 'HTTP Error. Status: %s' % web_page.status_code
                return

            self.url = web_page.url
            raw_html = web_page.content
            self._charset = get_encoding(raw_html)
            raw_html_str = raw_html.decode(self._charset)

            # getting and cleaning the document
            self._source_html = document_fromstring(raw_html_str)
            self._source_html = html_cleaner.clean_html(self._source_html)

            # making links absolute
            self._source_html.make_links_absolute(self.url, resolve_base_href=True)
        # Exception already covers ConnectionError/Timeout/TypeError, so the
        # original four-way tuple was redundant; behavior is unchanged
        except Exception as e:
            self.error_msg = str(e)
        finally:
            if self.error_msg:
                return

        if self._source_html is not None:
            # obtaining title
            self.title = shorten_title(self._source_html)

            # obtaining image url
            self.image_url = get_image_url(self._source_html, self.url)
            if self.image_url is not None:
                image_url_node = "<meta itemprop=\"image\" content=\"%s\">" % self.image_url
                image_url_img = "<img src=\"%s\" />" % self.image_url
            else:
                image_url_node = image_url_img = ""

            # clean html
            self.clean_html = self._article_extractor.get_clean_html(source_html=self._source_html)

            # summarized description, requires clean_html
            if self.clean_html:
                self.description, self.language = get_plain_text(etree.XML(self.clean_html),
                                                                 self._summary_sentences_qty)
                # BUG FIX: interpolate the description into the meta tag — the
                # original left the bare "%s" placeholder in the output
                description_node = ("<meta name=\"description\" content=\"%s\">" % self.description
                                    if self.description else "")

                # filling the template
                self.clean_html = ARTICLE_TEMPLATE % {
                    'language': self.language,
                    'title': self.title,
                    'image_url_node': image_url_node,
                    'image_url_img': image_url_img,
                    'description_node': description_node,
                    'clean_html': self.clean_html
                }
class Wanish(object):
    """Fetches a web page and extracts its article: title, canonical url,
    image, language, cleaned HTML and a summarized description."""

    def __init__(self, url=None, positive_keywords=None, negative_keywords=None,
                 summary_sentences_qty=5, headers=None):
        """
        Initialization of the class. If url is set, it gets performed.

        :param url: web-page url of the document
        :param positive_keywords: list of keywords, which are likely to be seen in classes or ids of tags
        :param negative_keywords: list of keywords, which are unlikely to be seen in classes or ids of tags
        :param summary_sentences_qty: maximum quantity of summary sentences
        :param headers: custom headers for GET request to obtain web page of the article
        """
        # TODO: customizable redirects limit?
        self._article_extractor = ArticleExtractor(
            positive_keywords=positive_keywords,
            negative_keywords=negative_keywords)

        self.url = None            # source web-page url
        self.canonical_url = None  # canonical web-page url if present, otherwise same as url
        self.title = None          # document's title
        self.image_url = None      # document's image url
        self.language = None       # document's article language
        self.clean_html = None     # cleaned html of the article
        self.description = None    # summarized description (text only)
        self.error_msg = None      # error message

        self._source_html = None   # source html of the document (lxml doc)
        self._charset = None       # source html encoding
        # isinstance() instead of type() ==: idiomatic, accepts dict subclasses
        self._headers = headers if isinstance(headers, dict) else {}  # custom headers for GET request

        # summarized text sentences quantity; int() makes the except clause
        # reachable (a bare assignment never raises TypeError/ValueError)
        try:
            self._summary_sentences_qty = int(summary_sentences_qty)
        except (TypeError, ValueError):
            self._summary_sentences_qty = 5

        # perform the url if defined
        if url:
            self.perform_url(url)

    def perform_url(self, url):
        """
        Perform an article document by designated url

        :param url: web-page url of the document
        """
        self.url = url
        # reset all per-document state before a new fetch
        self.title = self.image_url = self.language = self.description = \
            self.clean_html = self.error_msg = self._charset = None

        if not self.url:
            self.error_msg = 'Empty or null URL to perform'
            return

        # get the page (bytecode)
        try:
            web_page = requests.get(self.url, headers=self._headers)

            # perform http status codes
            if web_page.status_code not in [200, 301, 302]:
                self.error_msg = 'HTTP error. Status: %s' % web_page.status_code
                return

            self.url = web_page.url
            raw_html = web_page.content

            # getting content_type from headers to obtain encoding,
            # will use it if it is not specified on page
            page_encodings = get_encodings(raw_html)
            if len(page_encodings) > 0:
                self._charset = page_encodings[0]
            elif web_page.encoding is not None:
                self._charset = web_page.encoding
            else:
                # last resort: statistical detection over the raw bytes
                res = chardet.detect(raw_html)
                self._charset = res['encoding']

            string_source = raw_html.decode(self._charset, "ignore")
            self._source_html = fromstring(string_source)

            # searching for canonical url
            link_canonicals = self._source_html.xpath(
                "//link[normalize-space(@rel)='canonical']/@href")
            self.canonical_url = link_canonicals[0] if len(link_canonicals) > 0 else self.url

            self._source_html = html_cleaner.clean_html(self._source_html)

            # making links absolute
            self._source_html.make_links_absolute(self.url, resolve_base_href=True)
            # drop structural elements that never belong to the summary
            strip_elements(self._source_html, 'blockquote', 'code', 'table', 'ol', 'ul',
                           'embedded', 'input', 'address', 'iframe', 'textarea', 'dl')
        # Exception already covers ConnectionError/Timeout/TypeError, so the
        # original four-way tuple was redundant; behavior is unchanged
        except Exception as e:
            self.error_msg = str(e)
        finally:
            if self.error_msg:
                return

        if self._source_html is not None:
            # clean html of the article and its starting node
            self.clean_html, starting_node = self._article_extractor.get_clean_html(
                source_html=self._source_html)

            # obtaining title
            short_title, title_node = shorten_title(self._source_html, starting_node)
            self.title = clean_entities(short_title)

            # obtaining image url
            self.image_url = get_image_url(self._source_html, self.url, self._headers,
                                           starting_node, title_node)
            if self.image_url is not None:
                image_url_node = "<meta itemprop=\"image\" content=\"%s\">" % self.image_url
                image_url_img = "<img src=\"%s\" />" % self.image_url
            else:
                image_url_node = image_url_img = ""

            # summarized description, requires clean_html
            if self.clean_html:
                self.description, self.language = get_plain_text(
                    etree.XML(self.clean_html), self._summary_sentences_qty)
                description_node = ""
                if self.description:
                    # Replacing \xc2\xa0 and \xa0 in result with space
                    self.description = self.description.replace(
                        u'\xc2\xa0', u' ').replace(u'\xa0', u' ')
                    self.description = clean_entities(self.description)
                    # collapse runs of whitespace to single spaces
                    self.description = ' '.join(self.description.split())
                    # BUG FIX: interpolate the description into the meta tag —
                    # the original left the bare "%s" placeholder in the output
                    description_node = "<meta name=\"description\" content=\"%s\">" % self.description

                # filling the template
                self.clean_html = ARTICLE_TEMPLATE % {
                    'language': self.language,
                    'title': self.title,
                    'image_url_node': image_url_node,
                    'image_url_img': image_url_img,
                    'description_node': description_node,
                    'clean_html': self.clean_html
                }