def _generate_html(pages):
    """Build one html document from the pages and store it in a new tempdir.

    The document is written as ``histmag.html`` (utf8 encoded) inside a
    directory freshly created with `tempfile.mkdtemp`. The same directory is
    handed to `_generate_body` while the body is being built.

    :param pages: list of `Page.class`
    :return: path to tempdir.
    :rtype: string.
    """
    tempdir = tempfile.mkdtemp()
    output_path = os.path.join(tempdir, 'histmag.html')

    # Python does not allow keyword arguments containing hyphens, so the
    # "http-equiv" attribute has to be passed through dict unpacking.
    content_type = html.meta(content="text/html; charset=utf-8", **{"http-equiv": "Content-Type"})
    head = html.head(content_type, _get_tile(html, pages))
    article = html.div(_generate_body(html, pages, tempdir), id='article')
    doc = html.html(head, html.body(article))

    with open(output_path, 'wb') as out_file:
        logger.debug('Saving generated html to {file}'.format(file=output_path))
        rendered = doc.unicode(indent=2).encode('utf8')
        out_file.write(rendered)

    return tempdir
def send_email_to_kindle(kindle_email, name='histmag.mobi'):
    """Send the file `name` as an attachment to kindle_email using mailgun api.

    The mailgun credentials are read from the ``MAILGUN_API_KEY`` and
    ``EMAIL_SERVER`` environment variables.

    Basic usage::

        from histmag_to_kindle import send_email_to_kindle
        send_email_to_kindle(kindle_email='your_kindle_email', name='histmag.mobi')

    :param kindle_email: your kindle email
    :param name: path to file to send
    :return: response returned by `requests.post`
    :raises ImproperlyConfigured: when either environment variable is missing
        or empty.
    """
    api_key = os.environ.get('MAILGUN_API_KEY')
    server = os.environ.get('EMAIL_SERVER')
    if not api_key or not server:
        raise ImproperlyConfigured('Either MAILGUN_API_KEY or EMAIL_SERVER variable not found in enviroment variables.')

    logger.debug('Sending html_article to {email}'.format(email=kindle_email))

    # The request is sent synchronously inside the `with` block, so the
    # attachment handle is closed as soon as the upload has finished instead
    # of being leaked.
    with open(name, 'rb') as attachment:
        return requests.post(
            "https://api.mailgun.net/v3/{server}/messages".format(server=server),
            auth=("api", api_key),
            files=[("attachment", attachment)],
            data={"from": "Excited User <mailgun@{server}>".format(server=server),
                  "to": kindle_email,
                  "subject": "Upload",
                  "text": "send to kidle"})
def parse_page(self, url):
    """Download a page and collect the selected elements found under the xpath root.

    Selected elements are paragraphs (except those whose class contains
    "article-tags" or "article-info"), links whose href contains "author",
    and every ``em``, ``img`` and ``span`` element below `self.xpath_root`.
    Each one is stored as a ``(tag, value)`` named tuple: the ``src``
    attribute for images, the full text content for spans and the element's
    own text for everything else (empty string when there is no text).

    :param url: webpage address with 'http://'.
    :type url: string.
    :return: Page object with addr and contents
    :rtype: :class:`Page`
    """
    logger.debug('Started parsing page with url: {url}'.format(url=url))
    response = requests.get(url)
    parsed_page = html.fromstring(response.content)
    Element = namedtuple('Elements', ['tag', 'value'])

    selector = ('{root}//child::p[not(contains(@class, "article-tags")) '
                'and not(contains(@class, "article-info"))] '
                '| {root}//a[contains(@href, "author")]'
                '| {root}//em '
                '| {root}//img '
                '| {root}//span').format(root=self.xpath_root)

    def _value_of(node):
        # Images are represented by their source address, spans by all of
        # their nested text, any other element by its own leading text.
        if node.tag == 'img':
            return node.attrib['src']
        if node.tag == 'span':
            return node.text_content() or ''
        return node.text or ''

    page_contents = [Element(node.tag, _value_of(node))
                     for node in parsed_page.xpath(selector)]
    return Page(url, contents=page_contents)
def get_first_link(self, url, word='następna'):
    """Return the href of the first link under the xpath root containing `word`.

    The page is downloaded and searched below `self.xpath_root` for ``<a>``
    elements whose content contains `word`.

    :param url: webpage address with 'http://'.
    :type url: string.
    :param word: word that is in <a> tag.
    :type word: string.
    :return: value of the ``href`` attribute of the first matching link,
        or None when no link matches.
    :rtype: string.
    """
    response = requests.get(url)
    parsed_page = html.fromstring(response.content)

    link_xpath = '//a[contains(.,"' + word + '")]/@href)[1]'
    query = '(' + self.xpath_root + link_xpath
    hrefs = parsed_page.xpath(query)

    try:
        first_href = hrefs[0]
    except IndexError:
        # An empty result means that no link matched the word.
        logger.debug('No link found for word: {word}'.format(word=word))
        return None
    return first_href
def _generate_body(html, pages, tempdir):
    """Generate html body.

    Every content item of every page becomes one tag of the `html` namespace,
    named after the item's ``tag``. An item is skipped when its value equals
    the value of a non-image item that was already emitted, so repeated text
    appears only once. Image items are first passed through
    `_download_images` and then emitted with the returned value as their
    ``src`` attribute; images never count as "already emitted" values.

    :param html: `Py.xml` html object.
    :param pages: list of `Page.class`.
    :param tempdir: `tempfile` tempdir object.
    :return: `Py.xml` html objects.
    :rtype: list.
    """
    list_of_bodies = []
    # Values of the non-image tags emitted so far. A set keeps the duplicate
    # check constant-time instead of rescanning every emitted tag per item.
    # Values are assumed to be hashable (strings).
    emitted_values = set()

    for page in pages:
        for content in page.contents:
            if content.value in emitted_values:
                continue
            if content.tag == 'img':
                content = _download_images(content, tempdir)
                list_of_bodies.append(getattr(html, content.tag)(src=content.value))
            else:
                list_of_bodies.append(getattr(html, content.tag)(content.value))
                emitted_values.add(content.value)

    logger.debug('Html body generated')
    return list_of_bodies
def get_articles(self):
    """Get full article with subpages for `self.addr`.

    Starting at `self.addr`, each page is parsed with `parse_page` and the
    link returned by `get_first_link` for that page is queued as the next
    page to visit. A link that was already queued before is ignored, so
    pages that link back to an earlier page cannot cause an endless loop.

    :return: all subpages in form of :class:`Page`
    :rtype: list
    """
    urls_queue = deque([self.addr])
    found_urls = {self.addr}
    articles = []

    while urls_queue:
        url = urls_queue.popleft()
        logger.debug('Found url: {url}'.format(url=url))
        current_page = self.parse_page(url)
        link = self.get_first_link(url)
        # Only follow links that have not been seen yet; otherwise a cycle
        # between subpages would keep the queue non-empty forever.
        if link and link not in found_urls:
            found_urls.add(link)
            urls_queue.append(link)
        articles.append(current_page)

    logger.info('Articles has been extracted')
    return articles