def parse(self):
    """Scrape the site's home page and every article linked from it.

    Returns:
        dict: the ``NewsSite.to_dict()`` payload with one entry per
        article (title, url, image_url, date, body_html, body_text).
    """
    # The site serves an invalid/self-signed certificate: TLS
    # verification is disabled and the resulting urllib3 warnings
    # are silenced up front.
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    site = NewsSite(self.url)
    home_raw = requests.get(self.url, headers=self.headers, verify=False)
    home = html.fromstring(home_raw.content)
    excerpts = home.xpath('//article')
    for excerpt in excerpts:
        title = excerpt.xpath('.//h2/a/text()')[0].strip()
        url = excerpt.xpath('.//h2/a/@href')[0]
        # BUG FIX: article pages live on the same host as the home
        # page, so this request must also skip certificate
        # verification; without verify=False it raised SSLError and
        # aborted the whole parse.
        article_raw = requests.get(url, headers=self.headers, verify=False)
        article = html.fromstring(article_raw.content)
        # og:image is optional on this site — fall back to None.
        image_url = None
        image_raw = article.xpath('.//meta[@property="og:image"]/@content')
        if len(image_raw):
            image_url = image_raw[0]
        date_raw = article.xpath(
            '//div[contains(@class, "cb-entry-header")]//time/@datetime'
        )[0]
        # Only the leading YYYY-MM-DD part of the timestamp is used.
        date = datetime.strptime(date_raw.strip()[:10], "%Y-%m-%d")
        body_html_raw = article.xpath('//span[@class="cb-itemprop"]')[0]
        body_html = html.tostring(body_html_raw)
        body_text = body_html_raw.text_content().strip()
        site.add_article(title, url, image_url, date, body_html, body_text)
    return site.to_dict()
def parse(self):
    """Scrape the site's home page and every article linked from it.

    Returns:
        dict: the ``NewsSite.to_dict()`` payload with one entry per
        article (title, url, image_url, date, body_html, body_text).
    """
    site = NewsSite(self.url)
    home_raw = requests.get(self.url, headers=self.headers)
    home = html.fromstring(home_raw.content)
    excerpts = home.xpath('//article[contains(@class, "post")]')
    for excerpt in excerpts:
        title = excerpt.xpath('.//h2/a/text()')[0].strip()
        url = excerpt.xpath('.//h2/a/@href')[0]
        article_raw = requests.get(url, headers=self.headers)
        article = html.fromstring(article_raw.content)
        # Strip scripts, styles and site boilerplate so the extracted
        # body text contains only article content.
        for script in article.xpath('//script'):
            script.getparent().remove(script)
        for style in article.xpath('//style'):
            style.getparent().remove(style)
        for div in article.xpath('//div[@class="entry-meta-tags"]'):
            div.getparent().remove(div)
        for div in article.xpath('//div[@class="post-body-social"]'):
            div.getparent().remove(div)
        for div in article.xpath('//div[@class="entry-meta-author"]'):
            div.getparent().remove(div)
        # BUG FIX: og:image is not guaranteed to be present; indexing
        # [0] unconditionally raised IndexError on any article lacking
        # it and aborted the whole parse. Guard the lookup like the
        # sibling parsers do.
        image_url = None
        image_raw = article.xpath('.//meta[@property="og:image"]/@content')
        if len(image_raw):
            image_url = image_raw[0]
        date_raw = article.xpath('//time[@class="value-title"]/@datetime')[0]
        # Only the leading YYYY-MM-DD part of the timestamp is used.
        date = datetime.strptime(date_raw.strip()[:10], "%Y-%m-%d")
        body_html_raw = article.xpath('//div[contains(@class, "post-content")]')[0]
        body_html = html.tostring(body_html_raw)
        body_text = body_html_raw.text_content().strip()
        site.add_article(title, url, image_url, date, body_html, body_text)
    return site.to_dict()
def parse(self):
    """Scrape the site's home page and every article linked from it.

    Date, image and body are all optional and default to None when the
    corresponding element is missing from the article page. The body is
    taken from the legacy-article container when present, otherwise
    from the newer uncode text column.

    Returns:
        dict: the ``NewsSite.to_dict()`` payload with one entry per
        article (title, url, image_url, date, body_html, body_text).
    """
    site = NewsSite(self.url)
    home = html.fromstring(
        requests.get(self.url, headers=self.headers).content)
    for teaser in home.xpath('//a[@class="leafly-article"]'):
        title = teaser.xpath('.//span[@class="leafly-title"]/text()')[0]
        url = teaser.xpath('./@href')[0]
        article = html.fromstring(
            requests.get(url, headers=self.headers).content)
        # Drop scripts so they don't pollute the extracted text.
        for script in article.xpath('//script'):
            script.getparent().remove(script)
        published = article.xpath(
            '//meta[@property="article:published_time"]/@content')
        # Only the leading YYYY-MM-DD part of the timestamp is used.
        date = (datetime.strptime(published[0].strip()[:10], "%Y-%m-%d")
                if published else None)
        header_imgs = article.xpath(
            '//div[@class="leafly-standard-article-header"]//img/@src')
        image_url = header_imgs[0] if header_imgs else None
        body_html = None
        body_text = None
        # Prefer the legacy layout; fall back to the uncode layout.
        matches = article.xpath('//div[@class="leafly-legacy-article"]')
        if not matches:
            matches = article.xpath(
                '//div[@class="post-content style-light"]/div[2]//div[@class="uncode_text_column"]'
            )
        if matches:
            body_html = html.tostring(matches[0])
            body_text = matches[0].text_content().strip()
        site.add_article(title, url, image_url, date, body_html, body_text)
    return site.to_dict()
def parse(self):
    """Scrape the configured site using the injected helpers.

    Fetches the home page via ``self.html_getter``, locates news
    entries with the XPaths from ``self.xpaths_container``, then
    fetches and extracts each article. Entries without a URL, and
    pages that fail to download, are skipped.

    Returns:
        dict | None: the ``NewsSite.to_dict()`` payload, or None when
        the home page could not be fetched.
    """
    result = NewsSite(self.url)
    home_html = self.html_getter.get(self.url)
    if not home_html:
        return
    document = html.fromstring(home_html)
    items = HtmlUtils.get_elements(
        document, self.xpaths_container.get_news_xpath())
    for item in items:
        title = self._get_title(item)
        url = self._get_url(item)
        if not url:
            continue
        page_html = self.html_getter.get(url)
        if not page_html:
            continue
        article = html.fromstring(page_html)
        image_url = self._get_image_url(article)
        date = self._get_date(article)
        # Clean the DOM before text extraction so removed elements
        # never reach the stored body.
        self.remove_elements(
            article, self.xpaths_container.get_elements_to_remove_xpaths())
        text_html = self._get_text_html(article)
        text_plain = self._get_text_plain(article)
        result.add_article(title, url, image_url, date, text_html, text_plain)
    return result.to_dict()