def scrape_unit(self, unit: NewsdeskUnit):
    article_element, article = unit

    try:
        article_element.find_element_by_css_selector(".extract_full_link_button").click()
    except NoSuchElementException:
        pass
    else:
        self.wait(".extract_full_link_button", on=article_element)

    try:
        inner = self.wait(".extract_inner", on=article_element)
    except NotVisible:
        if article.get_property("wordcount_int") <= 1:
            article.text = "[NO TEXT]"
        else:
            raise
    else:
        article_html = inner.get_attribute("innerHTML")
        article.text = html2text(article_html)

        # Cut off data urls at #3; no article actually has that many. All instances so far
        # led to the same data (even though the url differed).
        data_urls = list(get_data_urls(inner))[:3]
        for i, data_url in enumerate(data_urls):
            article.set_property("data{}_url".format(i), data_url)

    # Be gentle with servers
    time.sleep(random.uniform(0.1, 0.3))

    return article

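# `html2text` is used throughout the scrapers in this module but is not defined in this
# excerpt. The sketch below is an assumption of what it does (convert an HTML string or a
# list of lxml elements to plain text); the real helper lives elsewhere and may differ.
def _html2text_sketch(html):
    """Hypothetical stand-in for the html2text helper used throughout this module."""
    import lxml.html

    if isinstance(html, (list, tuple)):
        # A list of lxml elements: join their text content, separated by blank lines.
        return "\n\n".join(el.text_content().strip() for el in html)
    if isinstance(html, str):
        html = lxml.html.fromstring(html)
    return html.text_content().strip()
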
def scrape_unit(self, url):
    reader_url = "about:reader?url={}".format(url)
    doc = self.get_html(reader_url, wait_for="div.content p")

    for tag in REMOVE_TAGS:
        for element in doc.cssselect(tag):
            element.getparent().remove(element)

    article = doc.cssselect("div.content")[0]
    article_html = lxml.html.tostring(article).decode()

    title = doc.cssselect("h1.reader-title")[0].text_content().strip()
    text = html2text(article_html)

    if self.__class__.get_date is not GenericScraper.get_date:
        # Get contents of un-firefox-read-ed article
        self.wait(".reader-toolbar .close-button").click()
        time.sleep(0.3)
        doc_html = self.wait("html").get_attribute("outerHTML")
        doc = lxml.html.fromstring(doc_html, base_url=url)

        try:
            date = self.get_date(doc)
        except NotImplementedError:
            date = self.now
        except Exception as e:
            log.warning("get_date() failed for {} with: {}".format(url, e))
            date = self.now
    else:
        date = self.now

    article = Article(date=date, title=title, text=text, url=url)

    return article

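# REMOVE_TAGS is referenced above but not defined in this excerpt. Its contents are an
# assumption: presumably the tags to strip from Firefox's reader view before extracting
# text, e.g.:
REMOVE_TAGS_EXAMPLE = ("script", "style", "iframe", "figure")  # hypothetical values
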
def scrape_unit(self, date_and_article_url):
    date, article_url = date_and_article_url
    log.info("Fetching {}".format(article_url))
    article_doc = self.session.get_html(article_url)

    article_el = article_doc.cssselect("#content > article")

    if not article_el:
        log.error("Could not find article on {article_url}".format(**locals()))
        return None

    title = article_el[0].cssselect("h1")[0].text
    text = html2text(article_el[0].cssselect("p"))
    text = text.strip() or "."

    try:
        footer = article_el[0].cssselect("footer")[0]
    except IndexError as e:
        # Contains <embed> tag which is not closed gracefully :-(
        log.exception(e)
        return None

    author = footer.text.rsplit("|", 1)[0].strip()
    timestamp = parse_date(article_el[0].cssselect("footer > time")[0].get("datetime"))

    if not title:
        return None

    children = self._get_comments(title, article_url, article_doc)

    article = Article(date=timestamp, title=title, text=text)
    article.set_property("author", author)
    article.set_property("url", article_url)
    article.set_property("medium", "GeenStijl")

    return ArticleTree(article, [ArticleTree(c, []) for c in children])

def _scrape_unit(self, article):
    doc = self.session.get_html(article['url'])
    article['headline'] = doc.cssselect("div.intro h1")[0].text
    article['text'] = html2text(
        doc.cssselect("div.intro h2") + doc.cssselect("div.text"))
    if doc.cssselect("span.author"):
        article['author'] = doc.cssselect("span.author")[0].text.split("Door:")[1]

def _scrape_unit(self, url):
    doc = self.session.get_html(url)
    article = {
        'url': doc.base_url,
        'text': tools.html2text(doc.cssselect("#article-content p")),
        'headline': doc.cssselect("#article h1")[0].text_content().strip(),
        'date': tools.read_date(doc.cssselect("abbr.page-last-modified")[0].text),
        'externalid': int(url.split("/")[-1].split("-")[0]),
    }
    return article

def get_article_section_text(self, url):
    article_doc = self.session.get_html(url)
    print(url)
    yield article_doc.cssselect("#main")[0].text_content().strip()
    #yield article_doc.cssselect(".breadcrumb li.active")[0].text_content().strip()
    #text = html2text(article_doc.cssselect(".article.body")[0]).strip()
    text = html2text(article_doc.cssselect("div.block.article-body")[0]).strip()
    print(f"{text}")

    # Bylines are sometimes included in the main body text. We won't put in
    # any effort to pluck out those cases though..
    byline = article_doc.cssselect(".article.header .item-excerpt")
    if byline:
        text += "\n\n"
        text += html2text(byline[0])

    yield text

def scrape_unit(self, article_info: ArticleTuple):
    date, page_num, url = article_info

    try:
        text_url = strip_query(self.session.get_redirected_url(url))
    except RedirectError as e:
        if e.status_code == 404:
            return None
        raise

    text_doc = self.session.get_html(text_url)

    for image in text_doc.cssselect(".image"):
        image.getparent().remove(image)

    date = datetime.datetime(date.year, date.month, date.day)

    try:
        title = text_doc.cssselect("article > h1")[0].text.strip()
    except (IndexError, AttributeError):
        # No <h1> present, or it has no text
        return None

    text = html2text(text_doc.cssselect("main > article > .body"))
    if not text.strip():
        return None

    article = Article(title=title, date=date, text=text, url=url)

    if text_doc.cssselect("article > header.themed"):
        # New header style
        author = text_doc.cssselect("article > header .author")[0].text
        section = text_doc.cssselect("article > header .title")[0].text
        article.set_property("author", author)
    else:
        # Old header style
        section = text_doc.cssselect("article > header > .title")
        section = section[0].text if section else "NOSECTION"

        author_a = text_doc.cssselect("article .author a")
        if author_a:
            author = author_a[0].text.strip()
            article.set_property("author", author)
            if author == section:
                section = "Opinie"

    download = text_doc.cssselect('form[name="download"]')
    if download:
        pdf_url = download[0].get("action")
        article.set_property("pdf_url", pdf_url)

    article.set_property("text_url", text_url)
    article.set_property("image_url", text_url + "?view=img")
    if section:
        article.set_property("section", section.strip())

    return article

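# `strip_query` (used above) is not defined in this excerpt. A minimal sketch, assuming it
# simply drops the query string and fragment from a URL:
from urllib import parse as _urlparse  # stdlib

def _strip_query_sketch(url: str) -> str:
    """Hypothetical stand-in: return the URL without its query string or fragment."""
    scheme, netloc, path, params, _query, _fragment = _urlparse.urlparse(url)
    return _urlparse.urlunparse((scheme, netloc, path, params, "", ""))

# e.g. _strip_query_sketch("https://example.org/a?view=img#x") -> "https://example.org/a"
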
def _parse_comment(self, comment, base_title, base_url):
    text = html2text(comment.cssselect("p"))
    article_id = comment.get("id")
    title = "{base_title}#{article_id}".format(**locals())
    url = "{base_url}#{article_id}".format(**locals())
    author, timestamp = _parse_comment_footer(comment.cssselect("footer")[0].text_content())

    article = Article(date=timestamp, title=title, text=text.strip() or ".", url=url)
    article.set_property("author", author.strip())
    article.set_property("medium", "GeenStijl Comments")

    return article

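# `_parse_comment_footer` (used above) is not included in this excerpt. Judging by the
# GeenStijl article scraper, a footer looks roughly like "<author> | <date>", so a rough,
# hypothetical sketch could be:
def _parse_comment_footer_sketch(footer_text):
    """Hypothetical stand-in: split 'author | date' and parse the date part."""
    from dateutil.parser import parse  # assumption; the real code may use its own date helper

    author, _, date_part = footer_text.partition("|")
    return author.strip(), parse(date_part.strip(), dayfirst=True)
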
def _get_deduplicate_units(self, date, edition=None):
    self.browser.get("https://digitalpublishing.telegraaf.nl/static/krant/")

    found = False
    for day_container in self.browser.find_elements_by_css_selector(".Day__date-container"):
        paper_date_string = " ".join(day_container.text.split()[1:3] + ["2018"])
        paper_date = dutch_strptime(paper_date_string, "%d %B %Y").date()
        print(paper_date_string, paper_date, date, paper_date == date)
        if date == paper_date:
            self.wait(".Day__button", on=day_container).click()
            found = True
            break

    if found:
        self.wait("#next-page-button")
        while self.next_button().is_displayed():
            for article in self.browser.find_elements_by_css_selector(".pages-swiper-slide-active .article-layer"):
                self.click(article)
                time.sleep(1.5)

                try:
                    self.browser.switch_to_frame(self.wait("iframe.article-contents", timeout=10))
                except NotVisible:
                    print("Warning: article skipped because frame was not visible")
                    continue

                article_html = self.wait("body").get_property("outerHTML")
                text = html2text(article_html)
                url = self.browser.current_url
                (scheme, netloc, path, params, query, fragment) = parse.urlparse(url)
                query += "&hash=" + hashlib.sha256(article_html.encode()).hexdigest()[:20]
                url = parse.urlunparse((scheme, netloc, path, params, query, fragment))
                page_range = fragment.split("/")[-1]

                try:
                    title = self.wait("body > .head", timeout=2).text.strip()
                except NoSuchElementException:
                    continue
                else:
                    yield TelegraafUnit(url, date, title, text, page_range)
                finally:
                    self.browser.switch_to_default_content()

                    # Close modal
                    self.wait(".article-modal-default-button").click()
                    time.sleep(0.5)

            self.next_button().click()
            time.sleep(0.5)

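# `dutch_strptime` (used above) is not shown in this excerpt. A minimal sketch, assuming it
# behaves like datetime.strptime but accepts Dutch month names (e.g. "12 maart 2018"):
import datetime as _dt

_DUTCH_MONTHS = {
    "januari": "January", "februari": "February", "maart": "March", "april": "April",
    "mei": "May", "juni": "June", "juli": "July", "augustus": "August",
    "september": "September", "oktober": "October", "november": "November",
    "december": "December",
}

def _dutch_strptime_sketch(value: str, fmt: str) -> _dt.datetime:
    """Hypothetical stand-in: translate Dutch month names, then defer to strptime."""
    for dutch, english in _DUTCH_MONTHS.items():
        value = value.replace(dutch, english)
    return _dt.datetime.strptime(value, fmt)

# e.g. _dutch_strptime_sketch("12 maart 2018", "%d %B %Y").date() == _dt.date(2018, 3, 12)
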
def _scrape_unit(self, url):
    doc = self.session.get_html(url)
    date_txt = doc.cssselect("div.dateplace-data")[0].text
    date = read_date(date_txt)
    if date.date() not in self.dates:
        return
    hours, minutes = map(int, date_txt.split()[-1].split(":"))
    date += timedelta(hours=hours, minutes=minutes)

    content = doc.cssselect("#leadarticle")[0]
    article = {
        'text': html2text(content.cssselect("div.content")),
        'headline': content.cssselect("h1")[0].text,
        'section': url.split("/")[3],
        'date': date,
        'url': url,
    }
    if content.cssselect("span.smallprint"):
        article['author'] = content.cssselect("span.smallprint")[-1].text.split(":")[1].strip()
    return article

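# `read_date` (used above, and as `tools.read_date` earlier) is not part of this excerpt.
# A minimal sketch, assuming it parses a free-form (possibly Dutch-formatted) date string
# into a datetime:
def _read_date_sketch(text):
    """Hypothetical stand-in: parse a day-first date string into a datetime."""
    from dateutil.parser import parse  # assumption; the real helper may differ

    return parse(text, dayfirst=True, fuzzy=True)
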
def scrape_unit(self, article_info: ArticleTuple):
    date, page_num, url = article_info

    try:
        text_url = strip_query(self.session.get_redirected_url(url))
    except RedirectError as e:
        if e.status_code == 404:
            return None
        raise

    try:
        text_doc = self.session.get_html(text_url)
    except HTTPError as e:
        if e.response.status_code == 404:
            logging.warning(f"{url} returned 404, skipping")
            return None
        else:
            raise

    for image in text_doc.cssselect(".image"):
        image.getparent().remove(image)

    date = datetime.datetime(date.year, date.month, date.day)

    try:
        title = text_doc.cssselect("article > h1")[0].text.strip()
    except (IndexError, AttributeError):
        # No <h1> present, or it has no text
        return None

    text = html2text(text_doc.cssselect("main > article > .body"))
    if not text.strip():
        return None

    article = Article(title=title, date=date, text=text, url=url)

    if text_doc.cssselect("article > header.themed"):
        # New header style
        author = text_doc.cssselect("article > header .author")[0].text
        section = text_doc.cssselect("article > header .title")[0].text
        article.set_property("author", author)
    else:
        # Old header style
        section = text_doc.cssselect("article > header > .title")
        section = section[0].text if section else "NOSECTION"

        author_a = text_doc.cssselect("article .author a")
        if author_a:
            author = author_a[0].text.strip()
            article.set_property("author", author)
            if author == section:
                section = "Opinie"

    download = text_doc.cssselect('form[name="download"]')
    if download:
        pdf_url = download[0].get("action")
        article.set_property("pdf_url", pdf_url)

    article.set_property("text_url", text_url)
    article.set_property("image_url", text_url + "?view=img")
    if section:
        article.set_property("section", section.strip())

    return article

def _get_deduplicate_units(self, date: datetime.datetime, edition=None):
    # Select edition
    logging.info(f"Selecting date {date}")
    self.accept_cookie(timeout=1)
    self.browser.get(self.login_url)
    if edition is not None:
        self.click(self.wait('//div[text() = "{}"]'.format(edition), by=By.XPATH))

    # accept cookies
    self.accept_cookie2()

    # Go to archive and select paper of this date
    self.wait("paper-button.showMoreButton").click()

    # make sure right header is not hidden
    header = self.wait('#rightHeader', visible=False)
    self.browser.execute_script('arguments[0].removeAttribute("hidden");', header)

    # click "Archief" button
    self.wait('archive-calendar-button').click()

    # find correct year
    while True:
        picked_year = int(self.wait("#yearSelection > paper-slider").get_attribute("value"))
        #picked_year = int(self.wait('input.style-scope.vl-date-picker').text)
        if picked_year == date.year:
            break
        year_buttons = self.browser.find_elements_by_css_selector("#yearSelection iron-icon")
        if picked_year > date.year:
            year_buttons[0].click()
        elif len(year_buttons) > 1:
            year_buttons[1].click()
        else:
            raise Exception(f"Only one year button, but {picked_year} < {date.year}")

    # find correct month
    self.wait("#monthSelection").find_element_by_xpath(
        f"//paper-button[@data-month={date.month-1}]").click()

    # find correct day -- wait 3 seconds to give date picker time to load
    time.sleep(3)
    day_button = self.wait("#daySelection").find_element_by_xpath(
        f"//paper-button[@data-current and @data-day={date.day}]")
    logging.info(
        f'{date}: aria-disabled: {day_button.get_attribute("aria-disabled")}, '
        f'active: {day_button.get_attribute("active")}')
    if day_button.get_attribute("aria-disabled") == "true":
        logging.warning(f"No newspaper for {date}, Sunday?")
        return
    if day_button.get_attribute("active") == "false":
        # Don't click if correct day was already selected
        day_button.click()
    self.wait("#selectButton").click()

    for archive_issue in self.browser.find_elements_by_css_selector("archive-issue"):
        try:
            archive_date = archive_issue.find_element_by_css_selector(".issueDate").text.strip()
        except NoSuchElementException:
            continue
        if not archive_date:
            continue
        if dutch_strptime(archive_date, "%d %B %Y").date() == date:
            archive_issue.click()
            break
    else:
        logging.warning(f"Could not find date {date}")
        return

    # Scrape unit
    self.browser.switch_to_frame(self.wait("iframe#issue"))

    seconds_forgone = 0
    start = datetime.datetime.now()
    while seconds_forgone < 30:
        seconds_forgone = (datetime.datetime.now() - start).total_seconds()
        try:
            self.wait("#articleMenuItem", timeout=10).click()
        except ElementClickInterceptedException:
            pass
        else:
            break

    article_list_buttons = self.browser.find_elements_by_css_selector("#articleListSectionsButtons > button")
    article_list_buttons = list(article_list_buttons) or [lambda: None]

    for article_list_button in article_list_buttons:
        if "selected" not in article_list_button.get_attribute("class"):
            article_list_button.click()

        articles = list(self.browser.find_elements_by_css_selector(".articleListItem"))

        for article in articles:
            page = int(article.get_attribute("data-page"))
            refid = article.get_attribute("data-refid")
            url = urljoin(self.browser.current_url + "/", refid)

            def collect_headers(els):
                for el in els:
                    el_text = el.get_property("textContent").strip()
                    if el_text:
                        yield (el, el_text)

            h1s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h1")))
            h2s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h2")))
            h3s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h3")))

            if h1s:
                _, title = h1s.pop(0)
            elif h2s:
                _, title = h2s.pop(0)
            else:
                _, title = h3s.pop(0)

            try:
                content = article.find_element_by_css_selector("div.content").get_property("outerHTML")
            except NoSuchElementException:
                continue

            subtitles = [element.get_property("outerHTML") for element, _ in h1s + h2s + h3s]
            article_html = "".join(subtitles) + content
            text = html2text(article_html)

            #try:
            #    author = article.find_element_by_css_selector(".byline").get_property("textContent").strip()
            #except NoSuchElementException:
            #    pass
            #else:
            #    print(author)

            # Screenshot code:
            # article.click()
            # self.browser.switch_to_frame(self.wait("#articleViewContent > iframe"))
            # screenshot = self.wait("#page article").screenshot_as_base64
            # self.browser.switch_to_default_content()
            # self.browser.switch_to_frame(self.wait("#issue"))
            # self.wait("#articleNavigationBack").click()
            # time.sleep(0.5)
            screenshot = None

            yield EPagesUnit(url, date, title, page, screenshot, text)

def _get_deduplicate_units(self, date, edition=None):
    self.browser.get("https://digitalpublishing.telegraaf.nl/static/krant/")

    found = False
    for day_container in self.browser.find_elements_by_css_selector(".Day__date-container"):
        paper_date_string = " ".join(day_container.text.split()[1:3] + [str(date.year)])
        paper_date = dutch_strptime(paper_date_string, "%d %B %Y").date()
        print(f"paper date {paper_date_string}, {paper_date}, {date}, {paper_date == date}")
        if date == paper_date:
            self.wait(".Day__button", on=day_container).click()
            found = True
            break

    if found:
        self.wait("#next-page-button")
        while self.next_button().is_displayed():
            try:
                articles = self.wait_multiple(".pages-swiper-slide-active .article-layer")
            except NoSuchElementException:
                logging.warning(f"Could not find article layer in {self.browser.current_url}")
                articles = []

            for article in articles:
                self.click(article)
                time.sleep(1.5)

                try:
                    self.browser.switch_to_frame(self.wait("iframe.article-contents", timeout=10))
                except NotVisible:
                    print("Warning: article skipped because frame was not visible")
                    continue

                article_html = self.wait("body").get_property("outerHTML")
                text = html2text(article_html)
                url = self.browser.current_url
                (scheme, netloc, path, params, query, fragment) = parse.urlparse(url)
                query += "&hash=" + hashlib.sha256(article_html.encode()).hexdigest()[:20]
                url = parse.urlunparse((scheme, netloc, path, params, query, fragment))
                page_range = fragment.split("/")[-1]

                try:
                    title = self.wait("body > .head", timeout=2).text.strip()
                except NoSuchElementException:
                    try:
                        title = self.wait("body > .head1", timeout=2).text.strip()
                    except NoSuchElementException:
                        logging.warning(f"No title found: {url}")
                        title = "-"

                if not title:
                    logging.warning(f"Empty title for {url}")
                    title = "-"

                yield TelegraafUnit(url, date, title, text, page_range)

                self.browser.switch_to_default_content()

                # Close modal
                self.wait(".article-modal-default-button").click()
                time.sleep(0.5)

            self.next_button().click()
            time.sleep(0.5)

def _get_deduplicate_units(self, date, edition=None):
    # Select edition
    self.browser.get(self.login_url)
    if edition is not None:
        self.click(self.wait('//div[text() = "{}"]'.format(edition), by=By.XPATH))

    # Go to archive and select paper of this date
    self.wait("paper-button.showMoreButton").click()
    for archive_issue in self.browser.find_elements_by_css_selector("archive-issue"):
        try:
            archive_date = archive_issue.find_element_by_css_selector(".issueDate").text.strip()
        except NoSuchElementException:
            continue
        if not archive_date:
            continue
        if dutch_strptime(archive_date, "%d %B %Y").date() == date:
            archive_issue.click()
            break
    else:
        return

    # Scrape unit
    self.browser.switch_to_frame(self.wait("#issue"))

    seconds_forgone = 0
    start = datetime.datetime.now()
    while seconds_forgone < 30:
        seconds_forgone = (datetime.datetime.now() - start).total_seconds()
        try:
            self.wait("#articleMenuItem", timeout=60).click()
        except ElementClickInterceptedException:
            pass
        else:
            break

    article_list_buttons = self.browser.find_elements_by_css_selector("#articleListSectionsButtons > button")
    article_list_buttons = list(article_list_buttons) or [lambda: None]

    time.sleep(2)
    for article_list_button in article_list_buttons:
        article_list_button.click()
        time.sleep(2)

        articles = list(self.browser.find_elements_by_css_selector(".articleListItem"))

        for article in articles:
            page = int(article.get_attribute("data-page"))
            refid = article.get_attribute("data-refid")
            url = urljoin(self.browser.current_url + "/", refid)

            def collect_headers(els):
                for el in els:
                    el_text = el.get_property("textContent").strip()
                    if el_text:
                        yield (el, el_text)

            h1s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h1")))
            h2s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h2")))
            h3s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h3")))

            if h1s:
                _, title = h1s.pop(0)
            elif h2s:
                _, title = h2s.pop(0)
            else:
                _, title = h3s.pop(0)

            try:
                content = article.find_element_by_css_selector("div.content").get_property("outerHTML")
            except NoSuchElementException:
                continue

            subtitles = [element.get_property("outerHTML") for element, _ in h1s + h2s + h3s]
            article_html = "".join(subtitles) + content
            text = html2text(article_html)

            #try:
            #    author = article.find_element_by_css_selector(".byline").get_property("textContent").strip()
            #except NoSuchElementException:
            #    pass
            #else:
            #    print(author)

            # Screenshot code:
            # article.click()
            # self.browser.switch_to_frame(self.wait("#articleViewContent > iframe"))
            # screenshot = self.wait("#page article").screenshot_as_base64
            # self.browser.switch_to_default_content()
            # self.browser.switch_to_frame(self.wait("#issue"))
            # self.wait("#articleNavigationBack").click()
            # time.sleep(0.5)
            screenshot = None

            yield EPagesUnit(url, date, title, page, screenshot, text)