def _scrape_unit(self, article_id):
    article = HTMLDocument(url=self.article_url.format(**locals()))
    article.prepare(self)
    article.props.text = article.doc.cssselect("font.artbody")
    if len("".join([t.text_content() for t in article.props.text])) < 100:
        # too little text to be a real article
        return
    for i, table in enumerate(article.doc.cssselect("table")):
        if table.get('class') == "body":
            table_after_body = article.doc.cssselect("table")[i + 1]
            page_date = re.search(
                r"Pagina ([0-9]+), ([0-9]{2}-[0-9]{2}-[0-9]{4})",
                table_after_body.text_content())
            article.props.pagenr = page_date.group(1)
            article.props.date = readDate(page_date.group(2))
    article.props.section = self.current_section
    article.props.headline = article.doc.cssselect("td.artheader")[0].text_content().strip()
    if article.doc.cssselect(".artsubheader"):
        article.props.byline = article.doc.cssselect(".artsubheader")[0]
    if article.doc.cssselect("td.artauthor"):
        article.props.author = article.doc.cssselect("td.artauthor")[0].text.split(":")[1].strip()
    dateline_match = re.search(
        r"^([A-Z][a-z]+(( |/)[A-Z][a-z]+)?)\n",
        "\n".join([n.text_content() for n in article.props.text]).strip())
    if dateline_match:
        article.props.dateline = dateline_match.group(1)
    yield article

def _get_units(self):
    for section in self.sections:
        page = 1
        url = self.page_url.format(**locals())
        date = _date.today()
        ipage = self.getdoc(url)
        while date >= self.options['date']:
            if not ipage.cssselect("#main ul.snelnieuws_list li.item"):
                print("\nNo articles found as far back as given date\n")
                break
            for unit in ipage.cssselect('#main ul.snelnieuws_list li.item'):
                href = unit.cssselect('a')[0].get('href')
                article = HTMLDocument(url=href)
                article.prepare(self)
                try:
                    date = readDate(article.doc.cssselect("span.datum")[0].text).date()
                except IndexError:
                    continue
                if date == self.options['date']:
                    yield article
                elif date < self.options['date']:
                    break
            page += 1
            nxt_url = self.page_url.format(**locals())
            ipage = self.getdoc(nxt_url)

def _scrape_unit(self, li):
    a = li.cssselect("li > a")[0]
    article = HTMLDocument(url=urljoin(self.index_url, a.get('href')))
    article.props.headline = a.text
    article.props.kicker = li.cssselect("div.infoboard a.kicker")[0].text
    article.props.intro = li.cssselect("p")
    article.props.date = readDate(li.cssselect("div.infoboard span.time")[0].text_content())
    article.prepare(self)
    # articletime is either "date | time | author" or just an author string
    articletime = article.doc.cssselect("p.articletime")[0].text_content()
    if len(articletime.split("|")) > 2:
        article.props.date = readDate(" ".join(articletime.split("|")[:-1]))
        article.props.author = articletime.split("|")[-1]
    else:
        article.props.author = articletime.strip()
    if " Korrespondent" in article.props.author:
        article.props.author = article.props.author.split("Korrespondent")[1].strip()
    for ad in article.doc.cssselect("div.noprint"):
        ad.drop_tree()  # drop print-only cruft and ads
    article.props.text = article.doc.cssselect("p.articlelead, #articletext")
    article.props.section = article.doc.cssselect("div.headtop span.sitetop")[0].text_content()
    yield article

def get_comments(self, page):
    for li in page.doc.cssselect("ul.commentlist li.comment"):
        comment = HTMLDocument()
        comment.parent = page
        try:
            dateauthor = li.cssselect("div.commentsbox")[0].text_content()
        except IndexError:
            # reply-style comment: "author: date ..." as plain text
            comment.props.author = li.text_content().split(":")[0]
            comment.props.date = readDate(":".join(li.text_content().split(":")[1:2]))
            try:
                comment.props.text = li.cssselect("div.comment-text-reply")[0]
            except UnicodeDecodeError:
                continue
        else:
            comment.props.author = dateauthor.split("Geplaatst door")[1].split(" op ")[0]
            try:
                li.cssselect("div.commentsbox a")[0].drop_tree()
            except IndexError:
                pass
            comment.props.date = readDate(dateauthor.split(" op ")[1])
            try:
                comment.props.text = li.cssselect("div.comment-text")[0]
            except UnicodeDecodeError:
                continue
        yield comment

def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self)
    content = article.doc.cssselect("#content-column")[0]
    article.props.date = readDate(content.cssselect("p.article-meta")[0].text.split("|")[1])
    article.props.headline = content.cssselect("h1")[0].text
    for x in [content.cssselect("h1")[0],
              content.cssselect("p.article-meta")[0],
              content.cssselect("p.sharing")[0]]:
        x.drop_tree()
    article.props.text = content.text_content()
    for block in article.doc.cssselect("#aside-column div.block"):
        title = block.cssselect("h2")[0].text
        if "Verantwoordelijk" in title and "ministerie" in title:
            article.props.author = "; ".join([a.text for a in block.cssselect("ul.list-common li a")])
            break
    try:
        # cap the author field at 100 characters
        if len(article.props.author) > 100:
            article.props.author = article.props.author[:100]
    except AttributeError:  # no author found
        pass
    yield article

def _scrape_unit(self, url):
    page = HTMLDocument(url=url)
    page.prepare(self)
    for comment in self.get_comments(page):
        # set the flag before yielding so consumers see it immediately
        comment.is_comment = True
        yield comment
    yield self.get_article(page)

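# A minimal sketch of how a caller might consume the generator above,
# separating comments from articles via the is_comment flag. The `scraper`
# and `save` names here are hypothetical, not part of the original code.
def run(scraper, url):
    for unit in scraper._scrape_unit(url):
        kind = "comment" if getattr(unit, "is_comment", False) else "article"
        save(unit, kind=kind)  # hypothetical storage hook
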
def _scrape_unit(self, pagenr_url):
    pagenumber, url = pagenr_url
    page = HTMLDocument(date=self.options.get("date"), url=url, pagenumber=pagenumber)
    page.prepare(self)
    article = self._get_article(page)
    if article:
        yield article

def scrape_media(self, doc, _type):
    scrn = HTMLDocument()
    scrn.doc = doc
    try:
        scrn.props.text = scrn.doc.cssselect("div.mediaDescription")[0]
    except IndexError:
        scrn.props.text = "none"
    try:
        scrn.props.headline = "{} {}".format(
            scrn.doc.cssselect("div.screenshotAppName")[0].text, _type)
    except IndexError:
        scrn.props.headline = "unknown"
    author_url = "/".join(scrn.doc.cssselect("div.linkAuthor a")[0].get('href').split("/")[:-2])
    scrn = self.get_author_props(scrn, author_url)
    for obj in scrn.doc.cssselect("div.rightDetailsBlock div.detailsStatRight"):
        try:
            scrn.props.date = readDate(obj.text)
        except ValueError:
            continue
        else:
            break
    if not scrn.doc.cssselect("div.commentthread_paging"):
        yield scrn
        return
    if not scrn.doc.cssselect("div.commentthread_header div.commentthread_paging span")[1].text_content():
        # single page of comments
        for comment in self.scrape_comments(scrn):
            yield comment
    else:
        # multi-page comment threads are not supported yet
        raise NotImplementedError
    yield scrn

def get_comment(self, page, header, table):
    comment = HTMLDocument()
    comment.parent = page
    comment.props.date = readDate(header.cssselect("span.kmsgdate")[0].get('title'))
    comment.props.headline = header.cssselect("h2 span")[0].text_content()
    comment.props.author = table.cssselect("li.kpost-username")[0].text_content()
    comment.props.text = table.cssselect("div.kmsgtext")[0]
    return comment

def get_comments(self, page):
    for div in page.doc.cssselect("div.comment"):
        comment = HTMLDocument()
        comment.props.text = div.cssselect("div.reactie")[0]
        comment.props.author = div.cssselect("li.naam")[0].text_content()
        comment.props.date = readDate(div.cssselect("li.date")[0].text_content())
        comment.parent = page
        yield comment

def get_comments(self, page):
    for li in page.doc.cssselect("#detail_reactions #reaction ul.clear li"):
        comment = HTMLDocument()
        comment.props.author = li.cssselect("cite")[0].text.strip()
        comment.props.text = li.cssselect("blockquote")[0]
        comment.props.date = readDate(li.cssselect("span.time")[0].text)
        comment.parent = page
        yield comment

def _get_units(self):
    for url, doc in self.getdocs():
        # the date follows the German "am" in the byline text
        date = readDate(doc.cssselect("section.headlinedivider p.lfloat")[0]
                        .text_content().strip().split("am")[1])
        if date.date() != self.options['date']:
            continue
        article = HTMLDocument(url=url, date=date)
        article.doc = doc
        yield article

def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self)
    article.props.date = self.options['date']
    article.props.section = " > ".join(
        [a.text for a in article.doc.cssselect("#SubHeaderBreadcrumbs a")])
    article.props.headline = article.doc.cssselect("title")[0].text.split("—")[0].strip()
    article.props.text = (article.doc.cssselect("#parent-fieldname-description")
                          + article.doc.cssselect("#parent-fieldname-text"))
    yield article

def _scrape_unit(self, ipage):
    page = ipage
    ipage = HTMLDocument(ipage)
    ipage.doc = self.open(page['url'])
    text = wegenertools.clean(ipage.doc.read())
    err_text = "Uw account is niet geregistreerd voor de door u gekozen uitgave."
    if err_text in text:
        raise Exception(err_text)
    for article_ids in wegenertools.get_article_ids(text):
        body, headline, byline = wegenertools.get_article(text, article_ids)
        if len(body) >= 300:  # filter out non-articles, image links and other html crap
            artpage = HTMLDocument()
            stop = False
            for part in body.split("\n\n"):
                if part.isupper():
                    pass
                else:
                    if "\n" in part:  # a title with a linebreak is probably not an article
                        stop = True
                        break
                    else:
                        artpage.props.headline = part
                        break
            if stop:
                break
            else:
                # strip lone \udcXX surrogate escapes left behind by a lossy decode
                p = re.compile(r"\\udc[0-9a-fA-F]{2}")
                artpage.props.text = literal_eval(p.sub("", repr(body)))
                artpage.props.edition = page['edition']
                artpage.props.byline = byline
                artpage.props.section = page['section']
                if re.match("[A-Z][0-9]+", page['page_str']):
                    artpage.props.section += " - section " + page['page_str'][0]
                    artpage.props.pagenr = int(page['page_str'][1:])
                else:
                    artpage.props.pagenr = int(page['page_str'])
                dateline_pattern = re.compile(
                    r"(^[^\n]+\n\n([A-Z]+( [A-Z]+)?) -\n)|(([A-Z]+( [A-Z]+)?)\n\n)")
                match = dateline_pattern.search(artpage.props.text)
                if match:
                    # dateline and theme share the same syntax and are therefore indistinguishable
                    artpage.props.dateline_or_theme = match.group(2) or match.group(5)
                artpage.props.url = page['url']
                yield artpage

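# A standalone sketch of the surrogate-stripping trick above: repr() turns
# the string into a literal in which lossy-decode leftovers show up as
# \udcXX escapes, the regex deletes them, and literal_eval() parses the
# cleaned literal back into a string. Illustration only; `strip_surrogates`
# is a hypothetical helper, not part of the original scraper.
import re
from ast import literal_eval

def strip_surrogates(body):
    pattern = re.compile(r"\\udc[0-9a-fA-F]{2}")
    return literal_eval(pattern.sub("", repr(body)))
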
def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self.scraper)
    article.props.headline = article.doc.cssselect("#articleDetailTitle")[0].text_content()
    time_post = article.doc.cssselect("div.time_post")[0]
    if time_post.cssselect("span.author"):
        # strip the literal "Door:" ("By:") prefix; lstrip("Dor:") strips
        # characters rather than the prefix and would mangle names starting
        # with D, o or r
        author = time_post.cssselect("span.author")[0].text_content().strip()
        if author.startswith("Door:"):
            author = author[len("Door:"):].strip()
        article.props.author = author
        time_post.cssselect("span.author")[0].drop_tree()
    article.props.date = readDate(time_post.text_content())
    article.props.text = article.doc.cssselect("#art_box2")[0]
    yield article

def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self.scraper)
    article.props.date = readDate(article.doc.cssselect("div.dateplace-data")[0].text)
    article.props.headline = article.doc.cssselect("h1")[0].text_content().strip()
    for s in article.doc.cssselect("script"):
        s.drop_tree()
    article.props.text = article.doc.cssselect("#leadarticle div.content")[0]
    author = article.doc.cssselect("#leadarticle span.smallprint")
    if author:
        article.props.author = author[0].text.strip("| ")
    yield article

def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self.scraper)
    article.props.date = readDate(article.doc.cssselect(
        "#artikel span.datum,#artikel span.datum-premium-content")[0].text_content())
    article.props.headline = article.doc.cssselect("#artikel h1")[0].text_content()
    author = article.doc.cssselect("#artikel span.auteur")
    if author:
        article.props.author = author[0].text_content()
    for s in article.doc.cssselect("#artikelKolom script"):
        s.drop_tree()
    article.props.text = article.doc.cssselect(
        "#artikelKolom,#artikel div.zak-txt-premium-content")[0]
    yield article

def _scrape_unit(self, url):
    doc = self.getdoc(url)
    for a in doc.cssselect('#Articles a'):
        page = HTMLDocument(date=self.options.get('date'))
        page.coords = stoolkit.parse_coords(doc.cssselect('div.%s' % a.get('class')))
        page.props.url = urljoin(url, '%s_text.html' % a.get('class'))
        page.prepare(self)
        article = self.get_article(page)
        if article:
            yield article

def get_article(self, url, datetime):
    page = HTMLDocument(url=url)
    page.prepare(self)
    page.props.headline = page.doc.cssselect("div.article h1")[0]
    page.props.text = page.doc.cssselect("#broodtekst")[0]
    page.props.date = datetime
    if page.doc.cssselect("div.auteursinfo"):
        page.props.author = page.doc.cssselect("div.auteursinfo h2")[0].text_content()
    page.props.section = url.split("/")[3]
    page.props.html = html.tostring(page.doc)
    return page

def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self)
    article.props.date = readDate(article.doc.cssselect("#artikel span.datum")[0].text_content())
    article.props.headline = article.doc.cssselect("#artikel h1")[0].text_content()
    author = article.doc.cssselect("#artikel span.auteur")
    if author:
        article.props.author = author[0].text_content()
    for s in article.doc.cssselect("#artikelKolom script"):
        s.drop_tree()
    article.props.text = article.doc.cssselect("#artikelKolom")[0]
    yield article

def _scrape_unit(self, data):
    headline, article_date, pagenr, section, url = data
    art = HTMLDocument(headline=headline, date=article_date,
                       pagenr=pagenr, section=section, url=url)
    art.doc = self.open(url).read()
    text = self.pdf_to_text(art.doc).decode('utf-8')
    art.props.text = self.fix_text(text)
    art.props.source = "dekrantvantoen.nl"
    yield art

def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self)
    article.props.headline = article.doc.cssselect("#articleDetailTitle")[0].text_content()
    time_post = article.doc.cssselect("div.time_post")[0]
    if time_post.cssselect("span.author"):
        # strip the literal "Door:" prefix instead of lstrip'ing its characters
        author = time_post.cssselect("span.author")[0].text_content().strip()
        if author.startswith("Door:"):
            author = author[len("Door:"):].strip()
        article.props.author = author
        time_post.cssselect("span.author")[0].drop_tree()
    article.props.date = readDate(time_post.text_content())
    article.props.text = article.doc.cssselect("#art_box2")[0]
    yield article

def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self)
    article.props.date = readDate(article.doc.cssselect("div.dateplace-data")[0].text)
    article.props.headline = article.doc.cssselect("h1")[0].text_content().strip()
    for s in article.doc.cssselect("script"):
        s.drop_tree()
    article.props.text = article.doc.cssselect("#leadarticle div.content")[0]
    author = article.doc.cssselect("#leadarticle span.smallprint")
    if author:
        article.props.author = author[0].text.strip("| ")
    yield article

def _scrape_unit(self, url):
    page = HTMLDocument(url=url)
    page.prepare(self)
    page.props.headline = page.doc.cssselect("#art_box2 h1")[0].text_content()
    for h1 in page.doc.cssselect("h1"):
        h1.drop_tree()
    page.props.author = self.getauthor(page.doc)
    page.props.text = page.doc.cssselect("#art_box2 p")
    page.props.date = readDate(page.doc.cssselect("div.time_post")[0].text.split("Bron:")[0])
    page.props.section = re.search(
        r"parool/nl/[0-9]+/([\w\d\-]+)/article", page.props.url).group(1).capitalize()
    page.props.html = html.tostring(page.doc)
    yield page

def get_article(self, url):
    url = "{}_body.html".format(url[:-5])
    pagenum = url.split("/")[7][0:5]
    article = HTMLDocument(url=url, pagenr=int(pagenum))
    article.doc = self.getdoc(url)
    article.props.headline = article.doc.cssselect("td.artheader")[0].text_content().strip()
    article.props.text = article.doc.cssselect("table.body")[0]
    if article.doc.cssselect("td.artauthor"):
        # strip the "door" ("by") prefix; lstrip("dor") strips characters,
        # not the prefix, and would mangle names starting with d, o or r
        author = article.doc.cssselect("td.artauthor")[0].text_content().strip()
        if author.lower().startswith("door"):
            author = author[len("door"):].strip()
        article.props.author = author
    article.props.date = self.options['date']
    article.props.section = self.section
    return article

def get_article(self, url, datetime):
    page = HTMLDocument(url=url)
    page.prepare(self)
    page.props.headline = page.doc.cssselect("div.article h1")[0]
    page.props.text = page.doc.cssselect("#broodtekst")[0]
    page.props.date = datetime
    if page.doc.cssselect("div.auteursinfo"):
        page.props.author = page.doc.cssselect("div.auteursinfo h2")[0].text_content()
    page.props.section = url.split("/")[3]
    return page

def _scrape_unit(self, tr):
    """gets articles from a page"""
    url = urljoin("http://forum.fok.nl", tr.cssselect("td.tTitel a")[0].get('href'))
    topic = HTMLDocument(url=url, section=tr.cssselect("td.tFolder")[0].text_content())
    topic.prepare(self)
    content = topic.doc.text_content()
    if any([(s in content) for s in SEARCHTERMS]):
        for comment in self.get_comments(topic):
            comment.is_comment = True
            yield comment
        yield self.get_article(topic)

def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self)
    firstitem = article.doc.cssselect("#content div.left_div div.item")[0]
    article.props.text = firstitem.cssselect("p, div")
    article.props.date = readDate(firstitem.cssselect("span.date")[0].text)
    article.props.section = "nieuws"
    article.props.headline = firstitem.cssselect("h3")[0].text
    article.props.externalid = url.split("/")[-1]
    em = firstitem.cssselect("em")
    if em:
        article.props.author = "".join(em[0].text.split("Door ")[1:])
    yield article

def get_comments(self, article):
    for li in article.doc.cssselect("li.comment"):
        comment = HTMLDocument()
        comment.props.text = li.cssselect("div.comment-text")[0]
        pattern = re.compile(r"Geplaatst door ([\w ]+) op ([\w :]+)")
        result = pattern.search(li.cssselect("div.commentsbox span")[0].text_content())
        comment.props.author = result.group(1)
        comment.props.date = readDate(result.group(2))
        comment.parent = article
        yield comment

def _scrape_unit(self, urldoc):
    article = HTMLDocument(url=urldoc[0])
    article.doc = urldoc[1]
    # the URL path apparently encodes year / month_x / day at positions 6-8
    _date = [int(urldoc[0].split("/")[6]),
             int(urldoc[0].split("/")[7].split("_")[0]),
             int(urldoc[0].split("/")[8])]
    article.props.date = date(*_date)
    article.props.section = urldoc[0].split("/")[9]
    article.props.author = article.doc.cssselect("div.fullarticle_tagline")[0].text.split("|")[0]
    article.props.headline = article.doc.cssselect("h1.title")[0].text
    article.props.text = article.doc.cssselect("article")[0]
    yield article

def _scrape_unit(self, url):
    doc = self.getdoc(url)
    for li in doc.cssselect("div#article ul.news-list li"):
        url = li.cssselect("a")[0].get('href')
        url = urljoin(INDEX_URL, url)
        page = HTMLDocument(date=self.options['date'], url=url)
        page.prepare(self)
        page.doc = self.getdoc(page.props.url)
        try:
            yield self.get_article(page)
        except IndexError:
            pass

def scrape_comments(self, page):
    p = page.props.url + "?page={}"
    if not page.doc.cssselect("ul.pager"):
        return
    # the last pager link carries the highest page index, e.g. "...?page=12&foo"
    total = int(page.doc.cssselect("ul.pager li.pager-last a")[0]
                .get('href').split("page=")[-1].split("&")[0]) + 1
    docs = [self.getdoc(p.format(x)) for x in range(total)]
    for doc in docs:
        for div in doc.cssselect("#comments div.comment"):
            comment = HTMLDocument()
            comment.props.text = div.cssselect("div.content")[0]
            comment.props.author = div.cssselect("span.submitted-username")[0].text_content()
            comment.props.date = readDate(div.cssselect("div.submitted div.floatr")[0].text_content())
            comment.parent = page
            yield comment

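# A more robust way to read the "page" query parameter than chained string
# splits, sketched with the standard library; `last_page_number` is a
# hypothetical helper (usage: total = last_page_number(href) + 1).
from urllib.parse import urlparse, parse_qs

def last_page_number(href):
    # parse_qs returns lists of values; default to page 0 when absent
    qs = parse_qs(urlparse(href).query)
    return int(qs.get("page", ["0"])[0])
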
def _scrape_unit(self, bits):
    date, url = bits
    article = HTMLDocument(date=date, url=url)
    article.prepare(self)
    content = article.doc.cssselect("#content")[0]
    article.props.section = content.cssselect("div.info-block p.meta a.label")[0].text
    article.props.headline = content.cssselect("div.title h1")[0].text
    article.props.externalid = url.split("-")[-1].strip("W/")
    article.props.text = content.cssselect("div.article")
    article.props.author = content.cssselect("p.meta span.user a.label")[0].text.strip()
    article.props.tags = set([a.text for a in content.cssselect("ul.taglist li a")])
    article.props.view_count = int(content.cssselect("div.info-block span.view-count")[0].text)
    yield article
    self.clearcookies()

def _scrape_unit(self, props):
    props["date"] = self.options["date"]
    if props["type"] == "short":
        yield Document(**props)
    elif props["type"] == "full":
        article = HTMLDocument(**props)
        article.prepare(self)
        article.props.section = " > ".join(
            [li.text_content().strip("|")
             for li in article.doc.cssselect("#a-breadcrumb-component li.a-nav-item")])
        article.props.headline = article.doc.cssselect("div.column_main_wrapper h1.blue")[0].text
        article.props.text = article.doc.cssselect("div.column_main_wrapper div")[0]
        yield article

def get_comments(self, topic):
    first = True
    for page in self.get_pages(topic.doc):
        if first:
            # skip the opening post on the first page; it is the topic itself
            comments = page.cssselect("div.post")[1:]
            first = False
        else:
            comments = page.cssselect("div.post")
        for div in comments:
            comment = HTMLDocument()
            comment.parent = topic
            comment.props.author = div.cssselect("div.postholder_top a.username")[0]
            comment.props.date = readDate(div.cssselect("div.postholder_top span.post_time")[0].text_content())
            comment.props.text = div.cssselect("div.postholder_bot div.contents")[0]
            yield comment

def _scrape_unit(self, xmlitem):
    html_content = html.fromstring(xmlitem.cssselect("description")[0].text)
    url = xmlitem.cssselect("link")[0].tail.split("&url=")[-1]
    article = HTMLDocument(url=url)
    article.props.headline = " - ".join(xmlitem.cssselect("title")[0].text.split(" - ")[:-1])
    article.props.source = xmlitem.cssselect("title")[0].text.split(" - ")[-1]
    article.props.section = xmlitem.cssselect("category")[0].text
    article.props.date = readDate(xmlitem.cssselect("pubdate")[0].text)
    article.props.snippet = html_content.cssselect("div.lh font")[1].text
    try:
        article.prepare(self)
    except Exception:
        yield article
        return
    article.props.html = html.tostring(article.doc)
    yield article

def _scrape_unit(self, urldoc):
    url, doc = urldoc
    article = HTMLDocument(url=url)
    article.doc = doc
    article.props.images = [self.open(img.get('src')).read()
                            for img in article.doc.cssselect("div.broodMediaBox div.image img")]
    article.props.headline = "Poll:" + doc.cssselect("#artikel h1")[0].text_content().split(":")[1]
    article.props.byline = doc.cssselect("#artikel span.auteur")[0].text
    article.props.date = readDate(doc.cssselect("#artikel span.datum")[0].text)
    article.props.externalid = article.props.url.split("/")[-2]
    article.props.text = doc.cssselect("#artikelKolom div.zaktxt,p")
    article.props.dateline = doc.cssselect("#artikelKolom span.location")[0]
    for comment in self.get_comments(article):
        comment.is_comment = True
        yield comment
    yield article

def scrape_file(self, _html, t):
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
    for div in divs:
        article = HTMLDocument()
        article.props.html = div
        article.props.headline = div.cssselect("#articleTitle")[0].text_content()
        article.props.text = div.cssselect("#articleIntro")[0]
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.props.pagenr, article.props.section = self.get_pagenum(articlepage[0].text)
        if not div.cssselect("#sourceTitle")[0].text:
            article.props.medium = Medium.get_or_create("unknown medium")
        else:
            article.props.medium = Medium.get_or_create(div.cssselect("#sourceTitle")[0].text)
        date_str = div.cssselect("#articleDate")[0].text
        try:
            article.props.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article

def _get_units(self):
    for x in range(3):
        try:
            self._cookie()
        except Exception:
            print('Error 503 at _cookie function, trying again in a minute...')
            time.sleep(60)
        else:
            break
    index_dict = {
        'y': self.options['date'].year,
        'm': self.options['date'].month,
        'd': self.options['date'].day,
    }
    url = INDEX_URL.format(**index_dict)
    for x in range(3):
        try:
            index = self.getdoc(url)
        except Exception:
            time.sleep(5)
        else:
            break
    # note: if all three attempts fail, `index` is unbound and the next
    # line raises NameError; see the retry sketch below
    articles = index.cssselect('.title')
    for article_unit in articles:
        href = article_unit.cssselect('a')[0].get('href')
        yield HTMLDocument(url=href, date=self.options['date'])

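# A hedged generalization of the two retry loops above (hypothetical
# helper, not part of the original scrapers): retries a callable a fixed
# number of times and re-raises the last error instead of leaving names
# unbound on total failure.
import time

def retry(func, attempts=3, delay=5):
    last_err = None
    for _ in range(attempts):
        try:
            return func()
        except Exception as err:
            last_err = err
            time.sleep(delay)
    raise last_err
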
def _get_units(self):
    """ PhpBB forum scraper """
    index = self.getdoc(self.index_url)
    for cat_title, cat_doc in self.get_categories(index):
        for page in self.get_pages(cat_doc):
            for fbg in page.cssselect('.forumbg'):
                for li in fbg.cssselect('.topics > li'):
                    url = urljoin(self.index_url, li.cssselect("a.topictitle")[0].get('href'))
                    # the last-post date is the text after the <br/> in dd.lastpost;
                    # splitting serialized HTML on "br />" is brittle (see sketch below)
                    _date = etree.tostring(li.cssselect("dd.lastpost")[0]).split("br />")[1]
                    date = toolkit.readDate(_date)
                    yield {
                        'date': date,
                        'object': HTMLDocument(
                            headline=li.cssselect("a.topictitle")[0].text,
                            url=url,
                            category=cat_title),
                    }

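# A less brittle way to read the text after the <br/> inside dd.lastpost,
# using the element tree instead of string-splitting serialized HTML.
# A sketch under the same markup assumptions; `lastpost_date` is a
# hypothetical helper.
def lastpost_date(dd):
    br = dd.find(".//br")
    # lxml stores the text that follows an element in its .tail
    return br.tail.strip() if br is not None and br.tail else None
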
def _scrape_unit(self, url):
    """gets articles from a page"""
    _json = self.open(str(url)).read()
    data = json.loads(_json)
    done = False
    while data['has_more_items'] and not done:
        doc = html.fromstring(data['items_html'])
        for div in doc.cssselect("div.tweet"):
            tweet = HTMLDocument()
            tweet.props.author = div.cssselect("strong.fullname")[0].text_content()
            tweet.props.date = datetime.fromtimestamp(
                float(div.cssselect("a.tweet-timestamp ._timestamp")[0].get('data-time')))
            tweet.props.text = div.cssselect("p.js-tweet-text")[0]
            maxid = div.get('data-tweet-id')
            if tweet.props.date.date() < self.options['date']:
                done = True
                break
            elif tweet.props.date.date() == self.options['date']:
                yield tweet
        if not done:
            # page through the timeline using the last seen tweet id as cursor
            nexturl = url + "&max_id={}".format(maxid)
            data = json.loads(self.open(str(nexturl)).read())

def _get_units(self):
    back = (date.today() - self.options['date']).days
    index_text = self.open(self.index_url.format(**locals())).read().decode('utf-8')
    # A character in the header makes the html library fail to parse the page
    # correctly (it silently returns half the page without warning -.-).
    # The character is located in the class attribute of each article tag to be
    # scraped, so we take each article tag's inner wrapper and parse that instead.
    arts = []
    for part in index_text.split("<article"):
        arts.append(part.split("</article>")[0])
    for art in set(arts):
        item = html.fromstring(art)
        try:
            _time = time(*map(int, item.cssselect("div.time")[0].text.split(":")))
        except IndexError:
            continue
        article = HTMLDocument(
            date=datetime.combine(self.options['date'], _time),
            headline=item.cssselect("h2.title")[0].text,
            url=urljoin(self.index_url.format(**locals()),
                        item.cssselect("h2.title")[0].getparent().get('href')),
        )
        yield article

def _get_units(self):
    self.open("http://www.powned.tv")
    self.open("http://cookies.publiekeomroep.nl/accept/")
    d = self.options['date']
    docs = []
    # caveat: day arithmetic breaks across month boundaries (d.day - 7 can be
    # zero or negative); a timedelta-based alternative is sketched below
    for x in range(d.day - 7, d.day + 7):
        archive_url = ARCHIVE_URL.format(**locals())
        try:
            doc = self.getdoc(archive_url)
        except HTTPError:
            pass
        else:
            docs.append(doc)
    entries = set([])
    for doc in docs:
        for li in doc.cssselect("ul.articlelist li"):
            _date = readDate(" ".join(li.cssselect("span.t")[0].text.split()[:2])
                             + " " + str(self.options['date'].year)).date()
            url = urljoin(archive_url, li.cssselect("a")[0].get('href'))
            entries.add((_date, url))
    for _date, url in entries:
        if _date == self.options['date']:
            article = HTMLDocument(date=_date, url=url)
            yield article

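# A sketch of the day window using timedelta, which survives month
# boundaries. It assumes ARCHIVE_URL could be formatted from a full date
# rather than a bare day number; `archive_dates` is a hypothetical helper.
from datetime import timedelta

def archive_dates(center, spread=7):
    return [center + timedelta(days=n) for n in range(-spread, spread)]
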
def extract_articles(self, url):
    try:
        doc = self.getdoc(url)
    except HTTPError:
        return
    for tag in doc.cssselect("#main article.news"):
        if 'poll' in tag.get('class'):
            continue
        _date = datetime.fromtimestamp(int(tag.get('created')))
        article = HTMLDocument(date=_date)
        if tag.cssselect("div.tweet"):
            article.props.type = "tweet"
            article.props.text = tag.cssselect("p")[0]
            article.props.author = article.props.text.cssselect("b a")[0].get('title')
            article.props.url = url.split("?")[0]
        elif tag.cssselect("div.quoteBody"):
            article.props.type = "quote"
            a = tag.cssselect("div.quoteBody a")[0]
            article.props.text = a.text_content()
            article.props.url = urljoin(url, a.get('href'))
            article.props.author = tag.cssselect("span.author")[0].text.strip()
        elif tag.cssselect("div.videoContainer") or 'promo' in tag.get('class'):
            continue
        elif tag.cssselect("div.tagline h4"):
            self.stories.add(urljoin(url, tag.cssselect("h4 a")[0].get('href')))
            continue
        else:
            h = tag.cssselect("div.body h3")[0]
            article.props.type = "article"
            article.props.headline = h.text_content().strip()
            if h.cssselect("a"):
                article.props.url = urljoin(url, h.cssselect("a")[0].get('href'))
            else:
                article.props.url = url
        yield article

def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self)
    article.props.text = article.doc.cssselect("#article p:not(#article-info):not(#metadata)")
    info = article.doc.cssselect("#article-info a")
    article.props.date = readDate(info[0].text)
    article.props.section = info[1].text
    article.props.page_str = info[2].text
    article.props.headline = article.doc.cssselect("#article h1")[0].text
    if article.doc.cssselect("#metadata"):
        metadata = article.doc.cssselect("#metadata")[0].text_content().split("|")
        for m in metadata:
            if m.strip().startswith("Trefwoord"):
                # drop the "Trefwoord:" label before splitting the keyword list
                tags = m.split(":", 1)[-1]
                article.props.tags = [t.strip() for t in tags.split(";")]
    yield article