        datestr = article.doc.cssselect("#page,#Page article span.date")[0].text
        article.props.date = self.extract_date(datestr)
        article.props.headline = article.doc.cssselect("#page,#Page article h1.title")[0].text
        article.props.kicker = article.doc.cssselect("#page,#Page article h2.preTitle")[0].text
        article.props.byline = article.doc.cssselect("#page,#Page article p.leadText")[0].text
        article.props.text = [p for p in article.doc.cssselect("#page,#Page article div.bodyText > p")
                              if p.text_content().strip()]
        if article.props.date.date() == self.options['date']:
            yield article

    german_months = ["Januar", "Februar", "März", "April", "Mai", "Juni",
                     "Juli", "August", "September", "Oktober", "November", "Dezember"]

    def extract_date(self, datestr):
        for m in self.german_months:
            if m.lower() in datestr.lower():
                month = self.german_months.index(m)
                break
        day = int(datestr.split(".")[0])
        year, time = datestr.lower().split(m.lower())[1].strip().split()
        return datetime(int(year), month + 1, day,
                        int(time.split(":")[0]), int(time.split(":")[1]))


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(Oe24Scraper)
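
# A minimal sketch of the date format extract_date above assumes, with a
# hypothetical page value; month is a 0-based index into german_months,
# hence the month + 1 in the datetime call:
#
#   >>> scraper.extract_date("14. März 2013 16:45")
#   datetime(2013, 3, 14, 16, 45)
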
def get_parents_from_columns(cols):
    """Assuming cols is an indented list, yield (code, parent) pairs
    in the same order as cols"""
    parents = []
    for i in range(len(cols[0])):
        j = get_index(cols, i)
        code = cols[j][i]
        parents = parents[:j]
        parent = parents[-1] if parents else None
        parents.append(code)
        yield code, parent


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    result = cli.run_cli()
    #print result.output()

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest


def _run_test(bytes, **options):
    if 'project' not in options:
        options['project'] = amcattest.create_test_project().id
    if 'codebook_name' not in options:
        options['codebook_name'] = 'test'
    from tempfile import NamedTemporaryFile
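
# A hypothetical illustration of the indented-column layout the generator
# expects (each inner list is one column; get_index is assumed to return
# the column index that holds the code for row i):
#
#   cols = [["root", "",      ""     ],
#           ["",     "child", ""     ],
#           ["",     "",      "grand"]]
#
# would yield ("root", None), ("child", "root"), ("grand", "child").
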
            print
            for a, _date in self.scrape_page(page_doc):
                if _date.date() in self.dates:
                    yield a, _date
                elif _date.date() < self.dates[0]:
                    br = True
                    break
            if br:
                break
            try:
                data_after = page_doc.cssselect("#nextPage")[0].get('data-after')
            except IndexError:
                break
            for x in range(3):
                try:
                    page_doc = json.loads(self.open(self.page_url.format(**locals())).read())
                except Exception as e:
                    print(self.page_url.format(**locals()))
                    print(e)
            page_doc = html.fromstring(page_doc['content']['div#nextPage'])


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(Nieuws_nlArchiveScraper)
            elif date.date() < self.options['date']:
                return
            pagenr += 1

    def _scrape_unit(self, bits):
        date, url = bits
        article = HTMLDocument(date=date, url=url)
        article.prepare(self)
        content = article.doc.cssselect("#content")[0]
        article.props.section = content.cssselect("div.info-block p.meta a.label")[0].text
        article.props.headline = content.cssselect("div.title h1")[0].text
        article.props.externalid = url.split("-")[-1].strip("W/")
        article.props.text = content.cssselect("div.article")
        article.props.author = content.cssselect("p.meta span.user a.label")[0].text.strip()
        article.props.tags = set([a.text for a in content.cssselect("ul.taglist li a")])
        article.props.view_count = int(content.cssselect("div.info-block span.view-count")[0].text)
        yield article
        self.clearcookies()

    def clearcookies(self):
        """Clear cookies so the site won't interrupt us after 3 articles"""
        self.opener.cookiejar._cookies = {}


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(BoerderijScraper)
                article.props.author = tag.cssselect("span.author")[0].text.strip()
            elif tag.cssselect("div.videoContainer") or 'promo' in tag.get('class'):
                continue
            elif tag.cssselect("div.tagline h4"):
                self.stories.add(urljoin(url, tag.cssselect("h4 a")[0].get('href')))
                continue
            else:
                h = tag.cssselect("div.body h3")[0]
                article.props.type = "article"
                article.props.headline = h.text_content().strip()
                if h.cssselect("a"):
                    article.props.url = urljoin(url, h.cssselect("a")[0].get('href'))
                else:
                    article.props.url = url
            yield article

    def _scrape_unit(self, article):
        if article.props.type == "article":
            article.prepare(self)
            for div in article.doc.cssselect("div.rtldart"):
                div.drop_tree()
            article.props.text = article.doc.cssselect("article.news div.body div.paragraph")
            print(article)
            yield article


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(RTLScraper)
            tweet.props.date = datetime.fromtimestamp(
                float(div.cssselect("span.js-short-timestamp")[0].get("data-time")))
            tweet.props.text = div.cssselect("p.ProfileTweet-text")[0]
            tweet.props.is_retweet = bool(div.get("data-retweeter"))
            if tweet.props.is_retweet:
                tweet.props.original_author = div.get("data-name")
            tweet.props.url = url
            if "maxid" in locals() and div.get("data-tweet-id") == maxid:
                # same max_id seen twice: we would loop forever
                done = True
                break
            maxid = div.get("data-tweet-id")
            if tweet.props.date.date() < self.options["date"]:
                done = True
                break
            elif tweet.props.date.date() == self.options["date"]:
                yield tweet
        if not done:
            nexturl = url + "&max_id={}".format(maxid)
            data = json.loads(self.open(str(nexturl)).read())


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(TwitterPoliticiScraper)
        return False  # not enough articles
    else:
        return True


from amcat.tools.amcatlogging import AmcatFormatter
import sys


def setup_logging():
    loggers = (logging.getLogger("amcat"), logging.getLogger("scrapers"),
               logging.getLogger(__name__))
    d = datetime.date.today()
    filename = "/home/amcat/log/daily_{d.year:04d}-{d.month:02d}-{d.day:02d}.txt".format(**locals())
    sys.stderr = open(filename, 'a')
    handlers = (logging.StreamHandler(sys.stdout), logging.FileHandler(filename))
    formatter = AmcatFormatter(date=True)
    for handler in handlers:
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
    for logger in loggers:
        logger.propagate = False
        logger.setLevel(logging.INFO)
        for handler in handlers:
            logger.addHandler(handler)
    logging.getLogger().handlers = []


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    setup_logging()
    cli.run_cli(DailyScript)
        return zip(metas, bodies)

    def parse_document(self, tupleText):
        meta, body = tupleText
        meta = meta.strip()
        meta = meta.split('\n')
        kargs = {
            'externalid': int(meta[0].split('.')[0].lstrip('?')),
            'headline': meta[0].partition('. ')[2]
        }
        medium_name, date, pagenr, length = meta[2].split(', ')
        kargs['medium'] = Medium.get_or_create(medium_name)
        kargs['date'] = readDate(date)
        kargs['pagenr'] = int(pagenr.strip('p.'))
        kargs['length'] = int(length.strip('w.'))
        body = body.split('\n')
        kargs['section'] = body[2]
        kargs['text'] = '\n'.join(body[5:])
        kargs['project'] = self.options['project']
        return Article(**kargs)


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    cli.run_cli(Mediargus, handle_output=False)
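
# A hypothetical sketch of the metadata block parse_document expects,
# inferred from the indexing above (line 0 holds "<id>. <headline>",
# line 2 holds "<medium>, <date>, p.<page>, <words> w."):
#
#   ?123. Some headline
#   <line 1 unused>
#   Some Newspaper, 01-02-2012, p.4, 321 w.
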
            yield comment
        yield page

    def scrape_comments(self, page):
        p = page.props.url + "?page={}"
        if not page.doc.cssselect("ul.pager"):
            return
        total = int(page.doc.cssselect("ul.pager li.pager-last a")[0]
                    .get('href').split("page=")[-1].split("&")[0]) + 1
        docs = [self.getdoc(p.format(x)) for x in range(total)]
        for doc in docs:
            for div in doc.cssselect("#comments div.comment"):
                comment = HTMLDocument()
                comment.props.text = div.cssselect("div.content")[0]
                comment.props.author = div.cssselect("span.submitted-username")[0].text_content()
                comment.props.date = readDate(div.cssselect("div.submitted div.floatr")[0].text_content())
                comment.parent = page
                yield comment


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(WebTelegraafArchiveScraper)
kargs["medium"] = medium if self.parent_field: doc_id = kargs.get(self.id_field) parent_id = kargs.pop(self.parent_field) if parent_id: self.parents[doc_id] = parent_id article = Article(**kargs) if self.parent_field: self.articles[doc_id] = article return article def postprocess(self, articles): if self.parent_field: for doc_id, parent_id in self.parents.iteritems(): doc = self.articles[doc_id] doc.parent = self.articles[parent_id] if not doc.addressee and self.options['addressee_from_parent']: doc.addressee = doc.parent.author doc.save() super(CSV, self).postprocess(articles) if __name__ == '__main__': from amcat.scripts.tools import cli cli.run_cli(CSV)
            return
        for i, table in enumerate(article.doc.cssselect("table")):
            if table.get('class') == "body":
                table_after_body = article.doc.cssselect("table")[i + 1]
        page_date = re.search(
            r"Pagina ([0-9]+), ([0-9]{2}\-[0-9]{2}\-[0-9]{4})",
            table_after_body.text_content())
        article.props.pagenr = page_date.group(1)
        article.props.date = readDate(page_date.group(2))
        article.props.section = self.current_section
        article.props.headline = article.doc.cssselect("td.artheader")[0].text_content().strip()
        if article.doc.cssselect(".artsubheader"):
            article.props.byline = article.doc.cssselect(".artsubheader")[0]
        if article.doc.cssselect("td.artauthor"):
            article.props.author = article.doc.cssselect("td.artauthor")[0].text.split(":")[1].strip()
        dateline_match = re.search(
            r"^([A-Z][a-z]+(( |/)[A-Z][a-z]+)?)\n",
            "\n".join([n.text_content() for n in article.props.text]).strip())
        if dateline_match:
            article.props.dateline = dateline_match.group(1)
        yield article


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(FDScraper)
            break
        for a in html.cssselect("a"):
            pagenum = int(a.get("href").split("/")[1])
            sections[pagenum] = a.text
        return sections

    def _scrape_unit(self, url):
        article = HTMLDocument(url=url, section=self.section)
        article.prepare(self)
        article.props.date = date(*[int(n) for n in url.split("/")[5:8]])
        article.props.pagenr = self.pagenum
        article.props.headline = article.doc.cssselect("#article h1")[0].text_content()
        article.props.text = article.doc.cssselect("div.body")[0]
        dateline_pattern = re.compile(r"^([A-Z]+( [A-Z]+)?)$")
        b = article.props.text.cssselect("b")
        if b and dateline_pattern.search(b[0].text_content()):
            article.props.dateline = dateline_pattern.search(b[0].text_content()).group(1)
        if article.doc.cssselect("#article address"):
            article.props.author = article.doc.cssselect("#article address")[0].text_content().lstrip("dor").strip()
        yield article


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping")
    cli.run_cli(SpitsKrantScraper)
    def get_article(self, page):
        page.props.author = page.doc.cssselect("#artikel-footer .author-date")[0].text.split("|")[0].strip()
        page.props.headline = page.doc.cssselect("div.acarhead h1")[0].text
        page.props.text = [
            page.doc.cssselect("div.artikel-intro")[0],
            page.doc.cssselect("div.artikel-main")[0]
        ]
        page.props.section = page.props.url.split("/")[4]
        return page

    def get_comments(self, page):
        for div in page.doc.cssselect("#comments div.comment"):
            comment = HTMLDocument(parent=page)
            comment.props.section = page.props.section
            comment.props.url = page.props.url
            comment.props.text = div.cssselect("p")[0]
            footer = div.cssselect("p.footer")[0].text_content().split(" | ")
            comment.props.author = footer[0].strip()
            comment.props.date = readDate(footer[1].strip())
            yield comment


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(PownewsScraper)
        article = HTMLDocument(
            url=urljoin(index_url, a.get('href')),
            section=category,
            headline=a.text_content().strip(),
        )
        yield article

    def _scrape_unit(self, article):
        article.prepare(self)
        article.props.date = readDate(article.doc.cssselect("#artikelbox div.dateandmore")[0].text_content())
        if article.props.date.date() != self.options['date']:
            print('Faulty date')
            return
        article.doc.cssselect("#story div")[-1].drop_tree()
        article.props.text = article.doc.cssselect("#story")
        firstline = article.props.text[0].text_content().strip().split("\n")[0]
        if len(firstline.split()) <= 8 and "Von " in firstline:  # at most 8 words
            article.props.author = firstline.split("Von ")[1]
        kurztext = article.doc.cssselect("#kurztext")
        article.props.byline = kurztext[0].text_content().strip() if kurztext else None
        yield article


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(NoenScraper)
    Runs on all daily scraper articlesets
    """


if __name__ == '__main__':
    from sys import argv
    from getopt import getopt
    # skip argv[0] (the script name), otherwise getopt stops at the first
    # non-option argument and never sees -s
    opts, args = getopt(argv[1:], "s")
    for opt, arg in opts:
        if opt == '-s':
            dedu = DeduplicateScript()
            dedu.run_scrapers()
    amcatlogging.info_module("amcat.scripts.maintenance.deduplicate")
    from amcat.scripts.tools import cli
    cli.run_cli(DeduplicateScript)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest


class TestDeduplicateScript(amcattest.PolicyTestCase):
    def test_deduplicate(self):
        """One article should be deleted from artset and added to project 2"""
        p = amcattest.create_test_project()
        art1 = amcattest.create_test_article(url='blaat1', project=p)
        art2 = amcattest.create_test_article(url='blaat2', project=p)
        art3 = amcattest.create_test_article(url='blaat1', project=p)
        metadata, text = parse_page(paragraphs)
        metadata["medium"] = Medium.get_or_create(metadata["medium"])
        return Article(text=text, **metadata)

    def split_file(self, file):
        original_rtf, fixed_rtf = file.bytes, fix_rtf(file.bytes)
        doc = parse_html(to_html(original_rtf, fixed_rtf))
        for i, page in enumerate(get_pages(doc)):
            yield doc, page


"""if __name__ == '__main__':
    original_rtf = open(sys.argv[1], 'rb').read()
    fixed_rtf = fix_rtf(original_rtf)
    html = to_html(original_rtf, fixed_rtf)
    #html = open("blaat.html").read()
    doc = parse_html(html)
    pages = list(get_pages(doc))
    for page in pages:
        metadata, text = parse_page((doc, page))
        print(text)
        print("-----")"""

if __name__ == '__main__':
    from amcat.scripts.tools.cli import run_cli
    run_cli(handle_output=False)
class DerStandardScraper(HTTPScraper, DatedScraper):
    medium_name = "derstandard.at"
    index_url = "http://derstandard.at/Archiv/{self.options[date].year}/{self.options[date].month}/{self.options[date].day}"

    def _get_units(self):
        index_url = self.index_url.format(**locals())
        doc = self.getdoc(index_url)
        for li in doc.cssselect("#content ul.chronologie li"):
            article = HTMLDocument(
                date=readDate(li.cssselect("div.date")[0].text_content()),
                headline=li.cssselect("h3")[0].text_content().strip(),
                url=urljoin(index_url, li.cssselect("h3 a")[0].get('href'))
            )
            kicker = li.cssselect("div.text h6 a")
            article.props.kicker = kicker[0].text if kicker else None
            yield article

    def _scrape_unit(self, article):
        article.prepare(self)
        article.props.section = " > ".join(
            [span.text_content() for span in article.doc.cssselect("#breadcrumb span.item")[1:]])
        article.props.text = article.doc.cssselect("#artikelBody div.copytext")[0]
        yield article


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(DerStandardScraper)
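
# Note on the index_url template above: str.format resolves attribute and
# index lookups inside the braces, so with a hypothetical
# self.options['date'] = date(2013, 4, 1), index_url.format(**locals())
# renders "http://derstandard.at/Archiv/2013/4/1".
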
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

try:
    from scrapers.newssites import nrc_weblogs
except ImportError:
    try:
        from scraping.newssites import nrc_weblogs
    except ImportError:
        from amcatscraping.newssites import nrc_weblogs


class ColumnNRCScraper(nrc_weblogs.WeblogNRCScraper):
    medium_name = "NRC website - blogs"
    t = "columns"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(ColumnNRCScraper)
            break

    def _scrape_unit(self, article):
        article.prepare(self)
        breadcrumbs = article.doc.cssselect("div.BreadCrumbs span:not(.Separator)")[1:-1]
        article.props.section = " > ".join([span.text_content().strip() for span in breadcrumbs])
        if "Bezirk" in article.props.section:
            article.props.bezirk = article.props.section.split(".")[0].strip()
            article.props.section = "Bezirk"
        article.props.text = article.doc.cssselect("div.Article #article_lead")
        bodytext = article.doc.cssselect("#BodyText")[0]
        wrapper = bodytext.cssselect("div.SingleContentWrapper-450px")[0]
        for tag in wrapper.iter():
            # removing html comments and other clutter
            if callable(tag.tag):
                tag.drop_tree()
        lastdiv = wrapper.cssselect(".SingleContentWrapper-450px > div")
        if lastdiv and "zum thema" in lastdiv[0].text_content():
            lastdiv[0].drop_tree()
        article.props.text += wrapper
        yield article


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(ViennaScraper)
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

try:
    from scrapers.newssites.google import GoogleNewsScraper
except ImportError:
    try:
        from scraping.newssites.google import GoogleNewsScraper
    except ImportError:
        from amcatscraping.newssites.google import GoogleNewsScraper


class GoogleAustriaScraper(GoogleNewsScraper):
    url_gtld = "at"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(GoogleAustriaScraper)
        for page in self.get_pages(topic.doc):
            if first:
                comments = page.cssselect("div.post")[1:]
                first = False
            else:
                comments = page.cssselect("div.post")
            for div in comments:
                comment = HTMLDocument()
                comment.parent = topic
                comment.props.author = div.cssselect("div.postholder_top a.username")[0]
                comment.props.date = readDate(div.cssselect("div.postholder_top span.post_time")[0].text_content())
                comment.props.text = div.cssselect("div.postholder_bot div.contents")[0]
                yield comment

    def get_pages(self, doc):
        yield doc
        for a in doc.cssselect("nav div.pagesholder a"):
            url = urljoin("http://forum.fok.nl", a.get('href'))
            yield self.getdoc(url)


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(FokForumScraper)
                    except IndexError:
                        pass
                    else:
                        if comment.props.date.date() == self.options['date']:
                            yield comment
        else:
            for li in nxt.cssselect("ol.reacties li.hidenum"):
                comment = HTMLDocument(parent=page)
                if "<b>Reageer als eerste op dit bericht</b>" not in etree.tostring(li):
                    try:
                        comment.props.text = li.cssselect("div.reactie-body")[0]
                        comment.props.author = li.cssselect("strong")[0].text
                        comment.props.date = readDate(
                            li.cssselect("span.tijdsverschil")[0].get('publicationdate'))
                        if comment.props.date.date() == self.options['date']:
                            yield comment
                    except IndexError:
                        pass


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(NuJijScraper)
class MoveArticlesForm(forms.Form):
    from_set = forms.ModelChoiceField(queryset=ArticleSet.objects.all())
    to_set = forms.ModelChoiceField(queryset=ArticleSet.objects.all())


class MoveArticles(Script):
    options_form = MoveArticlesForm

    def run(self, _input):
        fr = self.options['from_set']
        to = self.options['to_set']
        log.debug("getting articles...")
        articles = list(Article.objects.filter(articlesetarticle__articleset=fr.id))
        n = len(articles)
        log.debug("...done. {n} articles found".format(**locals()))
        log.debug("adding articles to new set...")
        to.add_articles(articles)
        to.save()
        log.info("moved {n} articles from {fr} to {to}".format(**locals()))


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(MoveArticles)
            yield urljoin(INDEX_URL, href)

    def _get_units(self):
        for url in self.get_categories():
            doc = self.getdoc(url)
            for item in doc.cssselect("item"):
                date = toolkit.readDate(item.cssselect("pubdate")[0].text)
                if date.date() != self.options['date']:
                    continue
                link = item.cssselect("link")[0]
                doc = HTMLDocument(
                    url=urljoin(INDEX_URL, html.tostring(link).lstrip("<link>")),
                    date=date,
                    headline=item.cssselect("title")[0].text
                )
                yield doc

    def _scrape_unit(self, doc):
        doc.prepare(self)
        doc.props.text = doc.doc.cssselect("div.article-body")
        doc.props.html = html.tostring(doc.doc)
        yield doc


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping")
    cli.run_cli(MetroScraper)
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

import pcm


class AlgemeenDagbladScraper(pcm.PCMScraper):
    medium_name = "Algemeen Dagblad"
    domain = "ad.nl"
    paper_id = 8001
    context_id = "AD"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(AlgemeenDagbladScraper)
        return self.opener.getdoc(uri, encoding)

    def open(self, url, encoding=None):
        if isinstance(url, (str, unicode)):
            if isinstance(url, unicode):
                url = url.encode('utf-8')
            log.info('Retrieving "{url}"'.format(**locals()))
            try:
                return self.opener.opener.open(url, encoding)
            except UnicodeEncodeError:
                uri = iri2uri(url)
                return self.opener.opener.open(uri, encoding)
        else:
            req = url
            log.info('Retrieving "{url}"'.format(url=req.get_full_url()))
            return self.opener.opener.open(req, encoding)


def _set_default(obj, attr, val):
    try:
        if getattr(obj, attr, None) is not None:
            return
    except ObjectDoesNotExist:
        pass  # django throws DNE on x.y if y is not set and not nullable
    setattr(obj, attr, val)


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    cli.run_cli(Scraper)
    def get_comment_pages(self, page):
        if not page.doc.cssselect("#reaction"):
            return
        n_id, c_id = page.props.url.split("/")[4::4]  # elements 5 and 9
        doc = self.getdoc(self.comment_url.format(page=0, cid=c_id, nid=n_id))
        try:
            total = int(doc.cssselect("div.pagenav")[0].text.split(" van ")[1])
        except IndexError:
            yield doc
            return
        except AttributeError:
            return
        for x in range(total - 1):
            for a in doc.cssselect("div.pagenav a"):
                if "volgende" in a.text:
                    onclick = a.get('onclick')
                    start = onclick.find("getReactions(") + 13
                    end = onclick.find(")", start)
                    href = [arg.strip("\"';() ") for arg in onclick[start:end].split(",")][0]
                    yield self.getdoc(urljoin(doc.base_url, href))


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(WebADScraper)
        if text in MEDIUM_ALIASES.keys():
            return Medium.get_or_create(MEDIUM_ALIASES[text])
        else:
            return Medium.get_or_create(text)

    def get_pagenum(self, text):
        p = re.compile(r"pagina ([0-9]+)([,\-][0-9]+)?([a-zA-Z0-9 ]+)?")
        m = p.search(text.strip())
        pagenum, otherpage, section = m.groups()
        if section:
            section = section.strip()
        return int(pagenum), section


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(BZK)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest


class TestBZK(amcattest.AmCATTestCase):
    def setUp(self):
        from django.core.files import File
        import os.path, json
        self.dir = os.path.join(os.path.dirname(__file__), 'test_files', 'bzk')
        self.bzk = BZK(project=amcattest.create_test_project().id,
                       file=File(open(os.path.join(self.dir, 'test.html'))),
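
# A short sketch of what get_pagenum extracts (hypothetical input):
#
#   >>> parser.get_pagenum("pagina 3-4 binnenland")
#   (3, 'binnenland')
#
# Group 1 is the first page number, group 2 ("-4") an optional page-range
# continuation, group 3 an optional trailing section name.
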
    def _scrape_unit(self, doc):
        art = HTMLDocument()
        try:
            datestring = doc.cssselect("div.dateplace-data")[0].text_content().split("\n")[2]
        except IndexError:
            datestring = doc.cssselect("div.dateplace span")[0].text_content()
        art.props.date = readDate(datestring)
        art.props.headline = doc.cssselect("div.header h1")[0].text_content()
        if doc.cssselect("div.content center"):
            doc.cssselect("div.content center")[0].drop_tree()
        art.props.text = doc.cssselect("div.content")[0]
        try:
            art.props.author = doc.cssselect("span.smallprint")[0].text_content().strip()
        except IndexError as e:
            print(e)
        yield art
        print("\n")


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(TestScraper)

### works very well!
from amcat.scraping.document import HTMLDocument
from amcat.scraping.scraper import HTTPScraper, DatedScraper
from amcat.tools.toolkit import readDate


class GMXScraper(HTTPScraper, DatedScraper):
    medium_name = "gmx.at"
    index_url = "http://www.gmx.at/themen/all/{d.year}/{d.month}/{d.day}/"

    def _get_units(self):
        d = self.options['date']
        index = self.getdoc(self.index_url.format(**locals()))
        for div in index.cssselect("#main div.unit"):
            yield HTMLDocument(url=div.cssselect("a")[0].get('href'))

    def _scrape_unit(self, article):
        article.prepare(self)
        article.props.date = readDate(article.doc.cssselect("#datetime")[0].text_content())
        article.props.section = " > ".join(article.props.url.split("/")[4:-1])
        article.props.headline = article.doc.cssselect("#headline")[0].text_content().strip()
        article.props.text = article.doc.cssselect("#teaser") + article.doc.cssselect("#main > p")
        yield article


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(GMXScraper)
    Hashes for two articles are equal if and only if, for each field that is
    not in ignore_fields, the values of those fields are equal in both
    articles.

    @param articleset The articleset that is to be searched
    @param ignore_fields A set of fields that should not be included in the
        calculated hashes

    @return An iterable of (<article_id>, <hash>) tuples.
    """
    all_fields = STATIC_FIELDS + list(articleset.get_used_properties())
    if not ignore_fields:
        fields = ["hash"]
    else:
        fields = sorted(f for f in all_fields if f not in ignore_fields)

    query = {"query": {"constant_score": {"filter": {"term": {"sets": articleset.id}}}}}
    for x in amcates.ES().scan(query=query, _source=fields):
        if not ignore_fields:
            yield int(x['_id']), x['_source']['hash']
            continue
        art_tuple = tuple(str(x['_source'].get(k, [None])) for k in fields)
        hash = hash_class(repr(art_tuple).encode()).hexdigest()
        yield int(x['_id']), hash


if __name__ == '__main__':
    from amcat.scripts.tools.cli import run_cli
    run_cli()
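
# A minimal sketch of the hashing contract above (hash_class is assumed to
# be something like hashlib.sha1): two articles that agree on every
# non-ignored field map to the same digest, so duplicates can be found by
# grouping on the hash:
#
#   >>> import hashlib
#   >>> fields = ("date", "headline")
#   >>> a = {"headline": "foo", "date": "2013-01-01", "medium": "x"}
#   >>> b = {"headline": "foo", "date": "2013-01-01", "medium": "y"}
#   >>> h = lambda art: hashlib.sha1(
#   ...     repr(tuple(str(art.get(k)) for k in fields)).encode()).hexdigest()
#   >>> h(a) == h(b)  # "medium" is ignored, so these count as duplicates
#   True
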
        byline = page.doc.cssselect("span.postedbyline")[0].text_content()
        page.props.author = byline[byline.find("Geschreven door") + 16:byline.find(" op ")]
        page.props.date = readDate(
            page.doc.cssselect("span.postedbyline")[0].text_content().split(" op ")[1])
        for comment in self.get_comments(page):
            comment.is_comment = True
            yield comment
        yield page

    def get_comments(self, page):
        for div in page.doc.cssselect("div.reactieHolder"):
            comment = HTMLDocument()
            comment.props.author = div.cssselect("span.left a")[0].text
            comment.props.date = readDate(div.cssselect("a.timelink")[0].text)
            comment.props.text = div.cssselect("div.reactieBody")[0]
            comment.props.parent = page
            comment.props.section = page.props.section
            comment.props.url = page.props.url
            yield comment


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(FokScraper)
        info = page.doc.cssselect("div.nieuws_box p")
        for p in info:
            if "Plaatsingsdatum" in p.cssselect("b")[0].text:
                page.props.date = readDate(p.text_content().split(":")[1])
                break
        for comment in self.scrape_comments(page):
            comment.is_comment = True
            yield comment
        yield page

    def scrape_comments(self, page):
        for li in page.doc.cssselect("ul.uiList li.fbFeedbackPost"):
            comment = HTMLDocument(parent=page, url=page.url)
            comment.props.text = li.cssselect("div.postText")[0].text
            comment.props.author = li.cssselect("a.profileName")[0].text
            comment.props.date = readDate(li.cssselect("abbr.timestamp")[0].get('title'))
            yield comment


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(Zorgportaal_nlNieuwsScraper)
        text = text.replace("-\n", "")
        text = text.replace(" ", " ")
        text = text.replace("\n", " ")
        article.text = text
        date_pattern = re.compile(r"([0-9]{2})\-([0-9]{2})\-([0-9]{4})")
        result = date_pattern.search(lines[1])
        article.date = date(int(result.group(3)), int(result.group(2)), int(result.group(1)))
        pagenum_pattern = re.compile(r"\(p.([0-9]+)([0-9\-]+)?\)")
        result = pagenum_pattern.search(lines[1])
        if result:
            article.pagenr = int(result.group(1))
        for h, medium in self.index:
            if article.headline.lower().strip() in h.lower().strip():
                article.set_property("medium", self.get_medium(medium))
        return article

    def get_medium(self, medium):
        if not medium or len(medium) < 1:
            medium = "unknown"
        return MEDIUM_ALIASES.get(medium, medium)


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(BZKPDFScraper)
        return offset

    def getresponse(self, offset):
        _json = self.open(self.solr_url.format(**locals())).read()
        return json.loads(_json)["response"]

    article_url = "http://www.salzburg.com/nachrichten/id=112&tx_ttnews%5Btt_news%5D={urlid}&cHash=abc"

    def _scrape_unit(self, data):
        urlid = data['uri'].split('-')[-1]
        article_url = self.article_url.format(**locals())
        yield HTMLDocument(
            date=readDate(data['date']),
            section=", ".join(data.get('ressort') or []),
            headline=data['title'],
            url=article_url,
            externalid=data['id'],
            text=data['text'],
            author=data['author'],
            tags=data.get('tag', False),
            teaser=data.get('teaser', False),
            all_data=data)


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(SalzburgScraper)
            if stop:
                break
            else:
                p = re.compile(r"[\\]udc[\w\w]")
                artpage.props.text = literal_eval(p.sub("", repr(body)))
                artpage.props.edition = page['edition']
                artpage.props.byline = byline
                artpage.props.section = page['section']
                if re.match("[A-Z][0-9]+", page['page_str']):
                    artpage.props.section += " - section " + page['page_str'][0]
                    artpage.props.pagenr = int(page['page_str'][1:])
                else:
                    artpage.props.pagenr = int(page['page_str'])
                dateline_pattern = re.compile(
                    r"(^[^\n]+\n\n([A-Z]+( [A-Z]+)?) -\n)|(([A-Z]+( [A-Z]+)?)\n\n)")
                match = dateline_pattern.search(artpage.props.text)
                if match:
                    # dateline and theme have the same syntax and are therefore indistinguishable
                    artpage.props.dateline_or_theme = match.group(2) or match.group(5)
                artpage.props.url = page['url']
                yield artpage


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(TubantiaScraper)
            comment.parent = page
            yield comment
        yield page

    def get_article(self, page):
        page.props.author = page.doc.cssselect("article footer")[0].text_content().split("|")[0].strip()
        page.props.headline = page.doc.cssselect("article h1")[0].text.strip()
        if page.props.headline.startswith('#'):
            page.props.headline = page.props.headline[1:].strip()
        datestring = page.doc.cssselect("footer time")[0].text_content()
        page.props.date = datetime.datetime.strptime(datestring, '%d-%m-%y | %H:%M')
        page.doc.cssselect("footer")[0].drop_tree()
        page.props.text = page.doc.cssselect("article")[0]
        page.coords = ""
        return page

    def get_comments(self, page):
        for article in page.doc.cssselect("#comments article"):
            comment = HTMLDocument(parent=page)
            footer = article.cssselect("footer")[0].text_content().split(" | ")
            comment.props.date = readDate(footer[1])
            comment.props.author = footer[0]
            comment.props.text = article.cssselect("p")
            yield comment


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(GeenstijlScraper)
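
# The strptime format above implies footer timestamps like the hypothetical
# "14-05-13 | 09:41", which parses to datetime.datetime(2013, 5, 14, 9, 41).
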
        paragraphs = [p for p in paragraphs if p]
        article.text = ""
        for p in paragraphs:
            article.text += p + "\n\n"
            if p.startswith("(") and len(p.split(",")) > 1:
                # last line of normal content
                break

        # Add non-ascii characters:
        # takes the '=AB' occurrences and turns them into latin-1 characters.
        def character(match):
            code = match.group()[1:]
            char = r"\x{}".format(code).decode('string-escape').decode('latin-1')
            if code == "92":
                return "'"
            elif code == "85":
                return "..."
            return char

        article.text = re.sub("=[A-Z0-9]{2}", character, article.text)
        yield article


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(BZKEML)
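
# A short illustration of the quoted-printable-style escapes handled above
# (each "=XX" hex pair is a latin-1 code point; inputs are hypothetical):
#
#   "caf=E9" -> "café"    (0xE9 is latin-1 é)
#   "=92"    -> "'"       (special-cased above)
#   "=85"    -> "..."     (special-cased above)
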
logging.info("{n} duplicate articles found, run without dry_run to remove".format(**locals())) else: logging.info("Removing {n} articles from set".format(**locals())) articleset.remove_articles(to_remove) if save_duplicates_to: dupes_article_set = ArticleSet.create_set(articleset.project, save_duplicates_to, to_remove) return n, dry_run def get_hashes(self): fields = [f for f in FIELDS if not self.options.get("skip_{}".format(f))] if fields == FIELDS: fields = ["hash"] setid = self.options['articleset'].id for x in amcates.ES().scan(query={"query" : {"constant_score" : {"filter": {"term": {"sets": setid}}}}}, fields=fields): if fields == ["hash"]: hash = x['fields']['hash'][0] else: def get(flds, f): val = flds.get(f) return val[0] if val is not None else val d = {f: get(x['fields'], f) for f in fields} hash = hash_class(json.dumps(d)).hexdigest() yield int(x['_id']), hash if __name__ == '__main__': from amcat.scripts.tools.cli import run_cli run_cli()
#                                                                         #
# This file is part of AmCAT - The Amsterdam Content Analysis Toolkit     #
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

import nrc


class NRCHandelsbladScraper(nrc.NRCScraper):
    medium_name = "NRC Handelsblad"
    nrc_version = "NH"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(NRCHandelsbladScraper)
def parent_table(codebook):
    result = table3.ObjectTable(rows=codebook.codebookcodes)
    result.addColumn(lambda row: row.code.uuid, label="uuid")
    result.addColumn(lambda row: row.code.id, label="code_id")
    result.addColumn(lambda row: row.code, label="code")
    result.addColumn(lambda row: row.parent, label="parent")
    self.add_label_columns(result)
    return result


def _get_tree(codebook):
    parents = {cc.code: cc.parent for cc in codebook.codebookcodes}
    for root in (code for (code, parent) in parents.iteritems() if parent is None):
        for row in _get_tree_rows(parents, 0, root):
            yield row


def _get_tree_rows(parents, indent, parent):
    yield TreeRow(indent, parent)
    for child in (c for (c, p) in parents.iteritems() if p == parent):
        for row in _get_tree_rows(parents, indent + 1, child):
            yield row


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    import sys
    #cli.run_cli().to_csv(stream=sys.stdout)
    print cli.run_cli().to_csv()
    #print result.output()
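
# A small sketch of what the depth-first walk in _get_tree yields for a
# hypothetical three-code codebook (codes shown by label; TreeRow is
# assumed to pair an indent level with a code):
#
#   parents = {"animals": None, "cats": "animals", "dogs": "animals"}
#
# yields TreeRow(0, "animals"), then TreeRow(1, "cats") and
# TreeRow(1, "dogs") in dict-iteration order.
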
export_sql = ("SELECT {self.dest_project.id} AS projectid, {fields} FROM articles a" " WHERE article_id IN ({article_ids})").format(**locals()) export_sql = "COPY ({export_sql}) TO STDOUT WITH BINARY".format(**locals()) import_sql = "COPY articles (project_id, {fields}) FROM STDIN WITH BINARY".format(**locals()) dest_host = "-h {self.dest_host}".format(**locals()) if self.dest_host else "" source_host = "-h {self.source_host}".format(**locals()) if self.source_host else "" cmd = ('psql {source_host} {self.source_db} -c "{export_sql}" ' '| psql {dest_host} {self.dest_db} -c "{import_sql}"').format(**locals()) log.debug("Copying {n} articles...".format(n=len(aids))) #log.debug(cmd) subprocess.check_output(cmd, shell=True) log.debug("... Done!") def _add_to_set(self, uuids): log.debug("Adding {n} articles to set using uuids...".format(n=len(uuids))) aids = [aid for (aid,) in Article.objects.filter(uuid__in=uuids).values_list("id")] if len(aids) != len(uuids): raise Exception("|aids| != |uuids|, something went wrong importing...") self.dest_set.add_articles(aids) log.debug("... Done!") if __name__ == '__main__': from amcat.scripts.tools import cli from amcat.tools import amcatlogging amcatlogging.debug_module() cli.run_cli()
page_txt = "" for line in parser.get_textlines(page): page_txt += line.get_text() + "\n" res += page_txt + "\n\n" article = Article(text=res) article.headline = self.getheadline(_file) article.medium = self.options['medium'] article.section = self.options['section'] if self.options['date']: article.date = self.options['date'] else: article.date = date.today() yield article def getheadline(self, _file): hl = _file.name if hl.endswith(".pdf"): hl = hl[:-len(".pdf")] windows = hl.split("\\") other = hl.split("/") if len(windows) > len(other): #probably a windows path hl = windows[-1] else: hl = other[-1] return hl if __name__ == "__main__": from amcat.scripts.tools import cli cli.run_cli(RawPDFScraper)
            ))
            if not day_url.startswith(INDEX_URL):
                continue
            doc = self.getdoc(day_url)
            for article in doc.cssselect("div.lbox500 h2 a"):
                url = urljoin(day_url, article.get("href"))
                if '/video/' in url:
                    continue
                yield HTMLDocument(
                    url=url,
                    headline=article.text,
                    date=self.options['date']
                )

    def _scrape_unit(self, doc):
        doc.prepare(self)
        if doc.doc.cssselect("div.lbox440"):
            doc.props.text = doc.doc.cssselect("div.lbox440")[0].cssselect('p')
        else:
            doc.props.text = ""
        yield doc


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(DePersScraper)
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

import pcm


class TrouwScraper(pcm.PCMScraper):
    medium_name = "Trouw"
    domain = "trouw.nl"
    paper_id = 8004
    context_id = "NL"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(TrouwScraper)
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

try:
    from scrapers.newspapers import tubantia
except ImportError:
    try:
        from scraping.newspapers import tubantia
    except ImportError:
        from amcatscraping.newspapers import tubantia


class GelderlanderScraper(tubantia.TubantiaScraper):
    medium_name = "De Gelderlander"
    paper = "dg"


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(GelderlanderScraper)
            text = _convert_multiple(file, convertors)
        else:
            text = file.text
        return Article(text=text, **metadata)

    def explain_error(self, error):
        """Explain the error in the context of unit for the end user"""
        name = getattr(error.unit, "name", error.unit)
        return "Error in file {name} : {error.error!r}".format(**locals())


if __name__ == '__main__':
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scripts.article_upload.upload")
    #amcatlogging.debug_module("amcat.scraping.scraper")
    from amcat.scripts.tools.cli import run_cli
    run_cli(handle_output=False)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest
from amcat.tools import amcatlogging
amcatlogging.debug_module("amcat.scripts.article_upload.upload")


class TestUploadText(amcattest.AmCATTestCase):
    def test_article(self):
        from django.core.files import File
        base = dict(project=amcattest.create_test_project().id,
                    articleset=amcattest.create_test_set().id,
                    medium=amcattest.create_test_medium().id)
    structure = forms.ChoiceField(choices=[(s, s.title()) for s in STRUCTURE])

    def _run(self, codebook, structure, **kargs):
        codebook.cache_labels()
        result = STRUCTURE[structure](codebook)
        return result


def _get_tree(codebook):
    parents = {cc.code: cc.parent for cc in codebook.codebookcodes}
    for root in (code for (code, parent) in parents.iteritems() if parent is None):
        for row in _get_tree_rows(parents, 0, root):
            yield row


def _get_tree_rows(parents, indent, parent):
    yield TreeRow(indent, parent)
    for child in (c for (c, p) in parents.iteritems() if p == parent):
        for row in _get_tree_rows(parents, indent + 1, child):
            yield row


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    import sys
    #cli.run_cli().to_csv(stream=sys.stdout)
    print cli.run_cli().to_csv()
    #print result.output()
###########################################################################

"""
Script to refresh the elasticsearch index for an articleset
"""

import logging; log = logging.getLogger(__name__)

from django import forms

from amcat.scripts.script import Script
from amcat.models import ArticleSet

PLUGINTYPE_PARSER = 1


class RefreshIndex(Script):
    class options_form(forms.Form):
        articleset = forms.ModelChoiceField(queryset=ArticleSet.objects.all())
        full_refresh = forms.BooleanField(initial=False, required=False)

    def _run(self, articleset, full_refresh):
        log.info("Refreshing {articleset}, full_refresh={full_refresh}".format(**locals()))
        articleset.refresh_index(full_refresh=full_refresh)


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    result = cli.run_cli()
    #print(result.output())
    options_form = AddProjectForm
    output_type = Project

    def run(self, _input=None):
        p = Project.objects.create(**self.options)
        # Add user to project (as admin)
        pr = ProjectRole(project=p, user=self.options['owner'])
        pr.role = Role.objects.get(projectlevel=True, label='admin')
        pr.save()
        return p


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    cli.run_cli()

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest


class TestAddProject(amcattest.PolicyTestCase):
    def test_add(self):
        u = amcattest.create_test_user()
        p = AddProject(owner=u.id, name='test', description='test', insert_user=u.id).run()
        #self.assertEqual(p.insert_user, current_user())  # current_user() doesn't exist anymore
        self.assertEqual(p.owner, u)
                a = body_to_article(*fields)
                a.project = self.options['project']
                yield a
            except:
                log.error("Error on processing fields: {fields}".format(**locals()))
                raise

from amcat.tools import amcatlogging
amcatlogging.debug_module()

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    cli.run_cli(handle_output=False)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest
import datetime


class TestLexisNexis(amcattest.AmCATTestCase):
    def setUp(self):
        import os.path, json
        self.dir = os.path.join(os.path.dirname(__file__), 'test_files', 'lexisnexis')
            n_props = 0
            for prop in self.article_properties:
                if hasattr(article, prop):
                    n_props += 1
            if article.metastring:
                n_props += len(eval(article.metastring))
            articles_nprops[article] = n_props

        sortedlist = sorted(articles_nprops, key=articles_nprops.get)
        to_print = set(sortedlist[:3] + sortedlist[-3:])
        log.info("Sample articles:")
        for article in to_print:
            for prop in self.article_properties:
                value = hasattr(article, prop) and getattr(article, prop) or None
                value = self.truncate(value)
                log.info("{prop} : {value}".format(**locals()))
            print("\n")

    def truncate(self, value):
        value = unicode(value)
        value = " ".join(value.split("\n"))
        if len(value) > 80:
            value = value[0:79] + "..."
        return value.encode('utf-8')


if __name__ == "__main__":
    from amcat.scripts.tools import cli
    cli.run_cli(ValueArticleScript)
    #         comment.is_comment = True
    #         yield comment

    def scrape_comments(self, article):
        for item in article.doc.cssselect("#commentsTab li.mainComment"):
            mcomment = self.scrape_comment(item, article)
            yield mcomment
            for li in item.cssselect("ul.answers li.commentItem"):
                yield self.scrape_comment(li, mcomment)

    def scrape_comment(self, html, parent):
        c = HTMLDocument(
            text=html.cssselect("div.text-holder"),
            headline=html.cssselect("a.commentTitle")[0].text_content().strip(),
            section=parent.props.section,
            date=readDate(" ".join([t.text for t in html.cssselect("ul.meta li.createdate, li.createtime")])),
            author=html.cssselect("ul.meta li.by")[0].text.strip().lstrip("By").strip(),
            url=parent.props.url + "#{}".format(html.cssselect("a.commentTitle")[0].get('id')))
        c.props._parent = "{p.props.headline}, {p.props.date}".format(p=parent)
        return c


if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.debug_module("amcat.scraping.scraper")
    amcatlogging.debug_module("amcat.scraping.document")
    cli.run_cli(HaaretzScraper)