    def _setup_logging(self):
        """
        Set up the logging facility. By default, install a file handler
        writing to _get_filename(".log").
        """
        fn = self._get_filename(".log")
        amcatlogging.setFileHandler(fn)
        amcatlogging.info_module()
        amcatlogging.debug_module('amcat.tools.amcatsolr')
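# _setup_logging relies on a _get_filename helper that is not shown in this
# fragment. A minimal sketch of what it might look like; the directory and
# naming scheme here are assumptions, not AmCAT's actual implementation:
import os

class LoggingPathSketch:
    LOG_DIR = "/tmp/scrapers"  # hypothetical base directory

    def _get_filename(self, extension):
        # e.g. "/tmp/scrapers/RTLScraper.log" for extension ".log"
        return os.path.join(self.LOG_DIR, type(self).__name__ + extension)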
                article.props.author = tag.cssselect("span.author")[0].text.strip()
            elif tag.cssselect("div.videoContainer") or 'promo' in tag.get('class'):
                continue
            elif tag.cssselect("div.tagline h4"):
                self.stories.add(urljoin(url, tag.cssselect("h4 a")[0].get('href')))
                continue
            else:
                h = tag.cssselect("div.body h3")[0]
                article.props.type = "article"
                article.props.headline = h.text_content().strip()
                if h.cssselect("a"):
                    article.props.url = urljoin(url, h.cssselect("a")[0].get('href'))
                else:
                    article.props.url = url
            yield article

    def _scrape_unit(self, article):
        if article.props.type == "article":
            article.prepare(self)
            # drop ad/widget divs before extracting the body text
            for div in article.doc.cssselect("div.rtldart"):
                div.drop_tree()
            article.props.text = article.doc.cssselect("article.news div.body div.paragraph")
            yield article

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(RTLScraper)
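# The fragment above follows a two-phase scraper protocol: one pass yields stub
# units, _scrape_unit fills in the details. A hedged, self-contained sketch of
# that shape; the class name and the unit source are hypothetical:
class TwoPhaseScraperSketch:
    def _get_units(self):
        # index pass: yield one unit per article found on the listing page
        for url in ["http://example.com/a", "http://example.com/b"]:
            yield url

    def _scrape_unit(self, unit):
        # detail pass: fetch and parse the unit, yield finished articles
        yield {"url": unit, "text": "..."}

    def run(self):
        for unit in self._get_units():
            for article in self._scrape_unit(unit):
                print(article)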
            elif date.date() < self.options['date']:
                return
            pagenr += 1

    def _scrape_unit(self, bits):
        date, url = bits
        article = HTMLDocument(date=date, url=url)
        article.prepare(self)
        content = article.doc.cssselect("#content")[0]
        article.props.section = content.cssselect("div.info-block p.meta a.label")[0].text
        article.props.headline = content.cssselect("div.title h1")[0].text
        article.props.externalid = url.split("-")[-1].strip("W/")
        article.props.text = content.cssselect("div.article")
        article.props.author = content.cssselect("p.meta span.user a.label")[0].text.strip()
        article.props.tags = {a.text for a in content.cssselect("ul.taglist li a")}
        article.props.view_count = int(content.cssselect("div.info-block span.view-count")[0].text)
        yield article
        self.clearcookies()

    def clearcookies(self):
        """Clear cookies so the site won't interrupt us after 3 articles"""
        self.opener.cookiejar._cookies = {}

if __name__ == '__main__':
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(BoerderijScraper)
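# clearcookies() above empties the jar via the private _cookies dict. The
# standard library exposes a public equivalent: CookieJar.clear() with no
# arguments removes every cookie (http.cookiejar in Python 3, cookielib in
# Python 2). A minimal sketch:
from http.cookiejar import CookieJar

jar = CookieJar()
jar.clear()  # public API; empties the jar just like resetting _cookies to {}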
    def run_scrapers(self):
        """Run deduplication over all daily scraper article sets"""

if __name__ == '__main__':
    from sys import argv
    from getopt import getopt
    # getopt expects the argument list without the program name, hence argv[1:]
    opts, args = getopt(argv[1:], "s")
    for opt, arg in opts:
        if opt == '-s':
            dedu = DeduplicateScript()
            dedu.run_scrapers()
    amcatlogging.info_module("amcat.scripts.maintenance.deduplicate")
    from amcat.scripts.tools import cli
    cli.run_cli(DeduplicateScript)

###########################################################################
#                         U N I T T E S T S                               #
###########################################################################

from amcat.tools import amcattest

class TestDeduplicateScript(amcattest.PolicyTestCase):
    def test_deduplicate(self):
        """One article should be deleted from the article set and added to project 2"""
        p = amcattest.create_test_project()
        art1 = amcattest.create_test_article(url='blaat1', project=p)
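# Self-contained illustration of the -s flag parsing used in the __main__ block
# above: getopt returns (option, value) pairs plus the leftover positional args.
from getopt import getopt

opts, args = getopt(["-s", "leftover"], "s")
assert opts == [("-s", "")]   # -s takes no value, so the value slot is empty
assert args == ["leftover"]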
    L = len(numbers)
    if L == 0:
        return 0
    elif L == 1:
        return numbers[0]
    else:
        # pick the upper-median element
        return numbers[L // 2]

def set_scraper_stats():
    for scraper in Scraper.objects.all():
        try:
            ranges = scraper_ranges(scraper)
        except ValueError:
            continue
        scraper.statistics = ranges
        log.info("{scraper}: {scraper.statistics}".format(**locals()))
        scraper.save()

if __name__ == '__main__':
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scripts.maintenance.set_scraper_stats")
    set_scraper_stats()
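# For a sorted, non-empty list the index arithmetic above selects the upper
# median; Python 3's statistics.median_high returns the same element (whether
# scraper_ranges passes sorted data is an assumption, it is not shown here):
from statistics import median_high

numbers = [1, 2, 3, 4]
assert numbers[len(numbers) // 2] == median_high(numbers) == 3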
    'DURATION': 'D',
    'TIME': 'D',
    'NUMBER': '#',
    'ORDINAL': '#',
    'MISC': '?',
    'MONEY': '#',
    'SET': '#',
    'PERCENT': '#',
}

if __name__ == '__main__':
    from amcat.tools import amcatlogging
    amcatlogging.setup()
    amcatlogging.info_module("amcat.contrib.corenlp")
    #from amcat.models import ArticleSet

    nlp = StanfordCoreNLP(corenlp_path="/home/amcat/resources/stanford-corenlp",
                          models_version="2012-07-06")

    import sys
    if len(sys.argv) > 1:
        aids = map(int, sys.argv[1:])
        delete_existing = True
        amcatlogging.debug_module("amcat.contrib.corenlp")
    else:
        aids = [int(aid) for aid in sys.stdin]
        #s = ArticleSet.objects.get(pk=22947)
        #aids = [aid for (aid,) in s.articles.values_list("id")]
        delete_existing = True
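# Hypothetical helper showing how a NER-label-to-code table like the one above
# might be applied; the dict and function names, and the '?' fallback for
# unknown labels, are assumptions rather than CoreNLP or AmCAT API:
ENTITY_CODES = {'DURATION': 'D', 'TIME': 'D', 'NUMBER': '#', 'MONEY': '#'}

def entity_code(label):
    return ENTITY_CODES.get(label, '?')

assert entity_code('TIME') == 'D'
assert entity_code('UNKNOWN') == '?'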
            'run_daily': self.options['run_daily'],
        }
        log.info("new scraper with options {}".format(scraper_options))
        Scraper.objects.create(**scraper_options)
        log.info("done")

    def articleset(self):
        if self.options['articleset']:
            return self.options['articleset']
        elif self.options['new_set_project']:
            name = self.options['new_set_name'] or self.options['label'] + " scraper"
            return ArticleSet.objects.create(
                name=name,
                project=self.options['new_set_project'],
                provenance="")
        else:
            raise ValueError(
                "please provide articleset or new_set_project; new_set_name is optional")

if __name__ == '__main__':
    from amcat.tools import amcatlogging
    from amcat.scripts.tools import cli
    amcatlogging.info_module(__name__)
    cli.run_cli(RegisterScraperScript)
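# Hedged illustration of the fallback order in articleset() above: an explicit
# 'articleset' option wins; otherwise a set is created in 'new_set_project',
# named after 'new_set_name' or, failing that, the label. The option values
# here are hypothetical:
options = {'articleset': None, 'new_set_project': 42,
           'new_set_name': None, 'label': 'rtl'}
name = options['new_set_name'] or options['label'] + " scraper"
assert name == "rtl scraper"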
        return -article.id

def deduplicate_scrapers(date):
    options = {
        'last_date': date,
        'first_date': date - timedelta(days=7),
    }
    scrapers = Scraper.objects.filter(run_daily='t')
    for s in scrapers:
        options['articleset'] = s.articleset_id
        DeduplicateScript(**options).run(None)

if __name__ == '__main__':
    amcatlogging.info_module("amcat.scripts.maintenance.deduplicate")
    from amcat.scripts.tools import cli
    cli.run_cli(DeduplicateScript)

###########################################################################
#                         U N I T T E S T S                               #
###########################################################################

from amcat.tools import amcattest

class TestDeduplicateScript(amcattest.AmCATTestCase):
    def test_deduplicate(self):
        """One article should be deleted from the article set and added to project 2"""
        p = amcattest.create_test_project()
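# Hedged usage sketch: deduplicate_scrapers above checks the week ending at the
# given date across all daily scrapers, so a nightly maintenance job could call:
from datetime import date

deduplicate_scrapers(date.today())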