Example #1
0
 def _setup_logging(self):
     """
     Set up the logging facility.

     Attaches a file handler writing to ``self._get_filename(".log")``,
     then raises the default level via ``amcatlogging.info_module()`` and
     enables DEBUG output for ``amcat.tools.amcatsolr``.
     """
     # Log file path is derived from this run's output filename.
     fn = self._get_filename(".log")
     amcatlogging.setFileHandler(fn)
     # NOTE(review): info_module() with no argument presumably targets the
     # calling module — confirm against amcatlogging's API.
     amcatlogging.info_module()
     amcatlogging.debug_module('amcat.tools.amcatsolr')
Example #2
0
                article.props.author = tag.cssselect("span.author")[0].text.strip()
            elif tag.cssselect("div.videoContainer") or 'promo' in tag.get('class'):
                continue
            elif tag.cssselect("div.tagline h4"):
                self.stories.add(urljoin(url, tag.cssselect("h4 a")[0].get('href')))
                continue
            else:
                h = tag.cssselect("div.body h3")[0]
                article.props.type = "article"
                article.props.headline = h.text_content().strip()
                if h.cssselect("a"):
                    article.props.url = urljoin(url, h.cssselect("a")[0].get('href'))
                else:
                    article.props.url = url
            yield article

    def _scrape_unit(self, article):
        if article.props.type == "article":
            article.prepare(self)
            [div.drop_tree() for div in article.doc.cssselect("div.rtldart")]
            article.props.text = article.doc.cssselect("article.news div.body div.paragraph")
        print(article)
        yield article

if __name__ == '__main__':
    # Command-line entry point: enable INFO logging for the scraping
    # framework and hand the scraper class to the generic CLI runner.
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(RTLScraper)

Example #3
0
                elif date.date() < self.options['date']:
                    return
            pagenr += 1

    def _scrape_unit(self, bits):
        """
        Scrape a single article.

        ``bits`` is a ``(date, url)`` pair produced by the unit indexer.
        Fetches the page, extracts section/headline/author/tags/etc. into
        the article properties, yields the article, and finally clears the
        session cookies so the site does not block further requests.
        """
        date, url = bits
        article = HTMLDocument(date=date, url=url)
        article.prepare(self)
        content = article.doc.cssselect("#content")[0]
        article.props.section = content.cssselect("div.info-block p.meta a.label")[0].text
        article.props.headline = content.cssselect("div.title h1")[0].text
        # External id is the last dash-separated URL segment, minus any
        # leading/trailing "W" or "/" characters.
        article.props.externalid = url.split("-")[-1].strip("W/")
        article.props.text = content.cssselect("div.article")
        article.props.author = content.cssselect("p.meta span.user a.label")[0].text.strip()
        # Set comprehension instead of set([...]) — same result, idiomatic.
        article.props.tags = {a.text for a in content.cssselect("ul.taglist li a")}
        article.props.view_count = int(content.cssselect("div.info-block span.view-count")[0].text)
        yield article
        self.clearcookies()

    def clearcookies(self):
        """Clear cookies so the site won't interrupt us after 3 articles"""
        # NOTE(review): reaches into the cookiejar's private ``_cookies``
        # dict; the public ``cookiejar.clear()`` API looks equivalent —
        # confirm before switching.
        self.opener.cookiejar._cookies = {}

if __name__ == '__main__':
    # Command-line entry point: enable INFO logging for the scraping
    # framework and hand the scraper class to the generic CLI runner.
    from amcat.scripts.tools import cli
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scraping")
    cli.run_cli(BoerderijScraper)


Example #4
0
    def run_scrapers(self):
        """
        Runs on all daily scraper articlesets

        NOTE(review): stub — the body is empty; the docstring describes
        intent only. Calling this is currently a no-op.
        """


if __name__ == '__main__':
    from sys import argv
    from getopt import getopt
    # BUG FIX: argv[0] is the program name; getopt() stops parsing at the
    # first non-option argument, so passing the full argv meant '-s' was
    # never recognized. Skip argv[0].
    opts, args = getopt(argv[1:], "s")
    for opt, arg in opts:
        if opt == '-s':
            # -s: run the deduplication over all daily scraper article sets.
            dedu = DeduplicateScript()
            dedu.run_scrapers()

    # NOTE(review): amcatlogging is assumed to be imported at module top
    # (not visible in this chunk).
    amcatlogging.info_module("amcat.scripts.maintenance.deduplicate")
    from amcat.scripts.tools import cli
    cli.run_cli(DeduplicateScript)

###########################################################################
#                          U N I T   T E S T S                            #
###########################################################################

from amcat.tools import amcattest


class TestDeduplicateScript(amcattest.PolicyTestCase):
    def test_deduplicate(self):
        """One article should be deleted from artset and added to project 2"""
        p = amcattest.create_test_project()
        art1 = amcattest.create_test_article(url='blaat1', project=p)
Example #5
0
    L = len(numbers)
    if L == 0:
        return 0
    elif L == 1:
        return numbers[0]
    else:
        pointer = int(L*(2.0/4.0))
        return numbers[pointer]

def set_scraper_stats():
    """Recompute and persist the statistics field of every scraper.

    Scrapers for which ``scraper_ranges`` raises ValueError are skipped
    and left untouched.
    """
    for scraper in Scraper.objects.all():
        try:
            stats = scraper_ranges(scraper)
        except ValueError:
            # No usable range data for this scraper — skip it.
            continue
        scraper.statistics = stats
        # Format string kept as-is: it pulls ``scraper`` out of locals().
        log.info("{scraper}: {scraper.statistics}".format(**locals()))
        scraper.save()



if __name__ == '__main__':
    # Command-line entry point: enable INFO logging for this maintenance
    # module, then recompute statistics for all scrapers.
    from amcat.tools import amcatlogging
    amcatlogging.info_module("amcat.scripts.maintenance.set_scraper_stats")
    set_scraper_stats()
        

    
        
        
Example #6
0
    'DURATION' : 'D',
    'TIME' : 'D',
    'NUMBER' : '#',
    'ORDINAL' : '#',
    'MISC' : '?',
    'MONEY' : '#',
    'SET' : '#',
    'PERCENT' : '#',
    }



if __name__ == '__main__':
    from amcat.tools import amcatlogging
    amcatlogging.setup()
    amcatlogging.info_module("amcat.contrib.corenlp")

    #from amcat.models import ArticleSet

    nlp = StanfordCoreNLP(corenlp_path="/home/amcat/resources/stanford-corenlp", models_version="2012-07-06")

    import sys
    if len(sys.argv) > 1:
	aids = map(int, sys.argv[1:])
	delete_existing = True
	amcatlogging.debug_module("amcat.contrib.corenlp")
    else:
        aids = [int(aid) for aid in sys.stdin]
	#s = ArticleSet.objects.get(pk=22947)
	#aids = [aid for (aid,) in s.articles.values_list("id")]
	delete_existing = True
Example #7
0
            'run_daily': self.options['run_daily']
        }

        log.info("new scraper with options {}".format(scraper_options))

        Scraper.objects.create(**scraper_options)

        log.info("done")

    def articleset(self):
        """
        Return the article set the scraper should be registered on.

        Uses the explicitly supplied 'articleset' option when present;
        otherwise creates a new set in 'new_set_project', named after
        'new_set_name' (falling back to "<label> scraper").

        Raises ValueError when neither 'articleset' nor 'new_set_project'
        is given.
        """
        if self.options['articleset']:
            return self.options['articleset']
        elif self.options['new_set_project']:
            name = self.options[
                'new_set_name'] or self.options['label'] + " scraper"
            return ArticleSet.objects.create(
                name=name,
                project=self.options['new_set_project'],
                provenance="")
        else:
            # Fixed typo in the error message ("provice" -> "provide").
            raise ValueError(
                "please provide articleset or new_set_project, new_set_name is optional"
            )


if __name__ == '__main__':
    # Command-line entry point: enable INFO logging and run the scraper
    # registration script through the generic CLI wrapper.
    from amcat.tools import amcatlogging
    from amcat.scripts.tools import cli
    amcatlogging.info_module(__name__)
    cli.run_cli(RegisterScraperScript)
Example #8
0
            'run_daily' : self.options['run_daily']}
                
        log.info("new scraper with options {}".format(scraper_options))

        Scraper.objects.create(**scraper_options)

        log.info("done")

    def articleset(self):
        """
        Return the article set the scraper should be registered on.

        Uses the explicitly supplied 'articleset' option when present;
        otherwise creates a new set in 'new_set_project', named after
        'new_set_name' (falling back to "<label> scraper").

        Raises ValueError when neither 'articleset' nor 'new_set_project'
        is given.
        """
        if self.options['articleset']:
            return self.options['articleset']
        elif self.options['new_set_project']:
            name = self.options['new_set_name'] or self.options['label'] + " scraper"
            return ArticleSet.objects.create(name=name,
                                             project=self.options['new_set_project'],
                                             provenance="")
        else:
            # Fixed typo in the error message ("provice" -> "provide").
            raise ValueError("please provide articleset or new_set_project, new_set_name is optional")
            
        

          
            
    
if __name__ == '__main__':
    # Command-line entry point: enable INFO logging and run the scraper
    # registration script through the generic CLI wrapper.
    from amcat.tools import amcatlogging
    from amcat.scripts.tools import cli
    amcatlogging.info_module(__name__)
    cli.run_cli(RegisterScraperScript)

Example #9
0
            return -article.id
        
def deduplicate_scrapers(date):
    """Deduplicate the article set of every daily scraper.

    Covers the week ending at *date* (i.e. from 7 days before *date*
    up to and including *date*).
    """
    base_options = {
        'first_date': date - timedelta(days=7),
        'last_date': date,
    }
    for scraper in Scraper.objects.filter(run_daily='t'):
        base_options['articleset'] = scraper.articleset_id
        DeduplicateScript(**base_options).run(None)

        
if __name__ == '__main__':
    # Command-line entry point: run the deduplication script via the CLI.
    # NOTE(review): amcatlogging is not imported in this block — assumed
    # imported at module top (not visible in this chunk).
    amcatlogging.info_module("amcat.scripts.maintenance.deduplicate")
    from amcat.scripts.tools import cli
    cli.run_cli(DeduplicateScript)
    

###########################################################################  
#                          U N I T   T E S T S                            #  
###########################################################################

from amcat.tools import amcattest    

class TestDeduplicateScript(amcattest.AmCATTestCase):
    def test_deduplicate(self):
        """One article should be deleted from artset and added to project 2"""
        p = amcattest.create_test_project()