Example #1
    def handle(self, *args, **options):

        if len(args) != 1 or args[0] == u"help":
            self.stdout.write(u"Usage: {0}\n".format(self.args))
            self.stdout.write(self.help)
        else:
            settings = get_project_settings()
            settings.overrides["URLS"] = args[0]
            crawler = Crawler(settings)
            spider = GeneralSpider()
            crawler.configure()
            crawler.crawl(spider)
            crawler.start()
            log.start_from_crawler(crawler)

            # stop the reactor once the spider has finished
            crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

            try:
                log.msg("Running reactor...")
                reactor.run()
            except KeyboardInterrupt:
                stop_reactor()
            finally:
                log.msg("Reactor stopped")
                log.msg("#" * 40)
Example #2
def run_spider(spider):
    """Set up error/logging signals and run the spider."""
    import sys
    import warnings

    # set up signals to report spider errors and downloaded responses
    from scrapy import log
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_exception(sender, failure, response, spider):
        print "Response: %s [%s]" % (response.body, response.meta)
        sys.stdout.flush()

    dispatcher.connect(catch_exception, signal=signals.spider_error)

    def catch_resp_dld(sender, response, request, spider):
        print "Downloaded (%s) Response %s" % (response.status, response.url)
        sys.stdout.flush()

    dispatcher.connect(catch_resp_dld, signal=signals.response_downloaded)

    # settings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from scrapy.conf import settings as default_settings

    default_settings.overrides.update({
        'LOG_ENABLED': False,
        'LOG_LEVEL': 'CRITICAL',
        'BOT_NAME': 'project',
    })
    # Update general settings with spider-specific ones
    for k, v in spider.settings.iteritems():
        if isinstance(v, dict) and k in default_settings.overrides:
            default_settings.overrides[k].update(v)
        else:
            default_settings.overrides[k] = v

    # set up crawler
    from twisted.internet import reactor
    from scrapy.crawler import Crawler

    crawler = Crawler(default_settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)

    log.start_from_crawler(crawler)

    # start engine scrapy/twisted
    crawler.start()

    if not reactor.running:
        reactor.run()

    crawler.uninstall()
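A hedged usage sketch for Example #2's run_spider(): MySpider below is an assumed spider, not part of the original, written against the old BaseSpider API from the same Scrapy generation and carrying the plain-dict settings attribute that the function iterates over.

from scrapy.spider import BaseSpider

class MySpider(BaseSpider):
    # Hypothetical spider used only to illustrate the call; run_spider()
    # reads a plain dict from spider.settings, so one is provided here.
    name = "my_spider"
    start_urls = ["http://example.com"]
    settings = {"DOWNLOAD_DELAY": 1.0}

    def parse(self, response):
        self.log("visited %s" % response.url)

run_spider(MySpider())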
Example #3
def run_spider(spider, settings=None):
    """Run a spider instance through the scrapy crawler.

    This function is suitable for standalone scripts.
    """
    crawler = CrawlerProcess(_build_settings(settings))
    crawler.install()
    crawler.configure()
    log.start_from_crawler(crawler)
    crawler.crawl(spider)
    crawler.start()
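Example #3 depends on a _build_settings() helper that the excerpt omits. A plausible sketch under the same old API, assuming the helper merges caller-supplied overrides into the project settings (the body below is a guess, not the original):

def _build_settings(settings=None):
    # Hypothetical helper: load the project settings and layer any
    # caller-supplied overrides on top (old pre-1.0 settings.overrides API).
    from scrapy.utils.project import get_project_settings
    built = get_project_settings()
    if settings:
        built.overrides.update(settings)
    return built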
Example #5
def run_retry_spider():
    spider = retrySpider.RetrySpider()

    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start_from_crawler(crawler)
    reactor.run()
Example #6
def start():
    uid_list = map(lambda x: x.strip(), open('E:/PyCharm/CatPackages/resources/doc/user_500.txt').readlines())
    spider = userSpider.UserSpider(uid_list=uid_list)

    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start_from_crawler(crawler)
    reactor.run()
Example #7
def run_weibo_spider():
    uid_list = read_uid_list('E:/PyCharm/CatPackages/resources/doc/user_500.txt')
    print(uid_list)
    spider = weiboSpider.WeiboSpider(uid_list, start='2015-03-15', end='2015-04-15')

    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start_from_crawler(crawler)
    reactor.run()
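Example #7 calls a read_uid_list() helper that is not shown; Example #6 does the equivalent work inline, so a matching sketch would be (the helper body is an assumption):

def read_uid_list(path):
    # Hypothetical helper mirroring Example #6's inline file handling:
    # one stripped uid per non-empty line.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]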
Example #8
    def _start_crawler(self):
        if not self.crawlers or self.stopping:
            return

        name, crawler = self.crawlers.popitem()
        self._active_crawler = crawler
        sflo = log.start_from_crawler(crawler)
        crawler.configure()
        crawler.install()
        crawler.signals.connect(crawler.uninstall, signals.engine_stopped)
        if sflo:
            crawler.signals.connect(sflo.stop, signals.engine_stopped)
        crawler.signals.connect(self._check_done, signals.engine_stopped)
        crawler.start()
        return name, crawler
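Examples #8 and #9 connect a _check_done callback to engine_stopped but do not include it. A plausible sketch, assuming the class runs its queued crawlers one after another and stops the reactor once none remain (this body is a guess, not the original):

    def _check_done(self, **kwargs):
        # Hypothetical callback: start the next queued crawler, or stop the
        # reactor once every crawler has finished or a stop was requested.
        if self.crawlers and not self.stopping:
            self._start_crawler()
        else:
            from twisted.internet import reactor
            reactor.stop()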
Example #9
    def _start_crawler(self):
        if not self.crawlers or self.stopping:
            return

        name, crawler = self.crawlers.popitem()
        self._active_crawler = crawler
        sflo = log.start_from_crawler(crawler)
        crawler.configure()
        crawler.install()
        crawler.signals.connect(crawler.uninstall, signals.engine_stopped)
        if sflo:
            crawler.signals.connect(sflo.stop, signals.engine_stopped)
        crawler.signals.connect(self._check_done, signals.engine_stopped)
        crawler.start()
        return name, crawler
Example #10
    def _setup_crawler_logging(self, crawler):
        log_observer = log.start_from_crawler(crawler)
        if log_observer:
            crawler.signals.connect(log_observer.stop, signals.engine_stopped)
Example #11
    def crawler(self):
        if not self.configured:
            log.start_from_crawler(self._crawler)
            self._crawler.configure()
            self.configured = True
        return self._crawler
Example #12
    def _setup_crawler_logging(self, crawler):
        log_observer = scrapy_log.start_from_crawler(crawler)
        if log_observer:
            monkey_patch_and_connect_log_observer(crawler, log_observer)
        if self.log_observer:
            monkey_patch_and_connect_log_observer(crawler, self.log_observer)
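Examples #12 and #15 call a monkey_patch_and_connect_log_observer() helper that the excerpt leaves out. Judging from the plainer variants in Examples #10 and #17, a minimal stand-in would at least detach the observer when the engine stops (whatever per-crawler patching the real helper performs is not reproduced here):

from scrapy import signals

def monkey_patch_and_connect_log_observer(crawler, log_observer):
    # Hypothetical minimal version: stop the log observer once this
    # crawler's engine has stopped, as the non-patched examples do.
    crawler.signals.connect(log_observer.stop, signals.engine_stopped)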
Example #13
    def crawler(self):
        if not self.configured:
            log.start_from_crawler(self._crawler)
            self._crawler.configure()
            self.configured = True
        return self._crawler
Example #14
    def _create_logged_crawler(self, spidercls):
        crawler = self._create_crawler(spidercls)
        log_observer = log.start_from_crawler(crawler)
        if log_observer:
            crawler.signals.connect(log_observer.stop, signals.engine_stopped)
        return crawler
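Example #14 (and its duplicate, Example #18) builds on a _create_crawler() method that the excerpt omits. Assuming the transitional Scrapy API in which a Crawler is built from a spider class plus the runner's settings, a sketch might read (both the signature and the body are assumptions):

    def _create_crawler(self, spidercls):
        # Hypothetical: instantiate a Crawler for the given spider class with
        # this runner's settings; the original implementation is not shown.
        return Crawler(spidercls, self.settings)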
Example #15
    def _setup_crawler_logging(self, crawler):
        log_observer = scrapy_log.start_from_crawler(crawler)
        if log_observer:
            monkey_patch_and_connect_log_observer(crawler, log_observer)
        if self.log_observer:
            monkey_patch_and_connect_log_observer(crawler, self.log_observer)
Example #16
#!/usr/bin/python2
from angellist import settings
from scrapy import log
from scrapy.crawler import CrawlerProcess
from scrapy.settings import CrawlerSettings

MySettings = CrawlerSettings(settings_module=settings)
MyCrawler = CrawlerProcess(MySettings)

log.start_from_crawler(MyCrawler)
MyCrawler.configure()

# Schedule every spider registered in the project (note that _spiders is a
# private attribute of the old spider manager).
for spider_object in MyCrawler.spiders._spiders.itervalues():
    MyCrawler.crawl(spider_object())

MyCrawler.start()
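Example #16 imports the project's settings module (from angellist import settings). For the script to run, that module only needs to be an ordinary Scrapy settings file; a minimal illustrative version follows (the values are placeholders, not the real project's settings):

BOT_NAME = 'angellist'
SPIDER_MODULES = ['angellist.spiders']
NEWSPIDER_MODULE = 'angellist.spiders'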
Example #17
    def _setup_crawler_logging(self, crawler):
        log_observer = log.start_from_crawler(crawler)
        if log_observer:
            crawler.signals.connect(log_observer.stop, signals.engine_stopped)
Example #18
    def _create_logged_crawler(self, spidercls):
        crawler = self._create_crawler(spidercls)
        log_observer = log.start_from_crawler(crawler)
        if log_observer:
            crawler.signals.connect(log_observer.stop, signals.engine_stopped)
        return crawler