Example #1
def setup_crawler(origem,destino,ano_saida,mes_saida,dia_saida,ano_chegada,mes_chegada,dia_chegada):
    spider = SubmarinoSpiderSpider(origem=origem,destino=destino,ano_saida=ano_saida,mes_saida=mes_saida,dia_saida=dia_saida, ano_chegada=ano_chegada,mes_chegada=mes_chegada,dia_chegada=dia_chegada,user_browser=random_header())

    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
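
Most snippets in this collection target the pre-1.0 Scrapy API (Crawler(settings) plus crawler.configure()), which later Scrapy releases removed. As a hedged point of reference, a minimal modern equivalent uses CrawlerProcess; MySpider below is a hypothetical spider class, not a name from the example above.

# Minimal sketch, assuming Scrapy >= 1.0 and a project-defined MySpider class.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_spider():
    process = CrawlerProcess(get_project_settings())  # owns and runs the Twisted reactor
    process.crawl(MySpider)   # pass the spider class (or its name), not an instance
    process.start()           # blocks until the crawl finishes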
Example #2
 def setup_crawler(self, spider):
     crawler = Crawler(get_project_settings())
     crawler.signals.connect(self.spider_closed, signals.spider_closed)
     crawler.configure()
     crawler.crawl(spider)
     self.crawler = crawler
     self.crawler.start()
Example #3
def goGrabSomeBags():
    spider = PriceWatcherSpider(domain='barneys.com')
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #4
    def __crawl(self, hiddenWebSite, localPort, extraPath='', crawlImages=True, crawlLinks=True,crawlContents=True, crawlFormData=True):
        def catch_item(sender, item, **kwargs):
            item['url'] = item['url'].replace('http://127.0.0.1:'+str(localPort)+extraPath, hiddenWebSite)
            print "[+]Processing URL %s ...  " %(item['url'])
            from core.tortazo.databaseManagement.TortazoDatabase import TortazoDatabase
            database = TortazoDatabase()
            database.initDatabaseDeepWebCrawlerPlugin()
            self.__processPage(item, database)

        # setup crawler
        dispatcher.connect(catch_item, signal=signals.item_passed)
        dispatcher.connect(reactor.stop, signal=signals.spider_closed)

        settings = get_project_settings()
        settings.set('ITEM_PIPELINES', {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}, priority='cmdline')
        settings.set('IMAGES_STORE', config.deepWebCrawlerOutdir+hiddenWebSite)

        crawler = Crawler(settings)
        crawler.configure()
        spider = HiddenSiteSpider("http://127.0.0.1:"+str(localPort)+extraPath, hiddenWebSite, self.extractorRules)
        spider.setImages(crawlImages)
        spider.setLinks(crawlLinks)
        spider.setContents(crawlContents)
        spider.setForms(crawlFormData)

        crawler.crawl(spider)
        print "\n[+] Starting scrapy engine... this process could take some time, depending on the crawling and extractor rules applied... \n"
        crawler.start()
        reactor.run()
        print "[+] Crawler finished."
Example #5
def setupCrawler(spider):
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(crawler_started, signals.engine_started)
    crawler.signals.connect(crawler_stopped, signals.engine_stopped)
    crawler.crawl(crawler.spiders.create(spider))
    crawler.start()
Example #6
class SWACrawlerScript(object):
	def __init__(self, origin, destination, date, debug=False, defaultSettings=True):
		self.debug = debug
		
		self.origin = origin
		self.destination = destination
		self.date = date
		
		# initialize spider
		self.spider = SWAFareSpider(self.origin, self.date, self.destination)
		
		# initialize settings
		settingValues = self.loadSettings() if defaultSettings else dict()
		self.settings = Settings(values=settingValues)

		# initialize crawler
		self.crawler = Crawler(self.settings)
		self.crawler.configure()
		
		print "Set up"
	def loadSettings(self):	
		settingsList = [i for i in dir(swa.settings) if i[0] != "_"]
		settingsDict = {}
		for s in settingsList:
			# yikes
			settingsDict[s] = eval("swa.settings.%s" % s)
		return settingsDict
	
	def run(self):
		print "Running"
		self.crawler.crawl(self.spider)
		self.crawler.start()
		if ( self.debug ): log.start(loglevel=log.DEBUG)
		reactor.run()
Example #7
def setup_crawler(domain, spidername):
    spider_class = globals()[spidername]
    spider = spider_class(domain=domain)
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #8
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u''

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name='test', domain='testdomain')
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel='ERROR')
        reactor.run()

        # I suspect web actions may be broken...
        assert webdriver.get.mock_calls == [
            call('http://testdomain/path?wr=0'),
            call('http://testdomain/path?wr=0&wa=0'),
            call('http://testdomain/path?wr=0&wa=1'),
            call('http://testdomain/path?wr=1'),
            call('http://testdomain/path?wr=1&wa=0'),
            call('http://testdomain/path?wr=1&wa=1'),

            #call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),

            #call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0')
        ]
Example #9
def setup_crawler(
        spider_class,
        **kwargs
    ):
    """
    Use scrapy in a script
    see http://doc.scrapy.org/en/latest/topics/practices.html

    :param spider_class: Spider class to test
    :type spider_class: text
    """

    def add_item(item):
        items.append(item)

    items = []
    # create Crawler
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # connect collecting function on item_passed
    crawler.signals.connect(add_item, signals.item_passed)
    # create & connect spider
    spider = spider_class(**kwargs)
    crawler.crawl(spider)
    # start crawler
    log.start()
    crawler.start()
    # run crawler
    task.deferLater(reactor, 1, reactor.stop)
    reactor.run()
    return items
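
This helper collects items through the legacy item_passed signal and stops the reactor with a timer. A hedged sketch of the same idea on current Scrapy, where the signal is named item_scraped and CrawlerProcess drives the reactor; spider_cls and its kwargs are placeholders, not names from the example.

# Hedged sketch, assuming Scrapy >= 1.0.
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def collect_items(spider_cls, **kwargs):
    items = []

    def add_item(item, response, spider):
        items.append(item)

    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler(spider_cls)     # Crawler object exposes .signals
    crawler.signals.connect(add_item, signal=signals.item_scraped)
    process.crawl(crawler, **kwargs)                 # kwargs go to the spider constructor
    process.start()                                  # blocks until the spider closes
    return items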
Example #10
    def kickoff(self):
        """
        Starts a new crawler
        :return: 
        """
        settings = Settings()

        # settings.set("USER_AGENT", "Test")
        settings.set('JOBDIR', self.args.data_dir)
        self.spider = MavenDataSpider()

        # Wrap with crawler, configure
        crawler = Crawler(self.spider, settings)
        crawler.signals.connect(spider_closing, signal=signals.spider_closed)

        logger.info('Starting crawler')
        crawler.crawl(self.spider, app=self, dbsess=self.session)

        self.spider = crawler.spider
        self.spider.link_queue_mode = False
        if self.args.debug:
            coloredlogs.install(level=logging.DEBUG)

        # Keeping thread working
        reactor.run()
Example #11
 def setup(self):
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.signals.connect(self._next_crawl, signal=signals.spider_closed)
     crawler.crawl(self.spider)
     crawler.start()
Example #12
def setup_crawler():
    spider = ScsSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #13
def setup_crawler(spider_name):
    exec("spider = " + spider_name)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #14
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u''

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name='test', domain='testdomain')
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel='ERROR')
        reactor.run()

        assert webdriver.get.mock_calls == [
            call('http://testdomain/path?wr=0'),
            call('http://testdomain/path?wr=0&wa=0'),
            call('http://testdomain/path?wr=0&wa=1'),
            call('http://testdomain/path?wr=1'),
            call('http://testdomain/path?wr=1&wa=0'),
            call('http://testdomain/path?wr=1&wa=1'),
            call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),
            call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0')]
Example #15
def set_crawler(spider, receiver):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(receiver.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #16
def setupCrawler(spider):
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(crawler_started, signals.engine_started)
    crawler.signals.connect(crawler_stopped, signals.engine_stopped)
    crawler.crawl(crawler.spiders.create(spider))
    crawler.start()
Example #17
    def setup_crawler(self, supermarket, reactor_control):
        """Set up the Scrapy crawler. 
        See http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script.
        
        Keyword arguments:
        supermarket -- the supermarket whose crawler should be set up
        reactor_control -- bookkeeping helper; its add_crawler/remove_crawler track the running crawlers
        """

        cachefile = supermarket_filename(supermarket)
        if isfile(cachefile):
            remove(cachefile)

        settings = get_project_settings()

        url = supermarket_url(supermarket)
        settings.set('FEED_URI', supermarket_filename(supermarket))

        spider = MySupermarketSpider(url)
        crawler = Crawler(settings)
        crawler.signals.connect(reactor_control.remove_crawler,
                                signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        reactor_control.add_crawler()
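
This example keeps its own reactor_control bookkeeping so the reactor is stopped only after the last crawler closes. A hedged sketch of the same multi-spider pattern on modern Scrapy, where CrawlerProcess does that bookkeeping itself; the spider classes below are placeholders.

# Hedged sketch, assuming Scrapy >= 1.0; SupermarketSpiderA/B are hypothetical spiders.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def crawl_all():
    process = CrawlerProcess(get_project_settings())
    for spider_cls in (SupermarketSpiderA, SupermarketSpiderB):
        process.crawl(spider_cls)     # schedule each crawl
    process.start()                   # runs them in one reactor, returns when all spiders close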
Example #18
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u""

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name="test", domain="testdomain")
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel="ERROR")
        reactor.run()

        # I suspect web actions may be broken...
        assert webdriver.get.mock_calls == [
            call("http://testdomain/path?wr=0"),
            call("http://testdomain/path?wr=0&wa=0"),
            call("http://testdomain/path?wr=0&wa=1"),
            call("http://testdomain/path?wr=1"),
            call("http://testdomain/path?wr=1&wa=0"),
            call("http://testdomain/path?wr=1&wa=1"),
            # call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call("http://testdomain/path?wr=0&wa=1&wr=0"),
            call("http://testdomain/path?wr=0&wa=1&wr=0"),
            # call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call("http://testdomain/path?wr=1&wa=1&wr=0"),
            call("http://testdomain/path?wr=1&wa=1&wr=0"),
        ]
Example #19
def setup_crawler():
    spider = doubanMovieSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #20
def runspider():
	date = datetime.datetime.utcnow()
	unix_date = calendar.timegm(date.utctimetuple())
	
	route = request.args.get('route')
	domain = request.args.get('domain')
	
	directory = r"{0}\initiator\static\scrapes\{1}\{2}".format(os.getcwd(), domain, unix_date)
	
	if not os.path.exists(directory):
		os.makedirs(directory)
	
	logfile = open('testlog.log', 'w')
	log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
	log_observer.start()
	log.start(loglevel=logging.DEBUG)
	
	dispatcher.connect(stop_reactor, signal=signals.spider_closed)
	
	spider = MySpider(route, unix_date)
	
	settings_module = importlib.import_module('SiteCrawler.settings')
	settings = CrawlerSettings(settings_module)
	crawler = Crawler(settings)
	
	crawler.configure()
	crawler.crawl(spider)
	crawler.start()
	
	log.msg('Running reactor...')
	reactor.run()  # the script will block here until the spider is closed
	log.msg('Reactor stopped.')
	return redirect(url_for('choose_graph', domain = domain, date = unix_date))
Example #21
def setup_crawler(ticker):
    spider = StatsSpider(ticker=ticker)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #22
def setup_crawler(user, website, validator_set, parameters):
    spider = WebQualitySpider(user=user, website=website, validators=validator_set, parameters=parameters)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #23
 def _crawl_next(self, spider):
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.signals.connect(self._done_task, signal=signals.spider_closed)
     crawler.crawl(spider)
     crawler.start()
Example #24
def call_spider(file):
    """
    Creates the spider and runs the reactor. Copies the crawl results to the .json files and then
    transforms them into the corresponding data.json files.
    """
    with open(file, "r") as f:
        list_url = f.readlines()
        domains = []
        urls = []
        created_files = []
        for u in list_url:
            domain = u.strip('\n')
            url_aux = domain.split("/")
            domain_type = False
            if (len(url_aux) > 1):
                domain = url_aux[0]
                url = "http://" + url_aux[0] + "/datos/data"
                if domain == 'www.paraguay.gov.py':
                    url = "http://" + url_aux[0] + "/datos"
            else:
                url = "http://" + u.strip('\n') + "/data"
                domain_type = True
            print "============= Domain " + domain
            print "============= Start url " + url
            response = requests.get(url + "/data.json")
            if response.status_code == 200:
                filename = FileController.FileController(
                ).save_existing_data_json(response, domain, True)
                created_files.append({
                    'modalidad': 'recolecta',
                    'archivo': filename
                })
            else:
                domains.append(domain)
                urls.append(url)

        spider = DataSpider(domains=domains,
                            start_urls=urls,
                            domain_type=domain_type)
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start(logfile="log.txt", loglevel=log.DEBUG, logstdout=False)
        reactor.run()  # the script will block here
        """ Copiar los datos a los archivos .json """
        data_spider.copy_items_to_files()
        """ Eliminar archivos temporales """
        FileController.FileController().clean_tmp_files()
        """ Convertir los archivos .json a data.json (formato POD) """
        for domain in domains:
            filename = DataJson.DataJson().convert(domain)
            created_files.append({
                'modalidad': 'data-hunting',
                'archivo': filename
            })

        return created_files
Example #25
def main():
    """Setups item signal and run the spider"""
    from twisted.internet import reactor
    from scrapy import signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    settings = Settings()

    # set up crawler
    crawler = Crawler(settings)
    # shut off log
    crawler.settings.set('LOG_ENABLED', False, priority='cmdline')
    # set up signal to catch items scraped
    crawler.signals.connect(catch_item,   signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider()
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    reactor.run()
    print "ENGINE STOPPED"
Example #26
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #27
 def _setup(self, project):
     spider = crawlspider.LinkSpider(project)
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.crawl(spider)
     self.add_crawler()
Example #28
def setup_crawler():
    spider = DmmDirectSpider(url=sys.argv[1])
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #29
def parse_careers(spider):
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    spider.start()
Example #30
    def handle(self, url_slug, **options):
        page = Page.objects.get(url_slug=url_slug)
        feed = page.feed
        store = page.store
        store_slug = store.slug.lower()
        opts = {
            'recreate_tiles': options['recreate_tiles'],
            'skip_images': not options['update_images'],
            'skip_tiles': True,
        }

        start_urls = []
        for tile in feed.tiles.all():
            if tile.product:
                start_urls.append(tile.product.url)
            for content in tile.content.all():
                for prod in content.tagged_products.all():
                    start_urls.append(prod.url)
        start_urls = set(start_urls)

        # set up standard framework for running spider in a script
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()

        spider = crawler.spiders.create(store_slug, **opts)
        spider.start_urls = start_urls
        spider.feed_id = feed.id

        crawler.crawl(spider)
        logging.info('Starting spider with options: {}'.format(opts))
        crawler.start()

        reactor.run()
Example #31
 def _setup(self, project):
     spider = crawlspider.LinkSpider(project)
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.crawl(spider)
     self.add_crawler()
Example #32
class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue

        self.crawler = Crawler(Settings())
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

    def _item_passed(self, item):
        self.items.append(item)

    def _stop_reactor(self):
        reactor.stop()

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
        self.result_queue.put(self.items)
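
A hedged usage sketch for the CrawlerWorker above: because every crawl runs in its own process, each one gets a fresh Twisted reactor, and the parent reads the scraped items back through the shared queue. The helper below is illustrative and not part of the original project.

import multiprocessing

def crawl_in_subprocess(spider):
    result_queue = multiprocessing.Queue()
    worker = CrawlerWorker(spider, result_queue)
    worker.start()                # run() executes in the child process
    items = result_queue.get()    # blocks until run() puts the collected items
    worker.join()
    return items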
Example #33
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #34
    def handle(self, *args, **options):

        if (not len(args) == 1) or (args[0] == u"help"):
            self.stdout.write(u"Usage: {0}\n".format(self.args))
            self.stdout.write(self.help)
        else:
            settings = get_project_settings()
            settings.overrides["URLS"] = args[0]
            crawler = Crawler(settings)
            spider = GeneralSpider()
            crawler.configure()
            crawler.crawl(spider)
            crawler.start()
            log.start_from_crawler(crawler)

            # stop the reactor once the spider has finished
            crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

            try:
                log.msg("Running reactor...")
                reactor.run()
            except KeyboardInterrupt:
                stop_reactor()
            finally:
                log.msg("Reactor stopped")
                log.msg("#" * 40)
Example #35
def setup_crawler(spider, stop=False):
    '''
    Takes a spider class object
    '''
    # Deferred means other functions can wait on this finishing
    # Wait until the callback is triggered by spider close
    # See twisted docs
    d = defer.Deferred()

    def foo(*a, **kw):
        # The result to be passed to any callbacks to deferred
        # (we don't use it, so True could've been False, None w/e)
        d.callback(True)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Ref to foo otherwise it gets GC'd (garbage collected)
    crawler._tempref = foo
    # foo is the handler for the closed signal from this spider
    # N.B. dispatch returns spider and reason (e.g. 'finished') to foo.
    crawler.signals.connect(foo, signal=signals.spider_closed)
    crawler.crawl(spider)
    # N.B log is scrapy log. log2 is python color logger
    # The crawler arg is necessary for log_count/{ERROR, DEBUG, INFO..} stats
    # which you will want for stats mailer extension.
    # Starting this each time will cause a big torrent of ESMTP errors
    # log.start(crawler=crawler)
    crawler.start()
    return d
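
This example builds its own Deferred and fires it from a spider_closed handler. As a hedged aside, on Scrapy >= 1.0 CrawlerRunner.crawl() already returns a Deferred that fires when the spider closes, so the same contract can be written as below; MySpider is a placeholder class.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

def setup_crawler_deferred():
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(MySpider)             # Deferred fires when the spider closes
    d.addBoth(lambda _: reactor.stop())    # optional: stop the reactor once done
    return d

# the caller starts the reactor once, e.g.: setup_crawler_deferred(); reactor.run()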
Example #36
    def crawl(cls, sites):
        stat = {"spiders": 0}

        def soft_stop_reactor():
            stat["spiders"] -= 1
            if not stat["spiders"]:
                reactor.stop()

        for site in sites:
            try:
                spider = site.parser.spider(site)
            except (NotImplementedError, ObjectDoesNotExist):
                logger.error(
                    _('Spider not implemented for "%s" site', site.label))
            else:
                stat["spiders"] += 1
                with spider_project(spider) as settings:
                    crawler = Crawler(settings)
                    crawler.signals.connect(
                        soft_stop_reactor,
                        signal=signals.spider_closed)  # reactor.stop
                    crawler.configure()
                    crawler.crawl(spider)
                    crawler.start()

        logfile = open('crawl.log', 'w')
        log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
        log_observer.start()

        # the script will block here until the spider_closed signal was sent
        reactor.run()
Example #37
def start_crawler(spider, search):
    # Set up spider
    spider = TripAdvisorSpider(search=search)

    # Set up settings
    settings = Settings()
    # settings.overrides['FEED_FORMAT']='csv'
    # settings.overrides['FEED_URI']='tripadvisor_{0}.csv'.format(search)
    settings.set('CLOSESPIDER_ITEMCOUNT', False)
    settings.set('ROBOTSTXT_OBEY', False)
    settings.set('COOKIES_ENABLED', False)
    settings.set(
        'ITEM_PIPELINES',
        {'tripadvisor_scraper.pipelines.TripadvisorScraperPipeline': 300})
    settings.set('DOWNLOAD_DELAY', 3)
    settings.set('LOG_FILENAME', 'log.log')
    # settings.overrides['LOG_FILENAME'] = 'log.log'
    # settings.overrides['ROBOTSTXT_OBEY'] = False # Ignore robots.txt
    # settings.overrides['CLOSESPIDER_ITEMCOUNT']=1
    # settings.overrides['DOWNLOAD_DELAY'] = 3
    # settings.overrides['COOKIES_ENABLED'] = False
    # settings.overrides['ITEM_PIPELINES'] = {
    #    'tripadvisor_scraper.pipelines.TripadvisorScraperPipeline': 300,
    # }

    # Set up crawler
    crawler = Crawler(spider, settings)
    # crawler.configure()
    crawler.signals.connect(spider_closed, signal=signals.spider_closed)
    crawler.crawl(spider)
Example #38
def setup_crawler(id="550", publisher="rbd"):
    spider = DmmQuerySpider(id, publisher)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #39
    def crawl(cls, sites):
        stat = {"spiders": 0}

        def soft_stop_reactor():
            stat["spiders"] -= 1
            if not stat["spiders"]:
                reactor.stop()

        for site in sites:
            try:
                spider = site.parser.spider(site)
            except (NotImplementedError, ObjectDoesNotExist):
                logger.error(_('Spider not implemented for "%s" site', site.label))
            else:
                stat["spiders"] += 1
                with spider_project(spider) as settings:
                    crawler = Crawler(settings)
                    crawler.signals.connect(soft_stop_reactor, signal=signals.spider_closed)  # reactor.stop
                    crawler.configure()
                    crawler.crawl(spider)
                    crawler.start()

        logfile = open('crawl.log', 'w')
        log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
        log_observer.start()

        # the script will block here until the spider_closed signal was sent
        reactor.run()
Example #40
class Wallhaven_Crawler:
    def __init__(self, query):
        self.query = query
        
        # Creation of spider from query
        self.spider = WallhavenSpider(self.query)
        
        # Getting scrapy project settings
        self.settings = get_project_settings()
        
        # Creation of crawler from spider and scrapy project settings
        self.crawler = Crawler(self.settings)
        self.crawler.signals.connect(reactor.stop, signal = signals.spider_closed)
        self.crawler.configure()
        
    def start(self):
        # Crawling from spider
        self.crawler.crawl(self.spider)
        self.crawler.start()
        
        # Logging all process
        #log.start()
        #log.msg('Reactor activated.')
        # Execution of twisted reactor
        reactor.run() # The script will block here until the 'spider_closed' signal is sent
Example #41
class startPageSpiderService(service.Service):

    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = startPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        #dispatcher.connect(self.stopService, signals.spider_closed)
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
#         self._crawler.signals.connect(self.test2, 'writeListQuque')
        #_startPageSpider = startPageSpider(taskId=self.spiderService.taskId)
        self._crawler.crawl(self._spider)
        #self._crawler.start()
        self.startCrawl()
        
    def startCrawl(self):
        if not self._crawler.engine.running:
            self._crawler.start()
#     def test2(self):
#         print '================>111111111111111111111111<=========================='
    def stopService(self):
        log.msg(format='startPageSpiderService->stopService stop startPageSpiderService serviceName=(%(serviceName)s)',serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
Example #42
class CrawlerScript(Process):
    """Runs Spider multiple times within one script by utilizing billiard package
    (tackle the ReactorNotRestartable error).

    Parameters
    ----------
    current_dt: datetime.datetime()
        Timestamp of real-time data (EST).
    server: list
        List of Kafka brokers addresses.
    topic: str
        Specify Kafka topic to which the stream of data records will be published.

    """
    def __init__(self, current_dt, server, topic):

        Process.__init__(self)

        self.current_dt = current_dt
        self.server = server
        self.topic = topic

        self.crawler = Crawler(VIXSpiderSpider,
                               settings={'USER_AGENT': user_agent})

        self.crawler.signals.connect(reactor.stop,
                                     signal=scrapy_signals.spider_closed)

    def run(self):
        self.crawler.crawl(self.current_dt, self.server, self.topic)
        reactor.run()
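
A hedged usage sketch for CrawlerScript: launching a fresh process per crawl is what sidesteps ReactorNotRestartable, since each child gets its own reactor. The broker list and topic arguments below are illustrative values, not taken from the original project.

import datetime

def publish_snapshot(brokers, topic):
    job = CrawlerScript(datetime.datetime.utcnow(), brokers, topic)
    job.start()   # run() calls crawler.crawl(...) and reactor.run() in the child process
    job.join()    # returns once the spider closes and the child's reactor stops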
Example #43
File: app.py Project: uzhare/Bugle
def call_spider(spider):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
Example #44
 def _create_spider (portion_item,name,wrk_urls):
     spider = HPizzaDetailSpider(portion_item, name=name,start_urls=wrk_urls)
     spiders.append(name)
     crawler = Crawler(Settings({'BOT_NAME':'hpizza_ab','DOWNLOAD_DELAY':4}))
     crawler.signals.connect(lambda x=name: _chk_signals(x), signal=signals.spider_closed)
     crawler.configure()
     crawler.crawl(spider)
     crawler.start()
Example #45
def setup_crawler(spider_name):
    #spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    spider = crawler.spiders.create(spider_name)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #46
    def handle(self, *args, **options):
        self.stdout.write('Start')
        spider = LinuxFoundationSpider(year=options.get('year'))
        crawler = Crawler(spider, settings.SPIDER_SETTINGS)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

        crawler.crawl()
        reactor.run()  # the script will block here until the spider_closed signal is sent
Example #47
def runSpider(args):
    spider = args[0]
    settings = args[1]
    crawler = Crawler(settings)
    crawler.signals.connect(stopCrawler, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #48
def setup_crawler(spider_class):
    obj_spider = spider_class()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(obj_spider)
    crawler.start()
Example #49
def crawl():
    spider = StackserviceSpider()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()  # the script will block here
Example #50
File: boot.py Project: Varato/qfbot
 def setup(self):
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.signals.connect(self._next_crawl,
                             signal=signals.spider_closed)
     crawler.crawl(self.spider)
     crawler.start()
Example #51
def setup_crawler(domain):
    spider = MovieSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #52
def crawl():
    crawler = Crawler(settings)
    spider = MySpider()
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
Example #53
 def setup_crawler(stuff):
     spider = MySpider(stuff=stuff)
     settings = Settings()
     #settings.setdict(env_overrides, priority='project')
     crawler = Crawler(settings)
     crawler.signals.connect(crawlstack, signal=signals.spider_closed)
     crawler.configure()
     crawler.crawl(spider)
     crawler.start()
Example #54
 def run(self):
     dispatcher.connect(self.restart_crawler, signal=signals.spider_closed)
     settings = get_project_settings()
     crawler = Crawler(petitionspider.PetitionCountSpider, settings)
     crawler.crawl(start_urls=[self.get_setup_url()],
                   collection=self.mg_collection,
                   petition_number=self.petition_num)
     yields = reactor.run()
     print("yield from petnum={} : {}".format(self.petition_num, yields))
Example #55
def spider_closing(spider):
    print("closing spider")

    settings = get_project_settings()
    crawler = Crawler(petitionspider.PetitionCountSpider, settings)
    global surls, cllct
    crawler.crawl(start_urls=surls, collection=cllct, petition_number=202136)
    time.sleep(5)
    reactor.run()
Example #56
def startCrawler():
    RUNNING_CRAWLERS.append(1)
    crawler = Crawler(EstateListSpider, settings)
    # stop reactor when spider closes
    crawler.signals.connect(spider_closing, signal=signals.spider_closed)

    crawler.crawl()

    reactor.run()
Example #57
def setup_crawler():
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler,
                            signal=signals.spider_closed)
    spider = AutoRobot_Prenium()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()
Example #58
 def config_spider(self, spid, spider):
   """The boring startup routine"""
   proj_settings = get_project_settings()
   crawler = Crawler(proj_settings)
   self._ids_to_crawlers_map[spid] = {"spider":spider, "crawler":crawler}
   # connect each spider's closed signal to self. When all spiders done, stop the reactor
   crawler.signals.connect(self.spider_closed, signal=signals.spider_closed) # I do not really know if that is appended or overwritten
   crawler.configure()
   crawler.crawl(spider)
   crawler.start()  
Example #59
def get_more_entropy():
  spider = TruenetSpider(domain='truenet.co.nz')
  settings = get_project_settings()
  crawler = Crawler(settings)
  crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
  crawler.configure()
  crawler.crawl(spider)
  crawler.start()
  log.start()
  reactor.run()