def setup_crawler(origem, destino, ano_saida, mes_saida, dia_saida,
                  ano_chegada, mes_chegada, dia_chegada):
    spider = SubmarinoSpiderSpider(origem=origem, destino=destino,
                                   ano_saida=ano_saida, mes_saida=mes_saida,
                                   dia_saida=dia_saida, ano_chegada=ano_chegada,
                                   mes_chegada=mes_chegada, dia_chegada=dia_chegada,
                                   user_browser=random_header())

    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
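Example #2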
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u''

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name='test', domain='testdomain')
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel='ERROR')
        reactor.run()

        assert webdriver.get.mock_calls == [
            call('http://testdomain/path?wr=0'),
            call('http://testdomain/path?wr=0&wa=0'),
            call('http://testdomain/path?wr=0&wa=1'),
            call('http://testdomain/path?wr=1'),
            call('http://testdomain/path?wr=1&wa=0'),
            call('http://testdomain/path?wr=1&wa=1'),
            call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),
            call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0')]
Example #3
def goGrabSomeBags():
    spider = PriceWatcherSpider(domain='barneys.com')
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #4
def setup_crawler(user, website, validator_set, parameters):
    spider = WebQualitySpider(user=user, website=website, validators=validator_set, parameters=parameters)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #5
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #6
def setup_crawler(ticker):
    spider = StatsSpider(ticker=ticker)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #7
def setupCrawler(spider):
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(crawler_started, signals.engine_started)
    crawler.signals.connect(crawler_stopped, signals.engine_stopped)
    crawler.crawl(crawler.spiders.create(spider))
    crawler.start()
Example #8
def setup_crawler(domain, spidername):
    spider_class = globals()[spidername]
    spider = spider_class(domain=domain)
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
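Example #9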
def set_crawler(spider, receiver):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(receiver.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #10
def setupCrawler(spider):
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(crawler_started, signals.engine_started)
    crawler.signals.connect(crawler_stopped, signals.engine_stopped)
    crawler.crawl(crawler.spiders.create(spider))
    crawler.start()
Example #11
    def setup_crawler(self, supermarket, reactor_control):
        """Set up the Scrapy crawler. 
        See http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script.
        
        Keyword arguments:
        supermarket -- the supermarket whose crawler should be set up
        """

        cachefile = supermarket_filename(supermarket)
        if isfile(cachefile):
            remove(cachefile)

        settings = get_project_settings()

        url = supermarket_url(supermarket)
        settings.set('FEED_URI', supermarket_filename(supermarket))

        spider = MySupermarketSpider(url)
        crawler = Crawler(settings)
        crawler.signals.connect(reactor_control.remove_crawler,
                                signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        reactor_control.add_crawler()
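# A minimal sketch of the reactor_control helper assumed by the method above:
# it counts running crawlers and stops the Twisted reactor once the last spider
# closes. The class and method names mirror the calls above; the original
# project's implementation is not shown here, so treat this as an illustration.
from twisted.internet import reactor

class ReactorControl(object):
    def __init__(self):
        self.crawlers_running = 0

    def add_crawler(self):
        self.crawlers_running += 1

    def remove_crawler(self):
        self.crawlers_running -= 1
        if self.crawlers_running == 0:
            reactor.stop()
Example #12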
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u""

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name="test", domain="testdomain")
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel="ERROR")
        reactor.run()

        # I suspect web actions may be broken...
        assert webdriver.get.mock_calls == [
            call("http://testdomain/path?wr=0"),
            call("http://testdomain/path?wr=0&wa=0"),
            call("http://testdomain/path?wr=0&wa=1"),
            call("http://testdomain/path?wr=1"),
            call("http://testdomain/path?wr=1&wa=0"),
            call("http://testdomain/path?wr=1&wa=1"),
            # call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call("http://testdomain/path?wr=0&wa=1&wr=0"),
            call("http://testdomain/path?wr=0&wa=1&wr=0"),
            # call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call("http://testdomain/path?wr=1&wa=1&wr=0"),
            call("http://testdomain/path?wr=1&wa=1&wr=0"),
        ]
Example #13
def setup_crawler():
    spider = doubanMovieSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #14
def runspider():
	date = datetime.datetime.utcnow()
	unix_date = calendar.timegm(date.utctimetuple())
	
	route = request.args.get('route')
	domain = request.args.get('domain')
	
	directory = r"{0}\initiator\static\scrapes\{1}\{2}".format(os.getcwd(), domain, unix_date)
	
	if not os.path.exists(directory):
		os.makedirs(directory)
	
	logfile = open('testlog.log', 'w')
	log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
	log_observer.start()
	log.start(loglevel=logging.DEBUG)
	
	dispatcher.connect(stop_reactor, signal=signals.spider_closed)
	
	spider = MySpider(route, unix_date)
	
	settings_module = importlib.import_module('SiteCrawler.settings')
	settings = CrawlerSettings(settings_module)
	crawler = Crawler(settings)
	
	crawler.configure()
	crawler.crawl(spider)
	crawler.start()
	
	log.msg('Running reactor...')
	reactor.run()  # the script will block here until the spider is closed
	log.msg('Reactor stopped.')
	return redirect(url_for('choose_graph', domain = domain, date = unix_date))
Example #15
def setup_crawler():
    spider = ScsSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #16
class SWACrawlerScript(object):
	def __init__(self, origin, destination, date, debug=False, defaultSettings=True):
		self.debug = debug
		
		self.origin = origin
		self.destination = destination
		self.date = date
		
		# initialize spider
		self.spider = SWAFareSpider(self.origin, self.date, self.destination)
		
		# initialize settings
		settingValues = self.loadSettings() if defaultSettings else dict()
		self.settings = Settings(values=settingValues)

		# initialize crawler
		self.crawler = Crawler(self.settings)
		self.crawler.configure()
		
		print "Set up"
	def loadSettings(self):
		settingsList = [i for i in dir(swa.settings) if i[0] != "_"]
		settingsDict = {}
		for s in settingsList:
			# getattr avoids the eval() round-trip for each setting name
			settingsDict[s] = getattr(swa.settings, s)
		return settingsDict
	
	def run(self):
		print "Running"
		self.crawler.crawl(self.spider)
		self.crawler.start()
		if ( self.debug ): log.start(loglevel=log.DEBUG)
		reactor.run()
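# Hypothetical usage of the SWACrawlerScript class above; the airport codes and
# the date format are guesses, since SWAFareSpider's expected argument formats
# are not shown in this snippet.
script = SWACrawlerScript('AUS', 'DAL', '2015-01-15', debug=True)
script.run()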
Example #17
    def _crawl_next(self, spider):
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.configure()
        crawler.signals.connect(self._done_task, signal=signals.spider_closed)
        crawler.crawl(spider)
        crawler.start()
Example #18
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u''

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name='test', domain='testdomain')
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel='ERROR')
        reactor.run()

        # I suspect web actions may be broken...
        assert webdriver.get.mock_calls == [
            call('http://testdomain/path?wr=0'),
            call('http://testdomain/path?wr=0&wa=0'),
            call('http://testdomain/path?wr=0&wa=1'),
            call('http://testdomain/path?wr=1'),
            call('http://testdomain/path?wr=1&wa=0'),
            call('http://testdomain/path?wr=1&wa=1'),

            #call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),

            #call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0')
        ]
Example #19
def setup_crawler():
    spider = DmmDirectSpider(url=sys.argv[1])
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #20
class startPageSpiderService(service.Service):

    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = startPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        #dispatcher.connect(self.stopService, signals.spider_closed)
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
#         self._crawler.signals.connect(self.test2, 'writeListQuque')
        #_startPageSpider = startPageSpider(taskId=self.spiderService.taskId)
        self._crawler.crawl(self._spider)
        #self._crawler.start()
        self.startCrawl()
        
    def startCrawl(self):
        if not self._crawler.engine.running:
            self._crawler.start()
#     def test2(self):
#         print '================>111111111111111111111111<=========================='
    def stopService(self):
        log.msg(format='startPageSpiderService->stopService stop startPageSpiderService serviceName=(%(serviceName)s)',serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
Example #21
    def crawl(cls, sites):
        stat = {"spiders": 0}

        def soft_stop_reactor():
            stat["spiders"] -= 1
            if not stat["spiders"]:
                reactor.stop()

        for site in sites:
            try:
                spider = site.parser.spider(site)
            except (NotImplementedError, ObjectDoesNotExist):
                logger.error(
                    _('Spider not implemented for "%s" site', site.label))
            else:
                stat["spiders"] += 1
                with spider_project(spider) as settings:
                    crawler = Crawler(settings)
                    crawler.signals.connect(
                        soft_stop_reactor,
                        signal=signals.spider_closed)  # reactor.stop
                    crawler.configure()
                    crawler.crawl(spider)
                    crawler.start()

        logfile = open('crawl.log', 'w')
        log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
        log_observer.start()

        # the script will block here until the spider_closed signal was sent
        reactor.run()
Example #22
def main():
    """Setups item signal and run the spider"""
    from twisted.internet import reactor
    from scrapy import signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    settings = Settings()

    # set up crawler
    crawler = Crawler(settings)
    # shut off log
    crawler.settings.set('LOG_ENABLED', False, priority='cmdline')
    # set up signal to catch items scraped
    crawler.signals.connect(catch_item,   signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider()
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    reactor.run()
    print "ENGINE STOPPED"
Example #23
    def handle(self, *args, **options):

        if (not len(args) == 1) or (args[0] == u"help"):
            self.stdout.write(u"Usage: {0}\n".format(self.args))
            self.stdout.write(self.help)
        else:
            settings = get_project_settings()
            settings.overrides["URLS"] = args[0]
            crawler = Crawler(settings)
            spider = GeneralSpider()
            crawler.configure()
            crawler.crawl(spider)
            crawler.start()
            log.start_from_crawler(crawler)

            # stop the reactor once the spider has finished
            crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

            try:
                log.msg("Running reactor...")
                reactor.run()
            except KeyboardInterrupt:
                stop_reactor()
            finally:
                log.msg("Reactor stopped")
                log.msg("#" * 40)
Example #24
    def handle(self, url_slug, **options):
        page = Page.objects.get(url_slug=url_slug)
        feed = page.feed
        store = page.store
        store_slug = store.slug.lower()
        opts = {
            'recreate_tiles': options['recreate_tiles'],
            'skip_images': not options['update_images'],
            'skip_tiles': True,
        }

        start_urls = []
        for tile in feed.tiles.all():
            if tile.product:
                start_urls.append(tile.product.url)
            for content in tile.content.all():
                for prod in content.tagged_products.all():
                    start_urls.append(prod.url)
        start_urls = set(start_urls)

        # set up standard framework for running spider in a script
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()

        spider = crawler.spiders.create(store_slug, **opts)
        spider.start_urls = start_urls
        spider.feed_id = feed.id

        crawler.crawl(spider)
        logging.info('Starting spider with options: {}'.format(opts))
        crawler.start()

        reactor.run()
Example #25
def setup_crawler(
        spider_class,
        **kwargs
    ):
    """
    Use scrapy in a script
    see http://doc.scrapy.org/en/latest/topics/practices.html

    :param spider_class: Spider class to test
    :type spider_class: class
    """

    def add_item(item):
        items.append(item)

    items = []
    # create Crawler
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # connect collecting function on item_passed
    crawler.signals.connect(add_item, signals.item_passed)
    # create & connect spider
    spider = spider_class(**kwargs)
    crawler.crawl(spider)
    # start crawler
    log.start()
    crawler.start()
    # run crawler
    task.deferLater(reactor, 1, reactor.stop)
    reactor.run()
    return items
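# Hypothetical call to the helper above; MySpider and the start_url keyword are
# placeholders for a real spider class and whatever arguments it accepts. The
# returned list holds the items collected before the deferred reactor.stop
# fires, one second after the crawl starts.
items = setup_crawler(MySpider, start_url='http://example.com')
for item in items:
    print item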
Example #26
def call_spider(file):
    """
    Crea el spider y ejecuta el reactor. Copia los resultados del crawling a los archivos .json para luego
    transformarlos a los archivos data.json correspondientes.
    """
    with open(file, "r") as f:
        list_url = f.readlines()
        domains = []
        urls = []
        created_files = []
        for u in list_url:
            domain = u.strip('\n')
            url_aux = domain.split("/")
            domain_type = False
            if (len(url_aux) > 1):
                domain = url_aux[0]
                url = "http://" + url_aux[0] + "/datos/data"
                if domain == 'www.paraguay.gov.py':
                    url = "http://" + url_aux[0] + "/datos"
            else:
                url = "http://" + u.strip('\n') + "/data"
                domain_type = True
            print "============= Domain " + domain
            print "============= Start url " + url
            response = requests.get(url + "/data.json")
            if response.status_code == 200:
                filename = FileController.FileController(
                ).save_existing_data_json(response, domain, True)
                created_files.append({
                    'modalidad': 'recolecta',
                    'archivo': filename
                })
            else:
                domains.append(domain)
                urls.append(url)

        spider = DataSpider(domains=domains,
                            start_urls=urls,
                            domain_type=domain_type)
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start(logfile="log.txt", loglevel=log.DEBUG, logstdout=False)
        reactor.run()  # the script will block here
        """ Copiar los datos a los archivos .json """
        data_spider.copy_items_to_files()
        """ Eliminar archivos temporales """
        FileController.FileController().clean_tmp_files()
        """ Convertir los archivos .json a data.json (formato POD) """
        for domain in domains:
            filename = DataJson.DataJson().convert(domain)
            created_files.append({
                'modalidad': 'data-hunting',
                'archivo': filename
            })

        return created_files
Example #27
    def __crawl(self, hiddenWebSite, localPort, extraPath='', crawlImages=True, crawlLinks=True,crawlContents=True, crawlFormData=True):
        def catch_item(sender, item, **kwargs):
            item['url'] = item['url'].replace('http://127.0.0.1:'+str(localPort)+extraPath, hiddenWebSite)
            print "[+]Processing URL %s ...  " %(item['url'])
            from core.tortazo.databaseManagement.TortazoDatabase import TortazoDatabase
            database = TortazoDatabase()
            database.initDatabaseDeepWebCrawlerPlugin()
            self.__processPage(item, database)

        # setup crawler
        dispatcher.connect(catch_item, signal=signals.item_passed)
        dispatcher.connect(reactor.stop, signal=signals.spider_closed)

        settings = get_project_settings()
        settings.set('ITEM_PIPELINES', {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}, priority='cmdline')
        settings.set('IMAGES_STORE', config.deepWebCrawlerOutdir+hiddenWebSite)

        crawler = Crawler(settings)
        crawler.configure()
        spider = HiddenSiteSpider("http://127.0.0.1:"+str(localPort)+extraPath, hiddenWebSite, self.extractorRules)
        spider.setImages(crawlImages)
        spider.setLinks(crawlLinks)
        spider.setContents(crawlContents)
        spider.setForms(crawlFormData)

        crawler.crawl(spider)
        print "\n[+] Starting scrapy engine... this process could take some time, depending on the crawling and extractor rules applied... \n"
        crawler.start()
        reactor.run()
        print "[+] Crawler finished."
Example #28
class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue

        self.crawler = Crawler(Settings())
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

    def _item_passed(self, item):
        self.items.append(item)

    def _stop_reactor(self):
        reactor.stop()

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
        self.result_queue.put(self.items)
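# Hypothetical usage of the CrawlerWorker above: running the crawl in a child
# process gives each scrape its own Twisted reactor, avoiding the
# "reactor not restartable" problem between runs. MySpider is a placeholder.
import multiprocessing

result_queue = multiprocessing.Queue()
worker = CrawlerWorker(MySpider(), result_queue)
worker.start()
items = result_queue.get()
worker.join()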
Example #29
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
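Example #30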
class Wallhaven_Crawler:
    def __init__(self, query):
        self.query = query
        
        # Creation of spider from query
        self.spider = WallhavenSpider(self.query)
        
        # Getting scrapy project settings
        self.settings = get_project_settings()
        
        # Creation of crawler from spider and scrapy project settings
        self.crawler = Crawler(self.settings)
        self.crawler.signals.connect(reactor.stop, signal = signals.spider_closed)
        self.crawler.configure()
        
    def start(self):
        # Crawling from spider
        self.crawler.crawl(self.spider)
        self.crawler.start()
        
        # Logging all process
        #log.start()
        #log.msg('Reactor activated.')
        # Execution of twisted reactor
        reactor.run() # The script will block here until the 'spider_closed' signal is sent
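# Hypothetical usage of the Wallhaven_Crawler class above: build it from a
# search query and block until the spider_closed signal stops the reactor.
wallhaven = Wallhaven_Crawler('landscape')
wallhaven.start()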
Example #31
def setup_crawler(id="550", publisher="rbd"):
    spider = DmmQuerySpider(id, publisher)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #32
def setup_crawler(spider, stop=False):
    '''
    Takes a spider class object
    '''
    # Deferred means other functions can wait on this finishing
    # Wait until the callback is triggered by spider close
    # See twisted docs
    d = defer.Deferred()

    def foo(*a, **kw):
        # The result to be passed to any callbacks to deferred
        # (we don't use it, so True could've been False, None w/e)
        d.callback(True)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Ref to foo otherwise it gets GC'd (garbage collected)
    crawler._tempref = foo
    # foo is the handler for the closed signal from this spider
    # N.B. dispatch returns spider and reason (e.g. 'finished') to foo.
    crawler.signals.connect(foo, signal=signals.spider_closed)
    crawler.crawl(spider)
    # N.B log is scrapy log. log2 is python color logger
    # The crawler arg is necessary for log_count/{ERROR, DEBUG, INFO..} stats
    # which you will want for stats mailer extension.
    # Starting this each time will cause the big torrade of ESMTP Error
    # log.start(crawler=crawler)
    crawler.start()
    return d
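# Hypothetical driver for the Deferred-based setup_crawler above. SpiderA and
# SpiderB stand in for real spider instances; DeferredList fires only after
# every spider_closed callback has run, and only then is the reactor stopped.
from twisted.internet import defer, reactor

crawls = [setup_crawler(SpiderA()), setup_crawler(SpiderB())]
defer.DeferredList(crawls).addCallback(lambda _: reactor.stop())
reactor.run()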
Example #33
    def crawl(cls, sites):
        stat = {"spiders": 0}

        def soft_stop_reactor():
            stat["spiders"] -= 1
            if not stat["spiders"]:
                reactor.stop()

        for site in sites:
            try:
                spider = site.parser.spider(site)
            except (NotImplementedError, ObjectDoesNotExist):
                logger.error(_('Spider not implemented for "%s" site', site.label))
            else:
                stat["spiders"] += 1
                with spider_project(spider) as settings:
                    crawler = Crawler(settings)
                    crawler.signals.connect(soft_stop_reactor, signal=signals.spider_closed)  # reactor.stop
                    crawler.configure()
                    crawler.crawl(spider)
                    crawler.start()

        logfile = open('crawl.log', 'w')
        log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
        log_observer.start()

        # the script will block here until the spider_closed signal was sent
        reactor.run()
Example #34
    def setup(self):
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.configure()
        crawler.signals.connect(self._next_crawl, signal=signals.spider_closed)
        crawler.crawl(self.spider)
        crawler.start()
Example #35
def parse_careers(spider):
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    spider.start()
Example #36
def setup_crawler(spider_name):
    exec("spider = " + spider_name)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #37
def setup_crawler(spider_name):
    #spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    spider = crawler.spiders.create(spider_name)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #38
    def _create_spider(portion_item, name, wrk_urls):
        spider = HPizzaDetailSpider(portion_item, name=name, start_urls=wrk_urls)
        spiders.append(name)
        crawler = Crawler(Settings({'BOT_NAME': 'hpizza_ab', 'DOWNLOAD_DELAY': 4}))
        crawler.signals.connect(lambda x=name: _chk_signals(x), signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
Example #39
def setup_crawler():
    spider = DmozSpider(domain='http://zzk.xywy.com/')
    settings = get_project_settings()
    print settings
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #40
def spider_setup():
	spider=Lily_bbs()
	crawler=Crawler(Settings())
	crawler.configure()
	crawler.crawl(spider)
	crawler.start()
	log.start()
	reactor.run()
Example #41
def call_spider(spider):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
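Example #42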
def crawl():
    crawler = Crawler(settings)
    spider = MySpider()
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
Example #43
def crawl():
    spider = StackserviceSpider()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()  # the script will block here
Example #44
    def setup(self):
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.configure()
        crawler.signals.connect(self._next_crawl,
                                signal=signals.spider_closed)
        crawler.crawl(self.spider)
        crawler.start()
Example #45
def setup_crawler(domain):
    spider = MovieSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #46
def setup_crawler(spider_class):
    obj_spider = spider_class()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(obj_spider)
    crawler.start()
Example #47
def runSpider(args):
    spider = args[0]
    settings = args[1]
    crawler = Crawler(settings)
    crawler.signals.connect(stopCrawler, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #48
    def setup_crawler(stuff):
        spider = MySpider(stuff=stuff)
        settings = Settings()
        #settings.setdict(env_overrides, priority='project')
        crawler = Crawler(settings)
        crawler.signals.connect(crawlstack, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
Example #49
def setup_crawler():
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler,
                            signal=signals.spider_closed)
    spider = AutoRobot_Prenium()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()
Example #50
def get_more_entropy():
  spider = TruenetSpider(domain='truenet.co.nz')
  settings = get_project_settings()
  crawler = Crawler(settings)
  crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
  crawler.configure()
  crawler.crawl(spider)
  crawler.start()
  log.start()
  reactor.run()
Example #51
    def run(self, args, opts):
        settings = get_project_settings()

        for spider_name in self.crawler.spiders.list():
            craw = Crawler(settings)
            craw.configure()
            spider = craw.spiders.create(spider_name)
            craw.crawl(spider)
            craw.start()
Example #52
    def config_spider(self, spid, spider):
        """The boring startup routine"""
        proj_settings = get_project_settings()
        crawler = Crawler(proj_settings)
        self._ids_to_crawlers_map[spid] = {"spider": spider, "crawler": crawler}
        # connect each spider's closed signal to self; when all spiders are done, stop the reactor
        # (not certain whether this appends to or overwrites any existing handler)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
Example #53
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes',
                               output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError as e:
        # No spider found.
        raise RuntimeError(
            'Could not find spider with name {}'.format(spider_name))

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path)
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping

    return job.raw_response
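# Hypothetical invocation of the Celery task above; 'my_spider' stands in for a
# spider name registered in the project. .delay() queues the scrape on a worker
# and .get() blocks until the output file path is returned.
result = do_scrape.delay('my_spider')
output_path = result.get()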
Example #54
def setup_crawler(keywords):
    spider = BaiduSpider(keywords=keywords)
    settings = get_project_settings()
    crawler = Crawler(settings)
    # stop reactor when spider closes
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel=log.DEBUG)
    reactor.run()
Example #55
    def crawla(self):
        #dispatcher.connect(reactor.stop(), signal=signals.spider_closed)
        spider = Titlespider()
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start()
        reactor.run()
Example #56
def run():
    print "pastebin!!!"
    spider = PastebinSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    print "Pastebin scraping has finished"
    print "Iteration: " + str(datetime.datetime.now())
Example #57
def setup_crawler(keyword):
    print 'schedule run script is running.........'
    spider = BaiduSpider(keyword=keyword)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel=log.DEBUG)
    reactor.run()
Example #58
def _cron_kaohsiung():
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = KaohsiungSpider()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    log.msg('Running reactor...')
    reactor.run()  # the script will block here until the spider is closed
    log.msg('Reactor stopped.')
Example #59
    def setupCrawler(self, spiderName):
        crawler = Crawler(get_project_settings())
        crawler.signals.connect(self.spiderClosed,
                                signal=signals.spider_closed)
        #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()

        spider = crawler.spiders.create(spiderName)

        crawler.crawl(spider)
        crawler.start()