def setup_crawler(origem, destino, ano_saida, mes_saida, dia_saida,
                  ano_chegada, mes_chegada, dia_chegada):
    """Build a SubmarinoSpiderSpider for the given trip dates and crawl it.

    A random browser header is attached so requests vary their user agent.
    """
    trip_spider = SubmarinoSpiderSpider(
        origem=origem,
        destino=destino,
        ano_saida=ano_saida,
        mes_saida=mes_saida,
        dia_saida=dia_saida,
        ano_chegada=ano_chegada,
        mes_chegada=mes_chegada,
        dia_chegada=dia_chegada,
        user_browser=random_header(),
    )
    runner = Crawler(Settings())
    runner.configure()
    runner.crawl(trip_spider)
    runner.start()
def setup_crawler(self, spider):
    """Create a crawler for *spider*, keep it on self, and start it.

    self.spider_closed is invoked when the spider finishes.
    """
    self.crawler = Crawler(get_project_settings())
    self.crawler.signals.connect(self.spider_closed, signals.spider_closed)
    self.crawler.configure()
    self.crawler.crawl(spider)
    self.crawler.start()
def goGrabSomeBags():
    """Crawl barneys.com with PriceWatcherSpider; reactor stops on close."""
    bag_spider = PriceWatcherSpider(domain='barneys.com')
    bag_crawler = Crawler(settings)
    bag_crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    bag_crawler.configure()
    bag_crawler.crawl(bag_spider)
    bag_crawler.start()
def __crawl(self, hiddenWebSite, localPort, extraPath='', crawlImages=True, crawlLinks=True, crawlContents=True, crawlFormData=True):
    """Crawl a hidden web site exposed via a local port forward and persist
    each scraped page to the Tortazo database.

    hiddenWebSite -- the real (onion) site name, substituted back into URLs.
    localPort     -- local forwarding port through which the site is reached.
    extraPath     -- optional path suffix appended to the local base URL.
    crawl*        -- feature toggles passed to the spider (images, links,
                     page contents, form data).

    Blocks until the spider closes (reactor.run()).
    """
    def catch_item(sender, item, **kwargs):
        # Rewrite the local-proxy URL back to the hidden-service address
        # before persisting, so stored records reference the real site.
        item['url'] = item['url'].replace('http://127.0.0.1:' + str(localPort) + extraPath, hiddenWebSite)
        print "[+]Processing URL %s ... " % (item['url'])
        # Imported lazily to avoid a hard dependency at module import time.
        from core.tortazo.databaseManagement.TortazoDatabase import TortazoDatabase
        database = TortazoDatabase()
        database.initDatabaseDeepWebCrawlerPlugin()
        self.__processPage(item, database)

    # setup crawler: persist every passed item, stop the reactor on close.
    dispatcher.connect(catch_item, signal=signals.item_passed)
    dispatcher.connect(reactor.stop, signal=signals.spider_closed)
    settings = get_project_settings()
    # Enable the images pipeline; store under the per-site output dir.
    settings.set('ITEM_PIPELINES', {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}, priority='cmdline')
    settings.set('IMAGES_STORE', config.deepWebCrawlerOutdir + hiddenWebSite)
    crawler = Crawler(settings)
    crawler.configure()
    spider = HiddenSiteSpider("http://127.0.0.1:" + str(localPort) + extraPath, hiddenWebSite, self.extractorRules)
    # Apply the caller's feature toggles to the spider.
    spider.setImages(crawlImages)
    spider.setLinks(crawlLinks)
    spider.setContents(crawlContents)
    spider.setForms(crawlFormData)
    crawler.crawl(spider)
    print "\n[+] Starting scrapy engine... this process could take some time, depending on the crawling and extractor rules applied... \n"
    crawler.start()
    reactor.run()
    print "[+] Crawler finished."
def setupCrawler(spider):
    """Resolve *spider* by name via the crawler's spider manager and run it,
    with engine start/stop callbacks attached."""
    runner = Crawler(settings)
    runner.configure()
    runner.signals.connect(crawler_started, signals.engine_started)
    runner.signals.connect(crawler_stopped, signals.engine_stopped)
    spider_obj = runner.spiders.create(spider)
    runner.crawl(spider_obj)
    runner.start()
class SWACrawlerScript(object): def __init__(self, origin, destination, date, debug=False, defaultSettings=True): self.debug = debug self.origin = origin self.destination = destination self.date = date # initialize spider self.spider = SWAFareSpider(self.origin, self.date, self.destination) # initialize settings settingValues = self.loadSettings() if defaultSettings else dict() self.settings = Settings(values=settingValues) # initialize crawler self.crawler = Crawler(self.settings) self.crawler.configure() print "Set up" def loadSettings(self): settingsList = [i for i in dir(swa.settings) if i[0] != "_"] settingsDict = {} for s in settingsList: # yikes settingsDict[s] = eval("swa.settings.%s" % s) return settingsDict def run(self): print "Running" self.crawler.crawl(self.spider) self.crawler.start() if ( self.debug ): log.start(loglevel=log.DEBUG) reactor.run()
def setup_crawler(domain, spidername):
    """Look up the spider class *spidername* in module globals and crawl
    *domain* with a fresh instance of it."""
    spider_cls = globals()[spidername]
    runner = Crawler(Settings())
    runner.configure()
    runner.crawl(spider_cls(domain=domain))
    runner.start()
def test_priorization(self):
    """Run the test spider against a mocked webdriver and assert that
    page fetches happen in priority order (wr before wa chains)."""
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    # Each get() blocks via _wait so requests are processed sequentially.
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u''
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name='test', domain='testdomain')
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='ERROR')
    reactor.run()
    # I suspect web actions may be broken...
    # (duplicated entries below reflect observed behavior; the commented
    # lines are the originally-expected calls)
    assert webdriver.get.mock_calls == [
        call('http://testdomain/path?wr=0'),
        call('http://testdomain/path?wr=0&wa=0'),
        call('http://testdomain/path?wr=0&wa=1'),
        call('http://testdomain/path?wr=1'),
        call('http://testdomain/path?wr=1&wa=0'),
        call('http://testdomain/path?wr=1&wa=1'),
        #call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        #call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0')
    ]
def setup_crawler(spider_class, **kwargs):
    """Run *spider_class* once from a script and return the scraped items.

    Pattern from http://doc.scrapy.org/en/latest/topics/practices.html

    :param spider_class: Spider class to test
    :type spider_class: text
    :returns: list of items collected through the ``item_passed`` signal
    """
    collected = []

    def _collect(item):
        collected.append(item)

    # create Crawler from project settings
    crawler = Crawler(get_project_settings())
    crawler.configure()
    # collect every item that passes the pipeline
    crawler.signals.connect(_collect, signals.item_passed)
    # create & connect spider
    crawler.crawl(spider_class(**kwargs))
    # start crawler
    log.start()
    crawler.start()
    # give the engine one second, then stop the reactor so we can return
    task.deferLater(reactor, 1, reactor.stop)
    reactor.run()
    return collected
def kickoff(self): """ Starts a new crawler :return: """ settings = Settings() # settings.set("USER_AGENT", "Test") settings.set('JOBDIR', self.args.data_dir) self.spider = MavenDataSpider() # Wrap with crawler, configure crawler = Crawler(self.spider, settings) crawler.signals.connect(spider_closing, signal=signals.spider_closed) logger.info('Starting crawler') crawler.crawl(self.spider, app=self, dbsess=self.session) self.spider = crawler.spider self.spider.link_queue_mode = False if self.args.debug: coloredlogs.install(level=logging.DEBUG) # Keeping thread working reactor.run()
def setup(self):
    """Build a crawler for self.spider; chain the next crawl on close."""
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.signals.connect(self._next_crawl, signal=signals.spider_closed)
    runner.crawl(self.spider)
    runner.start()
def setup_crawler():
    """Run a one-off crawl of ScsSpider under the project settings."""
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.crawl(ScsSpider())
    runner.start()
def setup_crawler(spider_name):
    """Build a spider from the source string *spider_name* and crawl it.

    NOTE(review): exec() on a built string executes arbitrary code if
    spider_name is ever attacker-controlled, and the implicit local
    binding does not work under Python 3 -- consider resolving via
    globals()[...] instead; confirm what callers pass here first.
    """
    exec("spider = " + spider_name)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def test_priorization(self):
    """Run the test spider against a mocked webdriver and assert fetches
    occur strictly in priority order, including the wr=0 follow-ups."""
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    # Each get() blocks via _wait so requests are processed sequentially.
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u''
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name='test', domain='testdomain')
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='ERROR')
    reactor.run()
    assert webdriver.get.mock_calls == [
        call('http://testdomain/path?wr=0'),
        call('http://testdomain/path?wr=0&wa=0'),
        call('http://testdomain/path?wr=0&wa=1'),
        call('http://testdomain/path?wr=1'),
        call('http://testdomain/path?wr=1&wa=0'),
        call('http://testdomain/path?wr=1&wa=1'),
        call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0')]
def set_crawler(spider, receiver):
    """Run *spider*; call receiver.stop once the spider closes."""
    runner = Crawler(get_project_settings())
    runner.signals.connect(receiver.stop, signal=signals.spider_closed)
    runner.configure()
    runner.crawl(spider)
    runner.start()
def setup_crawler(self, supermarket, reactor_control):
    """Set up the Scrapy crawler for one supermarket.

    See http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script.

    Keyword arguments:
    supermarket -- the supermarket whose crawler should be set up
    reactor_control -- tracker that keeps the reactor alive while crawlers run
    """
    # Drop any stale cached feed before crawling.
    cachefile = supermarket_filename(supermarket)
    if isfile(cachefile):
        remove(cachefile)

    settings = get_project_settings()
    settings.set('FEED_URI', supermarket_filename(supermarket))

    runner = Crawler(settings)
    runner.signals.connect(reactor_control.remove_crawler,
                           signal=signals.spider_closed)
    runner.configure()
    runner.crawl(MySupermarketSpider(supermarket_url(supermarket)))
    runner.start()
    reactor_control.add_crawler()
def test_priorization(self):
    """Run the test spider against a mocked webdriver and assert that
    page fetches happen in priority order (wr before wa chains)."""
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    # Each get() blocks via _wait so requests are processed sequentially.
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u""
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name="test", domain="testdomain")
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel="ERROR")
    reactor.run()
    # I suspect web actions may be broken...
    # (duplicated entries reflect observed behavior; commented lines are
    # the originally-expected calls)
    assert webdriver.get.mock_calls == [
        call("http://testdomain/path?wr=0"),
        call("http://testdomain/path?wr=0&wa=0"),
        call("http://testdomain/path?wr=0&wa=1"),
        call("http://testdomain/path?wr=1"),
        call("http://testdomain/path?wr=1&wa=0"),
        call("http://testdomain/path?wr=1&wa=1"),
        # call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call("http://testdomain/path?wr=0&wa=1&wr=0"),
        call("http://testdomain/path?wr=0&wa=1&wr=0"),
        # call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call("http://testdomain/path?wr=1&wa=1&wr=0"),
        call("http://testdomain/path?wr=1&wa=1&wr=0"),
    ]
def setup_crawler():
    """Kick off a crawl with doubanMovieSpider under project settings."""
    movie_spider = doubanMovieSpider()
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.crawl(movie_spider)
    runner.start()
def runspider():
    """Flask view: crawl *route* on *domain* (query params), logging to
    file, then redirect to the graph page for the scrape just produced.

    Blocks in reactor.run() until the spider closes.
    """
    date = datetime.datetime.utcnow()
    unix_date = calendar.timegm(date.utctimetuple())
    route = request.args.get('route')
    domain = request.args.get('domain')
    # Per-scrape output directory keyed by domain and UTC timestamp
    # (Windows-style path -- assumes this host runs Windows).
    directory = r"{0}\initiator\static\scrapes\{1}\{2}".format(os.getcwd(), domain, unix_date)
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Mirror Scrapy log output into a local file at DEBUG level.
    logfile = open('testlog.log', 'w')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
    log.start(loglevel=logging.DEBUG)
    # Stop the reactor when the spider closes.
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = MySpider(route, unix_date)
    settings_module = importlib.import_module('SiteCrawler.settings')
    settings = CrawlerSettings(settings_module)
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.msg('Running reactor...')
    reactor.run()  # the script will block here until the spider is closed
    log.msg('Reactor stopped.')
    return redirect(url_for('choose_graph', domain=domain, date=unix_date))
def setup_crawler(ticker):
    """Crawl statistics for *ticker* using StatsSpider."""
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.crawl(StatsSpider(ticker=ticker))
    runner.start()
def setup_crawler(user, website, validator_set, parameters):
    """Start a WebQualitySpider crawl of *website* on behalf of *user*."""
    quality_spider = WebQualitySpider(user=user,
                                      website=website,
                                      validators=validator_set,
                                      parameters=parameters)
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.crawl(quality_spider)
    runner.start()
def _crawl_next(self, spider):
    """Run *spider*; self._done_task is notified when it closes."""
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.signals.connect(self._done_task, signal=signals.spider_closed)
    runner.crawl(spider)
    runner.start()
def call_spider(file): """ Crea el spider y ejecuta el reactor. Copia los resultados del crawling a los archivos .json para luego transformarlos a los archivos data.json correspondientes. """ with open(file, "r") as f: list_url = f.readlines() domains = [] urls = [] created_files = [] for u in list_url: domain = u.strip('\n') url_aux = domain.split("/") domain_type = False if (len(url_aux) > 1): domain = url_aux[0] url = "http://" + url_aux[0] + "/datos/data" if domain == 'www.paraguay.gov.py': url = "http://" + url_aux[0] + "/datos" else: url = "http://" + u.strip('\n') + "/data" domain_type = True print "============= Domain " + domain print "============= Start url " + url response = requests.get(url + "/data.json") if response.status_code == 200: filename = FileController.FileController( ).save_existing_data_json(response, domain, True) created_files.append({ 'modalidad': 'recolecta', 'archivo': filename }) else: domains.append(domain) urls.append(url) spider = DataSpider(domains=domains, start_urls=urls, domain_type=domain_type) settings = get_project_settings() crawler = Crawler(settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) crawler.configure() crawler.crawl(spider) crawler.start() log.start(logfile="log.txt", loglevel=log.DEBUG, logstdout=False) reactor.run() # the script will block here """ Copiar los datos a los archivos .json """ data_spider.copy_items_to_files() """ Eliminar archivos temporales """ FileController.FileController().clean_tmp_files() """ Convertir los archivos .json a data.json (formato POD) """ for domain in domains: filename = DataJson.DataJson().convert(domain) created_files.append({ 'modalidad': 'data-hunting', 'archivo': filename }) return created_files
def main(): """Setups item signal and run the spider""" from twisted.internet import reactor from scrapy import signals from scrapy.settings import Settings from scrapy.crawler import Crawler def catch_item(sender, item, **kwargs): print "Got:", item settings = Settings() # set up crawler crawler = Crawler(settings) # shut off log crawler.settings.set('LOG_ENABLED', False, priority='cmdline') # set up signal to catch items scraped crawler.signals.connect(catch_item, signal=signals.item_passed) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) crawler.install() crawler.configure() # schedule spider spider = MySpider() crawler.crawl(spider) # start engine scrapy/twisted print "STARTING ENGINE" crawler.start() reactor.run() print "ENGINE STOPPED"
def setup_crawler(domain):
    """Crawl *domain* with FollowAllSpider under the project settings."""
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.crawl(FollowAllSpider(domain=domain))
    runner.start()
def _setup(self, project):
    """Prepare a LinkSpider crawler for *project* and register it.

    Note: the crawl is scheduled here but the crawler is not started.
    """
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.crawl(crawlspider.LinkSpider(project))
    self.add_crawler()
def setup_crawler():
    """Crawl the URL given as the first CLI argument with DmmDirectSpider."""
    direct_spider = DmmDirectSpider(url=sys.argv[1])
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.crawl(direct_spider)
    runner.start()
def parse_careers(spider):
    """Run *spider* through a default-settings crawler, then start logging.

    NOTE(review): the trailing spider.start() is unusual -- Scrapy spiders
    have no standard start() method; confirm it exists on this spider class.
    """
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    spider.start()
def handle(self, url_slug, **options):
    """Re-scrape every product URL referenced by the tiles of the page
    identified by *url_slug*, using the store's registered spider.

    Blocks in reactor.run() until the spider closes.
    """
    page = Page.objects.get(url_slug=url_slug)
    feed = page.feed
    store = page.store
    store_slug = store.slug.lower()
    # Spider options derived from the management-command flags.
    opts = {
        'recreate_tiles': options['recreate_tiles'],
        'skip_images': not options['update_images'],
        'skip_tiles': True,
    }
    # Collect product URLs: directly from tiles, and via tagged content.
    start_urls = []
    for tile in feed.tiles.all():
        if tile.product:
            start_urls.append(tile.product.url)
        for content in tile.content.all():
            for prod in content.tagged_products.all():
                start_urls.append(prod.url)
    start_urls = set(start_urls)  # de-duplicate
    # set up standard framework for running spider in a script
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    # Spider is looked up by store slug in the project's spider registry.
    spider = crawler.spiders.create(store_slug, **opts)
    spider.start_urls = start_urls
    spider.feed_id = feed.id
    crawler.crawl(spider)
    logging.info('Starting spider with options: {}'.format(opts))
    crawler.start()
    reactor.run()
class CrawlerWorker(multiprocessing.Process):
    """Run one spider in a child process and push its items to a queue."""

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.spider = spider
        self.items = []
        self.crawler = Crawler(Settings())
        self.crawler.configure()
        # Collect items as they pass, and stop the reactor on close.
        dispatcher.connect(self._item_passed, signals.item_passed)
        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

    def _item_passed(self, item):
        # Accumulate every scraped item for the parent process.
        self.items.append(item)

    def _stop_reactor(self):
        # Spider closed: shut the reactor down so run() can return.
        reactor.stop()

    def run(self):
        """Process entry point: crawl, block until done, report items."""
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
        self.result_queue.put(self.items)
def handle(self, *args, **options):
    """Django command: crawl using the URL file given as the only argument.

    Prints usage when the arity is wrong or "help" is requested; otherwise
    blocks in reactor.run() until the spider finishes or Ctrl-C.
    """
    if (not len(args) == 1) or (args[0] == u"help"):
        # Wrong arity or explicit help request: print usage and exit.
        self.stdout.write(u"Usage: {0}\n".format(self.args))
        self.stdout.write(self.help)
    else:
        settings = get_project_settings()
        # Hand the URL file path to the spider via settings overrides.
        settings.overrides["URLS"] = args[0]
        crawler = Crawler(settings)
        spider = GeneralSpider()
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start_from_crawler(crawler)
        # stop the reactor once the spider has finished
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        try:
            log.msg("Running reactor...")
            reactor.run()
        except KeyboardInterrupt:
            stop_reactor()
        finally:
            log.msg("Reactor stopped")
            log.msg("#" * 40)
def setup_crawler(spider, stop=False):
    '''
    Takes a spider class object and starts it crawling.

    Returns a Deferred that fires (with True) when this spider's
    spider_closed signal is dispatched, so callers can wait on completion.
    (``stop`` is accepted for interface compatibility; it is not used.)
    '''
    done = defer.Deferred()

    def _on_closed(*args, **kwargs):
        # Dispatch passes spider and reason here; we only need to fire.
        done.callback(True)

    crawler = Crawler(get_project_settings())
    crawler.configure()
    # Keep a reference so the handler is not garbage collected.
    crawler._tempref = _on_closed
    crawler.signals.connect(_on_closed, signal=signals.spider_closed)
    crawler.crawl(spider)
    # log.start(crawler=crawler)  # disabled: restarting it each time
    # floods the log with ESMTP errors (stats-mailer extension).
    crawler.start()
    return done
def crawl(cls, sites):
    """Crawl every site in *sites* within a single reactor run.

    A shared counter tracks live spiders; the reactor is stopped only
    after the last spider_closed fires. Sites without an implemented
    spider are logged and skipped.
    """
    stat = {"spiders": 0}

    def soft_stop_reactor():
        # Stop the reactor only once the last spider has closed.
        stat["spiders"] -= 1
        if not stat["spiders"]:
            reactor.stop()

    for site in sites:
        try:
            spider = site.parser.spider(site)
        except (NotImplementedError, ObjectDoesNotExist):
            logger.error(
                _('Spider not implemented for "%s" site', site.label))
        else:
            stat["spiders"] += 1
            with spider_project(spider) as settings:
                crawler = Crawler(settings)
                crawler.signals.connect(
                    soft_stop_reactor, signal=signals.spider_closed)  # reactor.stop
                crawler.configure()
                crawler.crawl(spider)
                crawler.start()
    # Mirror Scrapy log output to a file at INFO level.
    logfile = open('crawl.log', 'w')
    log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
    log_observer.start()
    # the script will block here until the spider_closed signal was sent
    reactor.run()
def start_crawler(spider, search):
    """Configure and schedule a TripAdvisor crawl for *search*.

    NOTE(review): the *spider* argument is immediately overwritten below,
    and neither crawler.start() nor reactor.run() is called here --
    presumably the caller drives the reactor; confirm.
    """
    # Set up spider
    spider = TripAdvisorSpider(search=search)

    # Set up settings
    settings = Settings()
    # settings.overrides['FEED_FORMAT']='csv'
    # settings.overrides['FEED_URI']='tripadvisor_{0}.csv'.format(search)
    settings.set('CLOSESPIDER_ITEMCOUNT', False)
    settings.set('ROBOTSTXT_OBEY', False)  # ignore robots.txt
    settings.set('COOKIES_ENABLED', False)
    settings.set(
        'ITEM_PIPELINES',
        {'tripadvisor_scraper.pipelines.TripadvisorScraperPipeline': 300})
    settings.set('DOWNLOAD_DELAY', 3)  # throttle to be polite
    settings.set('LOG_FILENAME', 'log.log')
    # Old settings.overrides equivalents kept for reference:
    # settings.overrides['LOG_FILENAME'] = 'log.log'
    # settings.overrides['ROBOTSTXT_OBEY'] = False  # Ignore robots.txt
    # settings.overrides['CLOSESPIDER_ITEMCOUNT']=1
    # settings.overrides['DOWNLOAD_DELAY'] = 3
    # settings.overrides['COOKIES_ENABLED'] = False
    # settings.overrides['ITEM_PIPELINES'] = {
    #     'tripadvisor_scraper.pipelines.TripadvisorScraperPipeline': 300,
    # }

    # Set up crawler
    crawler = Crawler(spider, settings)
    # crawler.configure()
    crawler.signals.connect(spider_closed, signal=signals.spider_closed)
    crawler.crawl(spider)
def setup_crawler(id="550", publisher="rbd"):
    """Run a DmmQuerySpider crawl for *id* and *publisher*."""
    query_spider = DmmQuerySpider(id, publisher)
    runner = Crawler(get_project_settings())
    runner.configure()
    runner.crawl(query_spider)
    runner.start()
def crawl(cls, sites):
    """Crawl every site in *sites* within a single reactor run.

    A shared counter tracks live spiders; the reactor is stopped only
    after the last spider_closed fires. Sites without an implemented
    spider are logged and skipped.
    """
    stat = {"spiders": 0}

    def soft_stop_reactor():
        # Stop the reactor only once the last spider has closed.
        stat["spiders"] -= 1
        if not stat["spiders"]:
            reactor.stop()

    for site in sites:
        try:
            spider = site.parser.spider(site)
        except (NotImplementedError, ObjectDoesNotExist):
            logger.error(_('Spider not implemented for "%s" site', site.label))
        else:
            stat["spiders"] += 1
            with spider_project(spider) as settings:
                crawler = Crawler(settings)
                crawler.signals.connect(soft_stop_reactor, signal=signals.spider_closed)  # reactor.stop
                crawler.configure()
                crawler.crawl(spider)
                crawler.start()
    # Mirror Scrapy log output to a file at INFO level.
    logfile = open('crawl.log', 'w')
    log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
    log_observer.start()
    # the script will block here until the spider_closed signal was sent
    reactor.run()
class Wallhaven_Crawler:
    """Drive a WallhavenSpider for one query; start() blocks until done."""

    def __init__(self, query):
        self.query = query
        # Spider built from the query, crawler from project settings.
        self.spider = WallhavenSpider(self.query)
        self.settings = get_project_settings()
        self.crawler = Crawler(self.settings)
        # Stop the reactor as soon as the spider closes.
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.crawler.configure()

    def start(self):
        """Schedule the spider and run the reactor.

        Blocks here until the 'spider_closed' signal is sent.
        """
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
class startPageSpiderService(service.Service):
    """Twisted service wrapping a startPageSpider crawl for one task.

    The service stops itself (and deregisters from its parent) once the
    spider's spider_closed signal fires.
    """

    def __init__(self, parent):
        self.spiderService = parent  # parent service that owns this one
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = startPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        # Expose the crawler's stats collector to callers.
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        #dispatcher.connect(self.stopService, signals.spider_closed)
        # Stop this service when the spider closes.
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
        # self._crawler.signals.connect(self.test2, 'writeListQuque')
        #_startPageSpider = startPageSpider(taskId=self.spiderService.taskId)
        self._crawler.crawl(self._spider)
        #self._crawler.start()
        self.startCrawl()

    def startCrawl(self):
        # Guard against double-starting an already-running engine.
        if not self._crawler.engine.running:
            self._crawler.start()

    # def test2(self):
    #     print '================>111111111111111111111111<=========================='

    def stopService(self):
        log.msg(format='startPageSpiderService->stopService stop startPageSpiderService serviceName=(%(serviceName)s)', serviceName=self.name)
        service.Service.stopService(self)
        # Tell the parent we are done, stop the crawler, deregister.
        self.spiderService.removeSpiderService()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
class CrawlerScript(Process):
    """Runs Spider multiple times within one script by utilizing billiard
    package (tackle the ReactorNotRestartable error).

    Parameters
    ----------
    current_dt: datetime.datetime()
        Timestamp of real-time data (EST).
    server: list
        List of Kafka brokers addresses.
    topic: str
        Specify Kafka topic to which the stream of data records
        will be published.
    """

    def __init__(self, current_dt, server, topic):
        Process.__init__(self)
        self.current_dt = current_dt
        self.server = server
        self.topic = topic
        # New-style API: Crawler takes the spider class plus settings.
        self.crawler = Crawler(VIXSpiderSpider,
                               settings={'USER_AGENT': user_agent})
        # Stop the reactor when the spider closes so run() can return.
        self.crawler.signals.connect(reactor.stop,
                                     signal=scrapy_signals.spider_closed)

    def run(self):
        """Process entry point: the positional args are forwarded by
        crawl() to the spider constructor; blocks in reactor.run()."""
        self.crawler.crawl(self.current_dt, self.server, self.topic)
        reactor.run()
def call_spider(spider):
    """Run *spider* and block until it closes (reactor stops on close)."""
    runner = Crawler(get_project_settings())
    runner.signals.connect(reactor.stop, signal=signals.spider_closed)
    runner.configure()
    runner.crawl(spider)
    runner.start()
    reactor.run()
def _create_spider(portion_item, name, wrk_urls):
    """Start an HPizzaDetailSpider named *name* over *wrk_urls* and record
    the name in the module-level ``spiders`` list."""
    detail_spider = HPizzaDetailSpider(portion_item, name=name, start_urls=wrk_urls)
    spiders.append(name)
    runner = Crawler(Settings({'BOT_NAME': 'hpizza_ab', 'DOWNLOAD_DELAY': 4}))
    # Bind *name* as a default arg now so the close handler reports the
    # right spider (avoids the late-binding closure pitfall).
    runner.signals.connect(lambda x=name: _chk_signals(x),
                           signal=signals.spider_closed)
    runner.configure()
    runner.crawl(detail_spider)
    runner.start()
def setup_crawler(spider):
    """Create and start a crawler for the spider named by *spider*.

    :param spider: spider name registered with the project; resolved via
        ``crawler.spiders.create``.
    """
    settings = get_project_settings()
    crawler = Crawler(settings)
    # BUG FIX: the original referenced an undefined global ``spider_name``;
    # use the ``spider`` argument that was actually passed in.
    spider_obj = crawler.spiders.create(spider)
    crawler.configure()
    crawler.crawl(spider_obj)
    crawler.start()
def handle(self, *args, **options):
    """Django management command: run LinuxFoundationSpider for the given
    --year option and block until it finishes.

    NOTE(review): a spider *instance* is passed where the newer Crawler
    API expects a spider class -- confirm against the Scrapy version.
    """
    self.stdout.write('Start')
    spider = LinuxFoundationSpider(year=options.get('year'))
    crawler = Crawler(spider, settings.SPIDER_SETTINGS)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()  # the script will block here until the spider_closed signal is sent'''
def runSpider(args):
    """Run a (spider, settings) pair packed in *args*; stopCrawler fires
    when the spider closes."""
    spider, crawl_settings = args[0], args[1]
    runner = Crawler(crawl_settings)
    runner.signals.connect(stopCrawler, signal=signals.spider_closed)
    runner.configure()
    runner.crawl(spider)
    runner.start()
def setup_crawler(spider_class):
    """Instantiate *spider_class* and crawl it; reactor stops on close."""
    runner = Crawler(get_project_settings())
    runner.signals.connect(reactor.stop, signal=signals.spider_closed)
    runner.configure()
    runner.crawl(spider_class())
    runner.start()
def crawl():
    """Run StackserviceSpider and block in the reactor until finished."""
    runner = Crawler(Settings())
    runner.configure()
    runner.crawl(StackserviceSpider())
    runner.start()
    log.start()
    reactor.run()  # the script will block here
def setup_crawler(domain):
    """Run MovieSpider; the reactor stops when the spider closes.

    (*domain* is currently not passed to the spider construction.)
    """
    runner = Crawler(get_project_settings())
    runner.signals.connect(reactor.stop, signal=signals.spider_closed)
    runner.configure()
    runner.crawl(MovieSpider())
    runner.start()
def crawl():
    """Run MySpider under module-level settings and block until done;
    ``callback`` fires when the spider closes."""
    runner = Crawler(settings)
    runner.signals.connect(callback, signal=signals.spider_closed)
    runner.configure()
    runner.crawl(MySpider())
    runner.start()
    reactor.run()
def setup_crawler(stuff):
    """Crawl MySpider(stuff=...); ``crawlstack`` is invoked on close."""
    my_spider = MySpider(stuff=stuff)
    # settings.setdict(env_overrides, priority='project')
    runner = Crawler(Settings())
    runner.signals.connect(crawlstack, signal=signals.spider_closed)
    runner.configure()
    runner.crawl(my_spider)
    runner.start()
def run(self):
    """Run the petition-count spider for this instance's petition number
    and block in the reactor; restart_crawler fires on spider close.

    NOTE(review): reactor.run() returns None, so ``yields`` always prints
    as None -- confirm whether a real result was intended here.
    """
    dispatcher.connect(self.restart_crawler, signal=signals.spider_closed)
    settings = get_project_settings()
    crawler = Crawler(petitionspider.PetitionCountSpider, settings)
    # crawl() forwards these kwargs to the spider constructor.
    crawler.crawl(start_urls=[self.get_setup_url()],
                  collection=self.mg_collection,
                  petition_number=self.petition_num)
    yields = reactor.run()
    print("yield from petnum={} : {}".format(self.petition_num, yields))
def spider_closing(spider):
    """spider_closed handler: relaunch the petition spider using the
    module-level ``surls``/``cllct`` globals.

    NOTE(review): calling reactor.run() from inside a close handler will
    raise ReactorNotRestartable if the reactor has already run -- confirm
    this is executed in a fresh process each time.
    """
    print("closing spider")
    settings = get_project_settings()
    crawler = Crawler(petitionspider.PetitionCountSpider, settings)
    global surls, cllct
    crawler.crawl(start_urls=surls, collection=cllct, petition_number=202136)
    time.sleep(5)  # brief pause before re-running
    reactor.run()
def startCrawler():
    """Launch EstateListSpider and block in the reactor.

    Registers the crawl in RUNNING_CRAWLERS; spider_closing handles the
    spider_closed signal (stops the reactor).
    """
    RUNNING_CRAWLERS.append(1)
    runner = Crawler(EstateListSpider, settings)
    # stop reactor when spider closes
    runner.signals.connect(spider_closing, signal=signals.spider_closed)
    runner.crawl()
    reactor.run()
def setup_crawler():
    """Crawl with AutoRobot_Prenium and register with reactor_control so
    the reactor stays alive until all crawlers have closed."""
    runner = Crawler(settings)
    runner.configure()
    runner.signals.connect(reactor_control.remove_crawler,
                           signal=signals.spider_closed)
    runner.crawl(AutoRobot_Prenium())
    reactor_control.add_crawler()
    runner.start()
def config_spider(self, spid, spider):
    """The boring startup routine: build, register, and start a crawler."""
    crawler = Crawler(get_project_settings())
    # Track spider/crawler by id so close events can be matched up later.
    self._ids_to_crawlers_map[spid] = {"spider": spider, "crawler": crawler}
    # connect each spider's closed signal to self; when all spiders are
    # done, the handler stops the reactor. (Unclear whether connecting
    # repeatedly appends or overwrites -- behavior kept as-is.)
    crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def get_more_entropy():
    """Crawl truenet.co.nz with TruenetSpider and block until it closes."""
    truenet_spider = TruenetSpider(domain='truenet.co.nz')
    runner = Crawler(get_project_settings())
    runner.signals.connect(reactor.stop, signal=signals.spider_closed)
    runner.configure()
    runner.crawl(truenet_spider)
    runner.start()
    log.start()
    reactor.run()