def f(q):
    try:
        s = get_project_settings()
        user_agent_list = data.getUserAgentList()
        user_agent = None
        if len(user_agent_list) > 0:
            user_agent = random.choice(user_agent_list)
        if user_agent:
            s.update({
                "LOG_ENABLED": "True",
                "TELNETCONSOLE_ENABLED": "False",
                "USER_AGENT": user_agent
            })
        else:
            s.update({
                "LOG_ENABLED": "True",
                "TELNETCONSOLE_ENABLED": "False"
            })
        runner = crawler.CrawlerRunner(s)
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        deferred = runner.crawl(
            agent.class_element,
            cus_urls=data.getUrls(),
            agent=agent
        )
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)
        self.log.error(str(e))
        raise SpiderException('[Warning, execute]: %s' % str(e))

def run_spider(spider):
    try:
        f = open(name_file_json, "w")
        path_file = os.path.abspath(name_file_json)
        print(path_file)
        runner = crawler.CrawlerRunner(
            settings={
                "FEEDS": {
                    name_file_json: {"format": "json"},
                },
                "CONCURRENT_REQUESTS": 50,
                "CONCURRENT_ITEMS": 50,
                "LOG_ENABLED": False,
                # 'ITEM_PIPELINES': ['MongoDBPipeline']
            }
        )
        deferred = runner.crawl(spider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        if len(show_list_tax_code) > 0:
            upload_file_to_blob(path_file)
            # db_connect()
        print(show_list_tax_code)
        print("Success!")
    except Exception as e:
        print(str(e))

def f(q):
    try:
        runner = crawler.CrawlerRunner({
            'ITEM_PIPELINES': {
                'scraper.pipelines.AttendancePipeline': 300,
            },
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy_splash.SplashCookiesMiddleware': 723,
                'scrapy_splash.SplashMiddleware': 725,
                'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            },
            'SPLASH_URL': environ['SPLASH_INSTANCE'],
            'SPIDER_MIDDLEWARES': {
                'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
            },
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        })
        deferred = runner.crawl(AttendanceSpider, USERNAME=USERNAME, PASSWORD=PASSWORD, chatID=chatID)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)

def f(q):
    try:
        runner = crawler.CrawlerRunner({
            'ITEM_PIPELINES': {
                'scraper.pipelines.LecturePipeline': 300,
                'scraper.pipelines.PracticalPipeline': 400,
                'scraper.pipelines.AttendanceScreenshotPipeline': 500,
            },
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy_splash.SplashCookiesMiddleware': 723,
                'scrapy_splash.SplashMiddleware': 725,
                'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            },
            'SPLASH_URL': environ['SPLASH_INSTANCE'],
            'SPIDER_MIDDLEWARES': {
                'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
            },
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        })
        deferred = runner.crawl(AttendanceSpider, username=username, password=password, chatID=chatID)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)

def f(q):
    try:
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(CoinmarketcapSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)

def f(return_list):
    def collect_items(signal, sender, item, response, spider):
        return_list.append(item)

    dispatcher.connect(collect_items, signal=signals.item_passed)
    runner = crawler.CrawlerRunner()
    deferred = runner.crawl(PlantInfoSpider, url=returned_url)
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()

def f(q):
    try:
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(GoogleSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(GoogleSpider.google_response)
    except Exception as e:
        q.put(e)

def f(q):
    try:
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(GallerySpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(GallerySpider.getGallery())
    except Exception as e:
        q.put(e)

def f(q):
    try:
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(spider, *args, **kwargs)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)

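# Usage sketch (assumption, not part of the snippets above): these f(q)-style
# runners are typically launched in a child process so the Twisted reactor can
# be started cleanly for every crawl. The helper name run_in_process is
# hypothetical; it only illustrates the queue-based error reporting pattern.
from multiprocessing import Process, Queue

def run_in_process(target):
    # Start the f(q)-style function in its own process, wait for the value it
    # reports through the queue, and re-raise any exception it sent back.
    q = Queue()
    p = Process(target=target, args=(q,))
    p.start()
    result = q.get()
    p.join()
    if isinstance(result, Exception):
        raise result
    return result

# Example: run_in_process(f)
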
def run_spider(spider, crawler_settings, root_urls, allowed_domains, depth, request_id):
    runner = crawler.CrawlerRunner(settings=crawler_settings)
    deferred = runner.crawl(UrlExtractor, root=root_urls, allow_domains=allowed_domains,
                            depth=depth, request_id=request_id)
    return deferred

def runSpider2(spider):
    runner = crawler.CrawlerRunner(get_project_settings())
    for i in range(len(config.keyWords)):
        keyWord = config.keyWords[i]
        spiderDataFile = 'news-' + str(i + 1) + '.json'
        runner.crawl(spider, keyWord=keyWord, spiderDataFile=spiderDataFile)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()

def f(q):
    try:
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(Airbnb0Spider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        print(e)
        q.put(e)

def start_to_crawl(request_url, url_id):
    """Helper that starts a crawl for a single URL."""
    runner = crawler.CrawlerRunner()
    runner.crawl(GenericSpider, myurl=request_url, url_id=url_id)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()

def run_spider(self, queue, args):
    try:
        runner = crawler.CrawlerRunner(get_project_settings())
        deferred = runner.crawl(LiveOptionSpider.kLiveOptionSpider,
                                symbol=args[0][0],
                                expiries=args[0][1],
                                instrumentType=args[0][2],
                                databaseTableName=args[0][3])
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        queue.put(None)
    except Exception as e:
        queue.put(e)

def f(q):
    try:
        runner = crawler.CrawlerRunner()
        # crawl() expects the spider class; spider arguments are passed as keywords
        deferred = runner.crawl(Jokes_SPider, start_urls=['test'])
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)

def f(q):
    try:
        runner = crawler.CrawlerRunner({
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        })
        deferred = runner.crawl(spider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)

def f(q, spiders):
    try:
        runner = crawler.CrawlerRunner({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
        for spider in spiders:
            deferred = runner.crawl(spider)
            # reactor.run()
        deferred.addBoth(lambda _: reactor.stop())
        print('reactor, stop-start')
        reactor.run()
        q.put(None)
    except Exception as e:
        print('run reactor exception occurred!')
        q.put(e)

def run_spider(self, queue, args):
    try:
        runner = crawler.CrawlerRunner(get_project_settings())
        deferred = runner.crawl(HistoricEquitySpider.kHistoricEquitySpider,
                                symbol=args[0][0],
                                startDate=args[0][1],
                                endDate=args[0][2],
                                databaseTableName=args[0][3],
                                coldStart=args[0][-1])
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        queue.put(None)
    except Exception as e:
        queue.put(e)

def f(q, alreadyUsedWordList, notYetUsedWordList, setting, spider, search_value):
    try:
        runner = crawler.CrawlerRunner(setting)
        deferred = runner.crawl(spider, search_field=search_value)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        print('In multi')
        print(wordSet)
        for word in wordSet:
            if word not in alreadyUsedWordList and word not in notYetUsedWordList:
                notYetUsedWordList.append(word)
        q.put(None)
    except Exception as e:
        q.put(e)

def f(q):
    try:
        runner = crawler.CrawlerRunner(get_project_settings())
        deferred = runner.crawl(spider,
                                entry_url=self.entry_url,
                                document_xpath=self.document_xpath,
                                image_xpath=self.image_xpath,
                                allow_rule=self.allow_rule,
                                deny_rule=self.deny_rule,
                                page_limit=self.page_limit,
                                exclude_reg=self.exclude_reg)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)

def f(q):
    try:
        print('A')
        runner = crawler.CrawlerRunner(get_project_settings())
        print('B')
        deferred = runner.crawl(MySpider)
        print('C')
        deferred.addBoth(lambda _: reactor.stop())
        print('D')
        reactor.run()
        print('E')
        q.put(None)
        print('F')
    except Exception as e:
        print('THIS IS AN EXCEPTION MOFO')
        q.put(e)  # report the error to the parent process before re-raising
        raise

def run():
    # process = CrawlerProcess(get_project_settings())
    # print(keyWord, spiderDataFile)
    # process.crawl('baidu_spider', keyWord=keyWord, spiderDataFile=spiderDataFile)
    # process.start()
    try:
        print(keyWord, spiderDataFile)
        runner = crawler.CrawlerRunner(get_project_settings())
        deferred = runner.crawl('baidu_spider', keyWord=keyWord, spiderDataFile=spiderDataFile)
        # deferred.addBoth(lambda _: reactor.stop())
        # reactor.run()
        runner.join()
    except Exception as e:
        pass

def run(d, f):
    runner = crawler.CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    })
    kategoriar = [
        'reise', 'mote-klær', 'sko', 'mote-tilbehør', 'sport', 'elektronikk',
        'interiør', 'hus-og-hage', 'bil-og-motor', 'kjæledyr', 'skjønnhet',
        'underholdning', 'barn', 'tjenester'
    ]
    for kategori in kategoriar:
        runner.crawl(ViatrumfSpider, kategori=kategori)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()

def run_spider(spider): try: runner = crawler.CrawlerRunner( settings={ "FEEDS": { path_file_json + name_file_json: { "format": "json" }, }, "CONCURRENT_REQUESTS": 50, "CONCURRENT_ITEMS": 50, "FEED_EXPORT_ENCODING": 'utf-8', "LOG_ENABLED": False, # 'ITEM_PIPELINES': ['MongoDBPipeline'] }) deferred = runner.crawl(spider) deferred.addBoth(lambda _: reactor.stop()) reactor.run() print("Success!") except Exception as e: print(str(e))
    if 'filename' in item:
        logging.info('found %s' % item['url'])
        abspath = os.path.join(args.d, item['filename'])
        if os.path.exists(abspath):
            logging.info('%s exists already, this item was skipped' % abspath)
        else:
            rjitems.append(item)
    return item

list_settings = get_project_settings()
list_settings.set('ITEM_PIPELINES', {
    '__main__.RJPipeline': 100,
})
list_runner = crawler.CrawlerRunner(list_settings)

dl_settings = get_project_settings()
dl_settings.set('FILES_STORE', args.d)
dl_settings.set('ITEM_PIPELINES', {
    'scrapy.pipelines.files.FilesPipeline': None,
    'dlsitebackup.pipelines.DLFilesPipeline': 700
})
dl_settings.set('CONCURRENT_ITEMS', 1)
dl_settings.set('CONCURRENT_REQUESTS', 1)
dl_runner = crawler.CrawlerRunner(dl_settings)

logging.basicConfig(
    filename='backup.log',
    format='%(levelname)s: %(message)s',
    level=logging.INFO
)
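
# Sketch of how the two runners above could be chained on a single reactor
# (assumption: RJListSpider and DLSpider are hypothetical stand-ins; the
# excerpt does not show which spider classes are actually passed to crawl()).
from twisted.internet import defer, reactor

@defer.inlineCallbacks
def crawl_sequence():
    # Run the listing crawl first so rjitems is populated, then hand the
    # collected items to the download runner, and finally stop the reactor.
    yield list_runner.crawl(RJListSpider)
    yield dl_runner.crawl(DLSpider, items=rjitems)
    reactor.stop()

crawl_sequence()
reactor.run()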