def run():
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(mabelleSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
import os

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging

from utils.mover import move_nas

os.chdir('/root/spiderItem/pyusa/pyusa')

configure_logging()
runner = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl():
    yield runner.crawl('update_company_list')
    yield runner.crawl('update_company_docs')
    reactor.stop()


crawl()
reactor.run()

move_nas('/data/lq/usa/reports', '/data/usa')
move_nas('/data/lq/usa/announcements', '/data/usa_announcements')
def loop_crawl():
    runner = CrawlerRunner(get_project_settings())
    crawl(runner)
    reactor.run()
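# A minimal sketch (not from the original source) of the crawl(runner) helper
# that loop_crawl() assumes: the Deferred returned by runner.crawl() is used to
# schedule the next run via reactor.callLater. SomeSpider and the 60-second
# delay are illustrative placeholders, not names from the original project.
from twisted.internet import reactor

def crawl(runner):
    d = runner.crawl(SomeSpider)  # hypothetical spider class
    # once this crawl finishes, wait 60 seconds and schedule the next one
    d.addBoth(lambda _: reactor.callLater(60, crawl, runner))
    return d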
def __init__(self, session):
    self.session = session
    self.review_urls = []
    self.profile_urls = []
    settings = self.__configure_project()
    self.runner = CrawlerRunner(settings=settings)
def test_crawler_runner_accepts_None(self):
    runner = CrawlerRunner()
    self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
        item['store'] = final_product.store
        item['category'] = final_product.category
        item['animal'] = final_product.animal
        item['date'] = final_product.date
        item['date_str'] = final_product.date_str
        yield item

        next_page = response.css(
            'a.fa-chevron-right::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)


configure_logging()
RUNNER = CrawlerRunner()


@defer.inlineCallbacks
def crawl():
    """ Execute the spiders sequentially """
    yield RUNNER.crawl(TiendapetDogFoodSpider)
    yield RUNNER.crawl(TiendapetDogMedSpider)
    reactor.stop()


crawl()
reactor.run()  # the script will block here until the last crawl call is finished
        except:
            return parties_voted

        cdu_csu = parties_voted[cdu_cdu_pos - 1:cdu_cdu_pos + 2]
        cdu_csu_string = ''.join(cdu_csu)
        parties_voted_fixed = parties_voted[0:cdu_cdu_pos - 1] \
            + [cdu_csu_string] \
            + parties_voted[cdu_cdu_pos + 2:len(parties_voted)]
        return parties_voted_fixed

    def clean_umlaute(self, input):
        replacers = {'ä': 'ae',
                     'ö': 'oe',
                     'ü': 'ue',
                     'ß': 'ss',
                     'Ä': 'AE',
                     'Ö': 'OE',
                     'Ü': 'UE',
                     '–': '-'
                     }
        for key, value in replacers.items():
            input = input.replace(key, value)
        return input


process = CrawlerRunner()
crawler = process.crawl(UmfrageerbegnisseSpider)
crawler.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
def _runner(self):
    return CrawlerRunner({'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'})
import os
import json

import crochet

from scraping.webscrapy.spiders.amazonscraper import AmazonspiderSpider
from scraping.webscrapy.spiders.playstationscraper import PlaystationspiderSpider
from scraping.webscrapy.spiders.nintendoscraper import NintendospiderSpider
from scraping.webscrapy.spiders.metacriticscraper import MetacriticspiderSpider
from scraping.webscrapy.spiders.howlongtobeatscraper import HowLongToBeatspiderSpider
from flask import Flask, render_template, jsonify, request, redirect, url_for
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.signalmanager import dispatcher
from scrapy.utils.project import get_project_settings

final_data = {}
crochet.setup()
output_data = []

crawl_runner = CrawlerRunner({
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 OPR/75.0.3969.259'
})

actual_path = os.path.dirname(os.path.abspath(__file__))
games_path = actual_path + "/game_list.json"
scrape_result_path = actual_path + "/scrape_result.json"
outputfile_amazon = actual_path + "/outputfile_amazon.json"
outputfile_howlongtobeat = actual_path + "/outputfile_howlongtobeat.json"
outputfile_playstation = actual_path + "/outputfile_playstation.json"
outputfile_metacritic = actual_path + "/outputfile_metacritic.json"

f = open(games_path)
games_data = json.load(f)


def doScraping(game):
def test_async_def_asyncio_parse(self):
    runner = CrawlerRunner({
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    })
    runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"),
                 mockserver=self.mockserver)
    with LogCapture() as log:
        yield runner.join()
    self.assertIn("Got response 200", str(log))
def main():
    username = raw_input('Username: ')
    password = getpass.getpass('Password: ')
    rule = {
        'keyword': raw_input('Keyword: '),
        'ori': int(raw_input('Try to download the original image? (1 original / 0 thumbnail): ') or "0"),
        'star': float(raw_input('Minimum star rating (default 0, max 5, up to two decimal places): ') or "0"),
        'fav': int(raw_input('Minimum number of favorites (default 0): ') or "0"),
        'start_page': int(raw_input('Start downloading from which page (default 1): ') or "1"),
        'end_page': int(raw_input('Download up to which page (default 5): ') or "5"),
        'doujinshi': (raw_input('Include doujinshi? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'manga': (raw_input('Include manga? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'artist_cg': (raw_input('Include artist_cg? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'game_cg': (raw_input('Include game_cg? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'western': (raw_input('Include western? (1 include / 0 exclude, default exclude): ') or "0") == "0" and "off" or "on",
        'non_h': (raw_input('Include non_h? (1 include / 0 exclude, default exclude): ') or "0") == "0" and "off" or "on",
        'image_set': (raw_input('Include image_set? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'cosplay': (raw_input('Include cosplay? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'asian_porn': (raw_input('Include asian_porn? (1 include / 0 exclude, default exclude): ') or "0") == "0" and "off" or "on",
        'misc': (raw_input('Include misc? (1 include / 0 exclude, default exclude): ') or "0") == "0" and "off" or "on",
    }

    settings = get_project_settings()
    # disable the scrapy log
    # configure_logging(settings)
    runner = CrawlerRunner(settings)
    runner.crawl(ExHentaiSpider, user={
        'username': username,
        'password': password,
    }, rule=rule)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def main():
    configure_logging()
    runner = CrawlerRunner()
    task = LoopingCall(lambda: runner.crawl(CovidGeoInfoSpider))
    task.start(60 * 15, now=True)
    reactor.run()
def crawl(url):
    runner = CrawlerRunner()
    d = runner.crawl(PhilomathSpider, start_urls=[url])
    d.addBoth(lambda _: reactor.stop())
    reactor.run(installSignalHandlers=0)
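# Hedged usage sketch, not part of the original snippet: a Twisted reactor
# cannot be restarted within one interpreter, and installSignalHandlers=0 hints
# that crawl() is meant to run off the main thread or process. One common way
# to call it repeatedly is from a short-lived child process; the use of
# multiprocessing here is an assumption.
from multiprocessing import Process

def crawl_in_subprocess(url):
    p = Process(target=crawl, args=(url,))  # run crawl() in its own process
    p.start()
    p.join()  # wait for the child (and its reactor) to finish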
def start(base_url, max_urls_to_scrap=50):
    '''More settings can be added here to change the spider behaviour
    https://docs.scrapy.org/en/latest/topics/settings.html'''
    process = CrawlerRunner(settings)
    return process.crawl(LinkSpider, start_urls=[base_url],
                         max_to_scrap=max_urls_to_scrap)
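# Hedged sketch of consuming the Deferred that start() returns. Using crochet
# (which appears elsewhere on this page) is an assumption; the original project
# may drive the reactor differently. The 300-second timeout is illustrative.
import crochet
crochet.setup()  # starts the Twisted reactor in a background thread

@crochet.wait_for(timeout=300)
def scrape_blocking(base_url):
    # runs in the reactor thread; the caller blocks until the crawl finishes
    return start(base_url)

# scrape_blocking('https://example.com')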
def crawl(reactor):
    runner = CrawlerRunner()
    d = runner.crawl(GeocachingSpider.GeocachingSpider)
    d.addCallback(getResult)
    d.addCallback(crawl2, runner)
    return d
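# Hedged usage sketch: a crawl(reactor) function that returns a Deferred is
# typically handed to twisted.internet.task.react, which starts the reactor,
# waits for the Deferred, and then shuts everything down. getResult and crawl2
# are defined elsewhere in the original project; the __main__ guard is an
# assumption.
from twisted.internet import task

if __name__ == '__main__':
    task.react(crawl)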
def test_crawler_runner_accepts_dict(self):
    runner = CrawlerRunner({'foo': 'bar'})
    self.assertEqual(runner.settings['foo'], 'bar')
    self.assertEqual(runner.settings['RETRY_ENABLED'],
                     default_settings.RETRY_ENABLED)
    self.assertIsInstance(runner.settings, Settings)
def run_generic_spider(user_id=None, spider_id=None, datamodel=None,
                       run_spider_config=None, test_limit=None):
    """ just launch run_generic_spider() from any handler in controller """

    print
    log_scrap.info("--- run_generic_spider / spider_id : %s ", spider_id)

    # !!! spider is launched from main.py level !!!
    # all relative routes referring to this...
    log_scrap.info("--- run_generic_spider / os.getcwd() : %s ", os.getcwd())

    ### flattening run_spider_config : from nested to flat dict
    log_scrap.info("--- run_generic_spider / 'flattenSpiderConfig()' on 'run_spider_config' --> 'spider_config_flat' ...")
    spider_config_flat = flattenSpiderConfig(run_spider_config)

    ### settings for crawler
    # cf : https://hackernoon.com/how-to-crawl-the-web-politely-with-scrapy-15fbe489573d

    # global settings for scrapy processes (see upper)
    log_scrap.info("--- run_generic_spider / BOT_NAME : %s ", settings.get('BOT_NAME'))
    log_scrap.info("--- run_generic_spider / USER_AGENT : %s ", settings.get('USER_AGENT'))
    log_scrap.info("--- run_generic_spider / ITEM_PIPELINES : %s ", settings.get('ITEM_PIPELINES').__dict__)

    # specific settings for this scrapy process
    settings.set("CURRENT_SPIDER_ID", spider_id)
    settings.set("DOWNLOAD_DELAY", DOWNLOAD_DELAY)
    settings.set("RANDOMIZE_DOWNLOAD_DELAY", RANDOMIZE_DOWNLOAD_DELAY)

    ### initiating crawler process
    log_scrap.info("--- run_generic_spider / instantiate process ...")
    # process = CrawlerRunner()
    # process = CrawlerProcess()
    process = CrawlerRunner(settings=settings)

    ### adding CrawlerRunner as deferred
    def f(q):
        try:
            ### send/create custom spider from run_spider_config
            ### cf : https://stackoverflow.com/questions/35662146/dynamic-spider-generation-with-scrapy-subclass-init-error
            deferred = process.crawl(GenericSpider,
                                     user_id=user_id,
                                     datamodel=datamodel,
                                     spider_id=spider_id,
                                     spider_config_flat=spider_config_flat,
                                     test_limit=test_limit)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    ### putting task in queue and start
    q = Queue()
    p = Process(target=f, args=(q, ))
    p.start()
    result = q.get()
    p.join()
    if result is not None:
        raise result

    print "\n\n{}\n".format("> > > " * 20)

    #############################################
    ### cool snippets

    ### convert to class object
    # spider = globals()[spider]
def crawl_and_scrape(url):
    """
    Recursively crawl pages starting from the given url and return the text content that was collected.

    Args:
        url (str): The url at which the recursive crawl starts.

    Returns:
        (list): List of collected contents. Each content is a dict: {"url": str, "title": str, "text": str}
    """
    # output_path is unique to the url's domain
    output_path = get_contents_path(url)

    # If this domain has already been crawled
    if os.path.exists(output_path):
        try:
            with open(output_path, encoding="utf-8") as f:
                contents = json.load(f)
            return contents
        except:
            os.remove(output_path)

    settings = {
        # "USER_AGENT": "",
        "EXTENSIONS": {
            # 'scrapy.extensions.telnet.TelnetConsole': None,
            'scrapy.extensions.closespider.CloseSpider': 1,
        },
        "CLOSESPIDER_TIMEOUT": 0,
        "CLOSESPIDER_ITEMCOUNT": 30,
        "CLOSESPIDER_PAGECOUNT": 0,
        "CLOSESPIDER_ERRORCOUNT": 0,
        "CONCURRENT_REQUESTS": 16,
        "DOWNLOAD_DELAY": 1,       # interval between requests
        "DEPTH_LIMIT": 2,          # recursion depth limit
        "FEED_FORMAT": "json",
        "FEED_URI": output_path,   # output file path
        "FEED_EXPORT_ENCODING": 'utf-8',
    }

    print("crawl start")

    # Run the crawl
    # process: CrawlerProcess = CrawlerProcess(settings=settings)
    # process.crawl(MySpider, [url])
    # process.start()  # the script will block here until the crawling is finished
    runner: CrawlerRunner = CrawlerRunner(settings=settings)
    d = runner.crawl(MySpider, [url])
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished

    # The scraping results are saved at output_path.
    try:
        with open(output_path, encoding="utf-8") as f:
            contents = json.load(f)
    except:
        contents = None

    print("crawl end")
    return contents
def spider_process(spider, settings=None):
    """Runs a scrapy CrawlerRunner"""
    runner = CrawlerRunner(settings)
    deferred = runner.crawl(spider)
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()
def consume(reactor, hosts='kafka-server1:9092'):
    topic = 'crawl-queue'
    client = yield ready_client(reactor, hosts, topic)

    partitions = client.topic_partitions[topic]
    print(f'PARTITIONS: {partitions}')

    settings = project.get_project_settings()
    runner = CrawlerRunner(settings=settings)
    spiders = make_spider_dict(settings=settings)

    def process(consumer, message_list):
        """
        This function is called for every batch of messages received from
        Kafka. It may return a Deferred, but this implementation just logs the
        messages received.
        """
        deferreds = []
        for m in message_list:
            log.debug("Got message %r", m)
            mo = json.loads(m.message.value)
            log.info(mo)
            log.info(consumers)
            try:
                spider_obj = spiders[mo['spider']]
            except KeyError as e:
                log.error(
                    f"Unable to find spider '{mo['spider']}'. Ignoring error {e}"
                )
                continue
            d = runner.crawl(spider_obj, rss_item=mo)
            deferreds.append(d)

        def consumer_commit(r):
            success = all(list(zip(*r))[0])
            if success:
                log.info("Committing to consumer!")
                d = consumer.commit()
                d.addCallback(lambda _: log.info("Successfully committed."))
            else:
                log.error("A consumer failed. Not committing...")

        dl = defer.DeferredList(deferreds)
        dl.addBoth(consumer_commit)
        consumer.shutdown()

    consumers = [
        Consumer(client, topic, partition, process,
                 consumer_group='scraper-group',
                 auto_offset_reset=OFFSET_EARLIEST,
                 buffer_size=1024)
        for partition in partitions
    ]

    def cb_closed(result):
        """
        Called when a consumer cleanly stops.
        """
        log.info("Consumer stopped")

    def eb_failed(failure):
        """
        Called when a consumer fails due to an uncaught exception in the
        processing callback or a network error on shutdown. In this case we
        simply log the error.
        """
        log.error("Consumer failed: %s", failure)

    def start_consumer(consumer):
        log.info("Consumer started.")
        d = consumer.start(OFFSET_COMMITTED)
        d.addCallbacks(cb_closed, eb_failed)
        return d

    def stop_consumers():
        log.info("\n")
        log.info("Time is up, stopping consumers...")
        d = defer.gatherResults([c.shutdown() for c in consumers])
        d.addCallback(lambda result: client.close())
        return d

    yield defer.gatherResults(
        [start_consumer(c) for c in consumers]
        # [task.deferLater(reactor, 10.0, stop_consumers)]
    )
def setUp(self):
    self.mockserver = MockServer()
    self.mockserver.__enter__()
    self.runner = CrawlerRunner()
runner = CrawlerRunner(settings=Settings({
    'DOWNLOAD_DELAY': 3,
    'CONCURRENT_REQUESTS': 20,
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': False,  # Cache enabled for testing
    'HTTPCACHE_EXPIRATION_SECS': 0,
    'TELNETCONSOLE_PORT': None,
    'RETRY_ENABLED': False,
    'REDIRECT_ENABLED': True,
    'COOKIES_ENABLED': False,
    'REACTOR_THREADPOOL_MAXSIZE': 20,
    'DOWNLOAD_TIMEOUT': 30,  # To avoid loss of entries?

    # Retry many times since proxies often fail
    'RETRY_TIMES': 10,
    # Retry on most error codes since proxies fail for different reasons
    'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],

    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 610,
        'random_useragent.RandomUserAgentMiddleware': 400,
        'rotating_proxies.middlewares.RotatingProxyMiddleware': 110,
        'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
    },
    'PROXY_LIST': PROXY_PATH,
    'PROXY_MODE': 0,
    'USER_AGENT_LIST': USER_PATH
}))
def test_crawler_runner_accepts_dict(self):
    runner = CrawlerRunner({'foo': 'bar'})
    self.assertEqual(runner.settings['foo'], 'bar')
    self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
def crawl(settings=None):
    runner = CrawlerRunner(settings=settings)
    runner.crawl(ClusterSpider)
    runner.join()
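# Hedged note: runner.join() returns a Deferred and nothing above starts the
# Twisted reactor, so crawl() as written only schedules work. A minimal sketch
# of driving it to completion (the reactor wiring is an assumption, not part of
# the original snippet):
from twisted.internet import reactor

def crawl_blocking(settings=None):
    runner = CrawlerRunner(settings=settings)
    runner.crawl(ClusterSpider)
    d = runner.join()  # fires once every scheduled crawl has finished
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # blocks until the crawl is done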
def test_crawler_runner_bootstrap_successful_for_several(self):
    runner = CrawlerRunner()
    yield runner.crawl(NoRequestsSpider)
    yield runner.crawl(NoRequestsSpider)
    self.assertEqual(runner.bootstrap_failed, False)
To run this app, run it directly:

    python flask_twisted.py

Alternatively, use Twisted's `twist` executable. This assumes you're in the
directory where the source files are located:

    PYTHONPATH=$(pwd) twist web --wsgi flask_twisted.app --port tcp:9000:interface=0.0.0.0
"""
import json

from flask import Flask
from scrapy.crawler import CrawlerRunner

from quote_scraper import QuoteSpider

app = Flask('Scrape With Flask')
crawl_runner = CrawlerRunner()  # requires the Twisted reactor to run
quotes_list = []  # store quotes
scrape_in_progress = False
scrape_complete = False


@app.route('/greeting')
@app.route('/greeting/<name>')
def greeting(name='World'):
    return 'Hello %s!' % (name)


@app.route('/crawl')
def crawl_for_quotes():
    """
    Scrape for quotes
        current = response.xpath('//a[contains(@class, "card current")]//div[@class="temp"]/span[1]/text()').get()
        real_feel = response.xpath('//a[contains(@class, "card current")]//div[@class="real-feel"]/text()').get()

        # Data cleanup
        ciudad = ciudad.replace('\n', '').replace('\r', '').strip()
        current = current.replace('°', '').replace('\n', '').replace('\r', '').strip()
        real_feel = real_feel.replace('RealFeel®', '').replace('°', '').replace('\n', '').replace('\r', '').strip()

        # Save the data to a file
        f = open("./datos_clima_scrapy.csv", "a")
        f.write(ciudad + "," + current + "," + real_feel + "\n")
        f.close()

        print(ciudad)
        print(current)
        print(real_feel)
        print()

        # No need to yield here. yield is useful when saving the data to a
        # file while running Scrapy from the terminal.


# Logic to run a Scrapy extraction periodically, i.e. to automate it.
runner = CrawlerRunner()
task = LoopingCall(lambda: runner.crawl(ExtractorClima))  # To research: anonymous functions in Python
task.start(20)  # Time in seconds after the program's first run before repeating the extraction
reactor.run()

# Seconds in 1 day: 86400
# Seconds in 1 hour: 3600
# Seconds in 1 week: 604800
# Seconds in 1 month: 2.628e+6
# Seconds in 1 minute: 60
def f(q):
    try:
        print "running...", spider_name

        ### setting up output directory for the spider.
        detail_path = '{ymd}/{spider}'.format(
            ymd=st_time.strftime('%Y%m%d'), spider=spider_name)
        html_path = os.path.join(html_base_dir, detail_path)
        html_path = os.path.join(html_path, '{dttm}.html')
        if not os.path.exists(os.path.dirname(html_path)):
            os.makedirs(os.path.dirname(html_path))

        ### setting up log directory for the spider
        sp_log_path = log_path.format(spider=spider_name)
        sp_err_log_path = err_log_path.format(spider=spider_name)
        if not os.path.exists(os.path.dirname(sp_log_path)):
            os.makedirs(os.path.dirname(sp_log_path))
        if not os.path.exists(os.path.dirname(sp_err_log_path)):
            os.makedirs(os.path.dirname(sp_err_log_path))

        ### setting up logger for the spider
        logger = logging.getLogger(spider_name + '_logger')
        logger.setLevel(logging.DEBUG)
        debug_handler = logging.FileHandler(sp_log_path)
        error_handler = logging.FileHandler(sp_err_log_path)
        debug_handler.setLevel(logging.DEBUG)
        error_handler.setLevel(logging.WARNING)
        formatter = logging.Formatter(
            '%(asctime)s:%(module)s - %(message)s')
        debug_handler.setFormatter(formatter)
        error_handler.setFormatter(formatter)
        logger.addHandler(debug_handler)
        logger.addHandler(error_handler)
        logger.info('logger created')

        ### preparing spider object.
        settings = project.get_project_settings()
        settings.set('ITEM_PIPELINES', {pipeline: 1}, priority='cmdline')
        spider_loader = spiderloader.SpiderLoader.from_settings(settings)
        spider = spider_loader.load(spider_name)
        spider.html_path = html_path
        spider.proxies = proxies
        spider.use_proxy = use_proxy
        spider.logger = logger
        spider.sqllogger = sqllogger
        spider.repeat_count = 0
        spider.repeat_threshold = 10
        spider.error_count = 0
        spider.error_threshold = 5

        ### starting spider queue and spider
        crawler_runner = CrawlerRunner(settings)  # from Scrapy docs
        deferred = crawler_runner.crawl(spider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)
import sys

from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

from spiders.gladiaspider import GladiaspiderSpider

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
settings = get_project_settings()
settings.set('FEED_FORMAT', 'csv')
settings.set('FEED_URI', 'stats.csv')
runner = CrawlerRunner(settings)

if len(sys.argv) > 2:
    d = runner.crawl(GladiaspiderSpider, max_r=sys.argv[1], player=sys.argv[2])
elif len(sys.argv) > 1:
    d = runner.crawl(GladiaspiderSpider, max_r=sys.argv[1])
else:
    d = runner.crawl(GladiaspiderSpider, max_r='100')

d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
def get_crawl_runner():
    return CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0',
        'LOG_LEVEL': 'INFO',
    })