def run_spider():
    """Run ``GetProxySpider`` with logging written to a dated log file.

    Builds the crawler settings (user agent, item pipelines, stdout
    redirection), points the standard ``logging`` module at a file below the
    directory derived from ``log_path``, then creates a ``CrawlerProcess``,
    schedules the spider and starts it.
    """
    settings = Settings()
    settings.set(
        "USER_AGENT",
        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    )
    settings.set("ITEM_PIPELINES", {
        'pipelines.FilterProxyPipline': 1,
        'pipelines.SaveProxyPipeline': 2,
    })
    # The key used to be "LOG_STDOUT " (trailing space), so the option was
    # stored under a name nothing reads and never took effect.
    settings.set("LOG_STDOUT", True)

    # Skip Scrapy's own root handler here; the handler is configured through
    # logging.basicConfig() below.
    configure_logging(install_root_handler=False)

    # Initialise the log directory.  A single timestamp is used for both the
    # directory and the file name so they cannot straddle a date boundary.
    now = datetime.now()
    logpath = now.strftime(log_path)
    if not os.path.isdir(logpath):
        os.makedirs(logpath)
    logging.basicConfig(
        filename=now.strftime('%s/%s_proxy.log' % (logpath, log_file)),
        format=log_format,
        level=logging.INFO
    )

    process = CrawlerProcess(settings)
    process.crawl(GetProxySpider)
    process.start()
def __init__(self, titlesfile = None, platform = None, region = None):
    """Build one search URL per title and crawl them with ``MobygamesSpider``.

    :param titlesfile: path of a text file holding one game title per line.
    :param platform: platform name; resolved through ``platforms.getId``.
    :param region: region name; ``"Worldwide"`` is used when it is ``None``.

    The constructor returns early (after logging an error) when no platform
    is given or the platform is not known, and logs a warning when no titles
    file is given.
    """
    # set default encoding to utf8 for parsing and logging
    # utf-8 characters in console and files (Python 2 only)
    reload(sys)
    sys.setdefaultencoding('utf8')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='export.log',
        filemode = 'a',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    # identify platform
    self.platform = platform
    if self.platform is None:
        logging.error('No platform found! Pass it as an argument.')
        return
    platformId = platforms.getId(self.platform)
    if platformId is None:
        logging.error('Platform ' + self.platform + ' not supported.')
        return

    self.titlesfile = titlesfile
    self.region = region
    if self.region is None:
        self.region = "Worldwide"

    if not titlesfile:
        logging.warning('No file.')
        return

    with open(self.titlesfile) as f:
        titles = f.read().splitlines()

    urls = []
    for title in titles:
        logging.debug('Submitting title:' + title)
        quoted_title = urllib.quote(title)
        urls.append(
            'http://mobygames.com/search/quick'
            # The title is quoted here as well: a raw title containing
            # spaces, '&' or '#' would otherwise corrupt the query string.
            + '?q=' + quoted_title
            + '&p=' + platformId
            + '&search=Go'
            '&sFilter=1'
            '&sG=on'
            '&search_title=' + quoted_title
            + '&search_platform=' + urllib.quote(self.platform)
            + '&search_region=' + urllib.quote(self.region)
        )

    process = CrawlerProcess(get_project_settings())
    process.crawl(MobygamesSpider, start_urls=urls)
    process.start()
def crawl_articles(spids):
    """Crawl the spiders named in ``spids`` in random order.

    :param spids: iterable of spider names; if it contains ``'all'`` every
        spider known to the spider loader is crawled.  Names the loader does
        not know are ignored.

    Returns without starting the reactor when no spider is selected.
    """
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)

    runner = CrawlerRunner(settings)
    loader = runner.spider_loader

    # Ask the loader for its spider names once instead of once per
    # requested id.
    known_names = loader.list()
    if 'all' in spids:
        spids = known_names
    known = set(known_names)
    spiders = [loader.load(spid) for spid in spids if spid in known]
    if not spiders:
        return

    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
def test_spider_custom_settings_log_level(self):
    """A spider's ``custom_settings`` must update the Scrapy root handler.

    The handler starts at DEBUG after ``configure_logging()`` and is expected
    to switch to INFO (writing to the temporary log file) once a ``Crawler``
    is created for a spider that sets ``LOG_LEVEL``/``LOG_FILE``.
    """
    with tempfile.NamedTemporaryFile() as log_file:

        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'LOG_LEVEL': 'INFO',
                'LOG_FILE': log_file.name,
            }

        configure_logging()
        self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
        crawler = Crawler(MySpider, {})
        self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
        info_before = crawler.stats.get_value('log_count/INFO')

        # Emit one record per level through the root logger.
        for level_name in ('debug', 'info', 'warning', 'error'):
            getattr(logging, level_name)(level_name + ' message')

        log_text = log_file.read().decode('utf8')

    self.assertNotIn('debug message', log_text)
    for expected in ('info message', 'warning message', 'error message'):
        self.assertIn(expected, log_text)

    stats = crawler.stats
    self.assertEqual(stats.get_value('log_count/ERROR'), 1)
    self.assertEqual(stats.get_value('log_count/WARNING'), 1)
    self.assertEqual(stats.get_value('log_count/INFO') - info_before, 1)
    self.assertEqual(stats.get_value('log_count/DEBUG', 0), 0)
def crawler_start(usage, tasks):
    """Start the selected spiders or validators through the Scrapy core API.

    ``usage`` selects the spider family: ``'crawler'`` uses
    ``CRAWLER_TASK_MAPS``/``DEFAULT_CRAWLERS``, anything else uses
    ``TEMP_TASK_MAPS``/``DEFAULT_VALIDATORS``.  There are four kinds of
    spiders: common, ajax, gfw, ajax_gfw.  When ``tasks`` is empty every
    default spider of the family runs; otherwise each task is matched against
    the defaults and unmatched tasks are silently ignored.
    """
    is_crawler = usage == 'crawler'
    maps = CRAWLER_TASK_MAPS if is_crawler else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if is_crawler else DEFAULT_VALIDATORS

    if tasks:
        cases = [BaseCase(candidate) for candidate in origin_spiders]
        spiders = []
        for task in tasks:
            # First case accepting the task wins; invalid tasks are skipped.
            matched = next(
                (case for case in cases if case.check(task, maps)), None)
            if matched is not None:
                spiders.append(matched.spider)
    else:
        spiders = origin_spiders

    if not spiders:
        # Nothing to start.
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)

    def _stop_reactor(_result):
        reactor.stop()

    runner.join().addBoth(_stop_reactor)
    reactor.run()
def handle_lj(self):
    """Crawl ``LianjiaHouseSpider`` and run the reactor until it finishes."""
    log_options = {'LOG_FORMAT': '%(levelname)s: %(message)s'}
    configure_logging(log_options)

    def _stop_reactor(_result):
        reactor.stop()

    crawl_runner = CrawlerRunner(crawler_setting)
    finished = crawl_runner.crawl(LianjiaHouseSpider)
    finished.addBoth(_stop_reactor)
    reactor.run()
def run_spider():
    """Run the crawl-rule version of ``BookToscrapeSpider``.

    Project settings are loaded, overridden with the local ``options`` and
    handed to a ``CrawlerRunner``; the reactor runs until the crawl ends.
    """
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)

    # BookToscrapeSpider basic version (imported as before, not crawled).
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider  # noqa: F401
    # BookToscrapeSpider crawl version.
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl

    runner = CrawlerRunner(settings)
    # Pass the spider class, not an instance: the runner instantiates the
    # spider itself and current Scrapy releases reject spider objects.
    runner.crawl(BookToscrapeSpider_crawl)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def main():
    """Configure logging, run the Twisted reactor, then serve the aiohttp app.

    Notes kept from the original author:

    * Scrapy pull request idea: ``configure_logging()`` should accept a
      config argument.
    * ``scrapy.utils.log.TopLevelFormatter`` is useful; to cover more
      packages one would need to reach the Scrapy logger's handler and
      replace its filter with a ``TopLevelFormatter`` built with more names,
      e.g. ``['scrapy', 'scrapybox', 'aiohttp']``.
    """
    configure_logging(settings)
    logger.info('Scrapybox server starting')

    # NOTE(review): reactor.run() normally blocks until the reactor stops,
    # so the code below would only run afterwards -- confirm this is the
    # intended start-up order.
    twisted.internet.reactor.run()
    logger.info('Twisted reactor running')

    event_loop = asyncio.get_event_loop()
    app = aiohttp.web.Application(loop=event_loop)

    template_loader = jinja2.FileSystemLoader(user_path)
    aiohttp_jinja2.setup(app, loader=template_loader)

    app.on_shutdown.append(on_shutdown)
    app['static_path'] = user_path
    scrapybox.server.routes.add(app)

    logger.info('Aiohttp server starting')
    aiohttp.web.run_app(app)
def test_spider_custom_settings_log_level(self):
    """A spider's ``custom_settings`` must update the Scrapy root handler.

    Uses a temporary path from ``self.mktemp()`` as the log file and checks
    both the file contents and the ``log_count/*`` stats after logging one
    record per level.
    """
    log_path = self.mktemp()

    class MySpider(scrapy.Spider):
        name = 'spider'
        custom_settings = {
            'LOG_LEVEL': 'INFO',
            'LOG_FILE': log_path,
            # disable telnet if not available to avoid an extra warning
            'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
        }

    configure_logging()
    self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
    crawler = Crawler(MySpider, {})
    self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
    info_before = crawler.stats.get_value('log_count/INFO')

    # Emit one record per level through the root logger.
    for level_name in ('debug', 'info', 'warning', 'error'):
        getattr(logging, level_name)(level_name + ' message')

    with open(log_path, 'rb') as handle:
        log_text = handle.read().decode('utf8')

    self.assertNotIn('debug message', log_text)
    for expected in ('info message', 'warning message', 'error message'):
        self.assertIn(expected, log_text)

    stats = crawler.stats
    self.assertEqual(stats.get_value('log_count/ERROR'), 1)
    self.assertEqual(stats.get_value('log_count/WARNING'), 1)
    self.assertEqual(stats.get_value('log_count/INFO') - info_before, 1)
    self.assertEqual(stats.get_value('log_count/DEBUG', 0), 0)
def handle_cap(self):
    """Placeholder for the captcha crawl: configures logging and prints 'skip'.

    The actual crawl (``AnjukeCaptchaSpider``) is currently disabled, so the
    runner is created but never used.
    """
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(crawler_setting)
    # d = runner.crawl(AnjukeCaptchaSpider)
    # d.addBoth(lambda _: reactor.stop())
    # reactor.run()

    # Parenthesised form prints the same text on Python 2 and is valid
    # syntax on Python 3 (the bare print statement is not).
    print('skip')
def runSpider(self, spider):
    """Schedule ``spider`` with a JSON feed written to ``output.json``.

    The reactor is stopped when the crawl ends; this method does not start
    the reactor itself.
    """
    configure_logging(
        {'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})

    feed_settings = Settings()
    for option, value in (('FEED_URI', 'output.json'),
                          ('FEED_FORMAT', 'json')):
        feed_settings.set(option, value)

    def _stop_reactor(_result):
        reactor.stop()

    finished = CrawlerRunner(feed_settings).crawl(spider)
    finished.addBoth(_stop_reactor)
def runProcess(self):
    """Crawl the available-courses spider, checking watches around scheduling.

    ``dbHandler.check_watches()`` is called once before and once after the
    crawl is scheduled; the reactor then runs until the crawl has finished.
    """
    configure_logging()
    dbHandler.check_watches()

    crawl_runner = CrawlerRunner()
    crawl_runner.crawl(spider.available_courses_spider)
    dbHandler.check_watches()

    def _stop_reactor(_result):
        reactor.stop()

    all_done = crawl_runner.join()
    all_done.addBoth(_stop_reactor)
    reactor.run()
def main():
    """Crawl ``LotoSpider`` with the time locale set to ``es_ES``.

    Runs the reactor until the crawl ends and returns ``None``.
    """
    locale.setlocale(locale.LC_TIME, 'es_ES')
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

    def _stop_reactor(_result):
        reactor.stop()

    finished = CrawlerRunner().crawl(LotoSpider)
    finished.addBoth(_stop_reactor)
    reactor.run()
def run(self, args, opts):
    """Crawl every spider known to the spider loader.

    A ``DatabaseLogHandler`` is attached to the root logger before the
    spiders are scheduled on ``self.crawler_process``.
    """
    configure_logging(install_root_handler=False)
    logging.getLogger().addHandler(DatabaseLogHandler())

    process = self.crawler_process
    for spider_name in process.spider_loader.list():
        process.crawl(process.spider_loader.load(spider_name))
    process.start()
def __init__(self, seedUrl='http://www.amazon.co.uk/gp/bestsellers/electronics?ie=UTF8&ref_=sv_ce_0'):
    """Set up the gadgetzon spider: seed URL, file logging and price parser.

    :param seedUrl: first URL to crawl; stored on ``self.seedUrl``.

    Log records are written to ``gadget.log`` next to this source file.
    """
    # Parenthesised form prints the same text on Python 2 and is valid
    # syntax on Python 3 (the bare print statement is not).
    print("Starting gadgetzon spider")
    self.seedUrl = seedUrl

    configure_logging(install_root_handler=False)
    self.thisDir = os.path.abspath(os.path.dirname(__file__))
    logConfigFile = os.path.join(self.thisDir, 'gadget.log')
    logging.basicConfig(
        filename=logConfigFile,
        format='%(asctime)s - %(name)s - %(levelname)s - %(module)s : %(lineno)d - %(message)s',
        level=logging.DEBUG
    )
    self.price_parser = price_parser.PriceParser()
def run(self, args, opts):
    """Crawl the single spider named by ``args[0]``.

    Raises ``UsageError`` unless exactly one spider name is given.  A
    ``DatabaseLogHandler`` is attached to the root logger before the crawl.
    """
    arg_count = len(args)
    if arg_count < 1:
        raise UsageError()
    if arg_count > 1:
        raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")

    spider_name = args[0]
    configure_logging(install_root_handler=False)
    logging.getLogger().addHandler(DatabaseLogHandler())

    process = self.crawler_process
    process.crawl(spider_name)
    process.start()
def scrape_it(output_path):
    """Crawl ``ListingSpider`` and export the items to ``items.csv``.

    :param output_path: directory in which ``items.csv`` is written.

    Logging is meant to be disabled for the run.
    """
    csv_filename = path.join(output_path, 'items.csv')
    # TODO fix cantrestartreactor problem
    settings = Settings(dict(
        FEED_FORMAT='csv',
        FEED_URI=csv_filename,
        # CrawlerProcess configures logging again from *its* settings, which
        # overrides the configure_logging() call below; LOG_ENABLED therefore
        # has to be part of the process settings to take effect.
        LOG_ENABLED=False,
    ))
    configure_logging({'LOG_ENABLED': False})
    process = CrawlerProcess(settings)
    process.crawl(ListingSpider)
    process.start()  # stop_after_crawl=False)
def perform_scrape(pick_or_shake):
    '''Perform a scraping run for either a PickSpider or a ShakeSpider.

    Args:
        pick_or_shake (str): string 'Pick' or 'Shake' specifying which
            spider to run (the comparison is case-sensitive).
    '''
    # get the settings and configure the logging level
    settings = scrapingtools.get_settings()
    settings.set('LOG_ENABLED', True)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

    # get the job list for the spider
    if pick_or_shake == 'Pick':
        joblist = get_pick_joblist(settings.get('SOURCE_FILE_NAME'))
    elif pick_or_shake == 'Shake':  # was `=`, which is a syntax error
        # NOTE(review): the Shake branch uses the same job-list helper as
        # Pick -- confirm whether a Shake-specific helper was intended.
        joblist = get_pick_joblist(settings.get('SOURCE_FILE_NAME'))
def scrape(self):
    '''Create a Scrapy process and scrape this user's start URL into a CSV.

    The listing search for ``self.userid`` is crawled with ``ListingSpider``
    and the items are exported to ``self.scraped_csv``.
    '''
    listing_url = 'http://airbnb.com/s?host_id=%i' % (self.userid)
    # TODO fix cantrestartreactor problem
    feed_options = dict(FEED_FORMAT='csv', FEED_URI=self.scraped_csv)
    process = CrawlerProcess(Settings(feed_options))
    process.crawl(ListingSpider, start_urls=[listing_url])
    # NOTE(review): logging is reconfigured only after the process and
    # crawler exist; keep this ordering unless confirmed otherwise.
    quiet_logging = {'LOG_ENABLED': False, 'LOG_LEVEL': 'CRITICAL'}
    configure_logging(quiet_logging)
    process.start()  # stop_after_crawl=False)
def main():
    """Crawl ``PttBoard`` and ``PTTArticle`` and print the reactor result.

    Both spiders are scheduled on one ``CrawlerRunner``; the reactor is
    stopped once both crawls have finished.
    """
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    # settings.set('FEED_FORMAT','json')
    # settings.set('FEED_URI', 'result.json')
    runner.crawl(PttBoard)
    runner.crawl(PTTArticle)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    result = reactor.run()  # the script will block here until the crawling is finished
    # Parenthesised form prints the same text on Python 2 and is valid
    # syntax on Python 3 (the bare print statement is not).
    print(result)
def cleanup(ctx):
    """
    Cleanup old cache entries.

    By default, entries older than 90 days will be removed. This value can be
    overriden in the config file.

    Returns 1 when the HTTP cache is disabled, otherwise ``None``.
    """
    settings = ctx.obj["settings"]
    # There is no CrawlerProcess here to set up logging for us, so it is
    # configured by hand.
    configure_logging(settings)

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
        return None

    logger.error("Cache is disabled, will not clean up cache dir.")
    return 1
def _run_feed_spider(url, feed):
    """Test-crawl a feed spider and persist its settings if it works.

    :param url: feed URL; saved via ``save_feed`` and checked for duplicates.
    :param feed: dict of spider settings.  A temporary ``_id`` is inserted
        for the duration of the test crawl and removed again afterwards.
    :raises Exception: when the crawl yields no articles or a spider for
        ``url`` already exists.
    """
    spid = str(uuid.uuid4())
    feed['_id'] = spid
    try:
        configure_logging(TEST_SETTINGS, install_root_handler=False)
        logging.getLogger('scrapy').setLevel(logging.WARNING)
        save_feed(url)
        cls = SpiderFactory.create_spider(feed)
        runner = CrawlerRunner(TEST_SETTINGS)
        d = runner.crawl(cls)
        d.addBoth(lambda _: reactor.stop())
        reactor.run(installSignalHandlers=False)

        n = get_stats([spid])[spid]
        if n == 0:
            raise Exception('feed spider crawled 0 articles')
        if is_exists_spider(url):
            raise Exception(f'feed[{url}] existed')
    finally:
        # Always strip the temporary id so a failed run does not leave it
        # behind in the caller's dict.
        feed.pop('_id', None)
    save_spider_settings(feed)
def perform_scrape():
    '''Perform a MunchSpider scrape using the current Scrapy settings.

    The DOI links from the collect file are crawled; once the crawl ends the
    reactor is stopped and the "complete" and "error" files are finalised.
    This function schedules the crawl but does not start the reactor.
    '''
    settings = scrapingtools.get_settings()
    publisher_database = get_publisher_database(settings, mongo=False)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

    doi_links, doi_sources = get_joblist(settings.get('COLLECT_FILE_NAME'))
    domains = get_domains(publisher_database)

    spider_kwargs = dict(
        start_urls=doi_links,
        crossref_items=doi_sources,
        allowed_domains=domains,
        publisher_database=publisher_database,
    )
    crawl_done = CrawlerRunner(settings).crawl(
        Spiders.MunchSpider.MunchSpider, **spider_kwargs)

    def _stop_reactor(_result):
        reactor.stop()

    after_stop = crawl_done.addBoth(_stop_reactor)
    for file_key in ('COMPLETE_FILE_NAME', 'ERROR_FILE_NAME'):
        # Bind file_key per iteration; the setting is read when the callback
        # fires, as before.
        after_stop.addCallback(
            lambda _, file_key=file_key:
            scrapingtools.finalise_file(settings.get(file_key)))
def crawl_guardian(job_id, url):
    """Start a ``GuardianSpider`` crawl and return what ``crawler.crawl`` returns.

    ``SCRAPY_SETTINGS_MODULE`` is set in the environment and the same module
    is loaded into a fresh ``Settings`` object at 'project' priority.

    :param job_id: forwarded to the spider as ``job_id``.
    :param url: forwarded to the spider as ``url``.
    """
    import scrapy
    from scrapy.crawler import CrawlerProcess, CrawlerRunner
    from scrapy.settings import Settings
    from scrapy.utils.project import get_project_settings
    from scrapy.utils.log import configure_logging
    from scraper.guardianukscraper.spiders.guardian_spider import GuardianSpider
    from scraper.guardianukscraper import settings
    import os

    settings_module_path = 'scraper.guardianukscraper.settings'
    os.environ['SCRAPY_SETTINGS_MODULE'] = settings_module_path

    crawler_settings = Settings()
    crawler_settings.setmodule(settings_module_path, priority='project')

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    guardian_crawler = scrapy.crawler.Crawler(GuardianSpider, crawler_settings)
    return guardian_crawler.crawl(job_id=job_id, url=url)
def __setup(self):
    """
    Create the local WARC download directory and configure logging.

    Scrapy is limited to ERROR, a set of third-party loggers to CRITICAL,
    and this module's logger is set to the instance's log level.

    :return: None
    """
    # exist_ok avoids the check-then-create race of the previous
    # os.path.exists() test.
    os.makedirs(self.__local_download_dir_warc, exist_ok=True)

    # make loggers quiet
    configure_logging({"LOG_LEVEL": "ERROR"})
    for noisy_logger in ('requests', 'readability', 'PIL', 'newspaper',
                         'newsplease', 'urllib3'):
        logging.getLogger(noisy_logger).setLevel(logging.CRITICAL)

    # set own logger
    logging.basicConfig(level=self.__log_level)
    self.__logger = logging.getLogger(__name__)
    self.__logger.setLevel(self.__log_level)
def run():
    """Crawl ``LiquipediaSpider`` with logging written to a timestamped file.

    The log file is ``output/<UTC timestamp>.log``; project settings come
    from the ``settings`` module.
    """
    # Logging settings
    configure_logging(install_root_handler=False)
    stamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
    log_options = dict(
        datefmt='%Y-%m-%d %H:%M:%S',
        filemode='w',
        filename='output/' + stamp + '.log',
        format='%(asctime)s %(levelname)s: %(message)s',
        level=logging.INFO,
    )
    logging.basicConfig(**log_options)

    # Project settings
    project_settings = Settings()
    project_settings.setmodule('settings', priority='project')

    # Class to run parallel spiders
    crawl_process = CrawlerProcess(project_settings)
    crawl_process.crawl(spiders.LiquipediaSpider)

    # Block until crawling is complete
    crawl_process.start()
def run_spiders():
    """Crawl ``ChinazSpider`` with hand-built settings.

    Note:
        If the caller is the outermost loop of the program, the project's
        settings file can be used directly::

            from scrapy.utils.project import get_project_settings
            # some code
            runner = CrawlerRunner(get_project_settings())

        If this is only a wrapped helper function, the settings have to be
        constructed by hand, as done below.
    """
    # Define the log format.
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

    # Settings for this crawl; here only the item pipeline is selected.
    pipeline_settings = Settings()
    pipeline_settings.set(
        'ITEM_PIPELINES',
        {'spider.tutorial.pipelines.TutorialPipeline': 300},
    )

    def _stop_reactor(_result):
        reactor.stop()

    # Hand the settings to the crawler runner and start the crawl.
    crawl_runner = CrawlerRunner(pipeline_settings)
    finished = crawl_runner.crawl(ChinazSpider)
    finished.addBoth(_stop_reactor)
    reactor.run()  # the script will block here until the crawling is finished
def __init__(self, crawler):
    """Configure per-spider log files from the project settings.

    Reads ``LOG_NAME_DATE_FMT`` (default ``"%Y-%m-%d"``) and ``LOG_DIR``
    (default ``"./"``), creates the directory when missing (falling back to
    ``"./"`` on failure) and calls ``configure_logging`` once for a DEBUG
    level file and once for an ERROR level file.
    """
    project_settings = get_project_settings()

    # Date format used in the log file name.
    dt_fmt = project_settings.get("LOG_NAME_DATE_FMT", "%Y-%m-%d")
    dtstr = datetime.utcnow().strftime(dt_fmt)

    # Directory in which the logs are stored; created recursively if absent.
    log_dir = project_settings.get("LOG_DIR", "./")
    if not os.path.exists(log_dir):
        try:
            os.makedirs(log_dir)
            logger.debug(u"定义的日志目录 {} 不存在,已创建!".format(log_dir))
        except Exception as e:
            logger.exception(u"自定义的日志目录 {} 不存在且创建失败! 重置为默认!".format(log_dir))
            logger.exception(e)
            log_dir = "./"

    spider_name = crawler.spidercls.name
    # First the regular (DEBUG) log, then the error-only log.
    # NOTE(review): the second configure_logging() call may replace the
    # handler installed by the first -- confirm both files really get written.
    for name_template, level in (("{}_{}.log", logging.DEBUG),
                                 ("{}_error_{}.log", logging.ERROR)):
        log_file_name = name_template.format(spider_name, dtstr)
        configure_logging({
            "LOG_FILE": os.path.join(log_dir, log_file_name),
            "LOG_LEVEL": level,
        })
def access0(self, runId, records):
    """Crawl LinkedIn for ``records`` with ``Spiderman`` using the Edge profile.

    :param runId: run identifier forwarded to the spider.
    :param records: items forwarded to the spider as ``items``.
    """
    logger.info('Start accessing on Linkedin')
    configure_logging(install_root_handler=False)

    browser = CrawlerBrowser.EDGE
    process_settings = {
        'USER_AGENT': CrawlerBrowser.get_useragent(browser),
        'DOWNLOAD_DELAY': 1,
    }
    crawl_process = CrawlerProcess(process_settings)
    crawl_process.crawl(
        Spiderman,
        runId=runId,
        pipeline=self,
        items=records,
        browser=browser,
        siteCfg=self.config,
    )
    crawl_process.start()
def runspider(spargs):
    """Run the spider named in ``spargs`` with errors logged to ``log/<name>.log``.

    :param spargs: dict of spider arguments; ``name`` selects the spider
        (default ``'jd'``), ``url`` is only used in the error message, and
        the whole dict is forwarded to the spider as keyword arguments.
    """
    url = spargs.get('url')
    name = spargs.get('name', 'jd')

    if not os.path.exists('log'):
        os.makedirs('log')
    configure_logging(install_root_handler=False)
    logging.basicConfig(filename='log/%s.log' % name,
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.ERROR)

    # A single pre-formatted argument prints identically on Python 2 and 3.
    print("get_project_settings().attributes: %s"
          % (get_project_settings().attributes['SPIDER_MODULES'],))

    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('进入爬虫')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:  # `except Exception, e` is Python 2-only syntax
        process.stop()
        # Exceptions have no `.message` on Python 3; fall back to the
        # exception itself there.
        logging.error("url:%s, errorMsg:%s" % (url, getattr(e, 'message', e)))
def f(q):
    """Run one crawl for ``agent`` and report the outcome through ``q``.

    On success ``None`` is put on the queue (preceded by the result of
    ``self.case_without_rules_trigger()`` when the agent did not fire).  On
    failure the exception is put on the queue, logged, and re-raised as a
    ``SpiderException``.
    """
    try:
        s = get_project_settings()

        overrides = {
            "LOG_ENABLED": "True",
            "TELNETCONSOLE_ENABLED": "False",
        }
        # Pick a random user agent when the data source offers any.
        user_agent_list = data.getUserAgentList()
        if len(user_agent_list) > 0:
            chosen_agent = random.choice(user_agent_list)
            if chosen_agent:
                overrides["USER_AGENT"] = chosen_agent
        s.update(overrides)

        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        runner = crawler.CrawlerRunner(s)

        agent.state['fire'] = False
        agent.state['data'] = data

        def _stop_reactor(_result):
            reactor.stop()

        deferred = runner.crawl(
            agent.class_element,
            cus_urls=data.getUrls(),
            cus_allowed_domains=data.getAllowedDomains(),
            agent=agent)
        deferred.addBoth(_stop_reactor)
        reactor.run()

        if not agent.state['fire']:
            q.put(self.case_without_rules_trigger())
        q.put(None)
    except Exception as e:
        q.put(e)
        self.log.error(str(e))
        raise SpiderException('[Warning, execute]: %s' % str(e))
def __setup(local_download_dir_warc, log_level):
    """
    Prepare the download directory, the extracted-WARC list path and logging.

    :param local_download_dir_warc: directory created if it does not exist;
        the ``fullyextractedwarcs.list`` path is placed inside it.
    :param log_level: level applied to the root configuration and to this
        module's logger.
    :return: None
    """
    global __log_pathname_fully_extracted_warcs

    os.makedirs(local_download_dir_warc, exist_ok=True)
    __log_pathname_fully_extracted_warcs = os.path.join(
        local_download_dir_warc, 'fullyextractedwarcs.list')

    # make loggers quiet
    configure_logging({"LOG_LEVEL": "ERROR"})
    for noisy_logger in ('requests', 'readability', 'PIL', 'newspaper',
                         'newsplease', 'urllib3', 'jieba'):
        logging.getLogger(noisy_logger).setLevel(logging.CRITICAL)

    # set own logger
    logging.basicConfig(level=log_level)
    own_logger = logging.getLogger(__name__)
    own_logger.setLevel(log_level)
def crawl(self, spidername, keyword, times, log=True, runner=None,
          settings=None):
    """Schedule a crawl of ``spidername`` and chain up to three repeats.

    :param spidername: name of the spider to crawl.
    :param keyword: passed to the spider as ``q``.
    :param times: passed to the spider as ``t``.
    :param log: when true and no ``settings`` are given, log to a
        timestamped file below ``LOG_DIR`` (path kept in ``self.logfilename``).
    :param runner: existing ``CrawlerRunner`` to reuse; when omitted a new
        one is created and the reactor is stopped after the crawl.
    :param settings: settings to use instead of a copy of the project
        settings.
    """
    thesettings = copy.deepcopy(get_project_settings())
    if log and not settings:
        self.logfilename = LOG_DIR + getCurrentTimeReadable() \
            + '-' + spidername + '.log'
        thesettings['LOG_FILE'] = self.logfilename
    elif settings is not None:
        # Previously `settings=None` with `log=False` replaced the project
        # settings with None; now the project copy is kept in that case.
        thesettings = settings

    # https://docs.scrapy.org/en/latest/topics/api.html#scrapy.settings.Settings
    if not runner:
        configure_logging(thesettings)
        therunner = CrawlerRunner(thesettings)
    else:
        therunner = runner

    d = therunner.crawl(spidername, q=keyword, t=times)
    if not runner:
        d.addBoth(lambda _: reactor.stop())
    if self.loop < 3:
        self.loop = self.loop + 1
        # Re-run with the incremented loop counter as `times`.
        d.addBoth(lambda _: self.crawl(spidername, keyword, self.loop,
                                       log, runner, settings))
def __setup(self):
    """
    Create the local WARC download directory and configure logging.

    Scrapy is limited to ERROR, several third-party loggers to CRITICAL,
    the boto stream loggers and this module's logger to the instance's log
    level.

    :return: None
    """
    log_level = self.__log_level
    os.makedirs(self.__local_download_dir_warc, exist_ok=True)

    # make loggers quiet
    configure_logging({"LOG_LEVEL": "ERROR"})
    for noisy_logger in ('requests', 'readability', 'PIL', 'newspaper',
                         'newsplease', 'urllib3'):
        logging.getLogger(noisy_logger).setLevel(logging.CRITICAL)

    for aws_logger in ('botocore', 'boto3', 's3transfer'):
        boto3.set_stream_logger(aws_logger, log_level)

    # set own logger
    logging.basicConfig(level=log_level)
    self.__logger = logging.getLogger(__name__)
    self.__logger.setLevel(log_level)
def run_all(self):
    # Generates one spider module per domain listed in domain.txt, crawls all
    # of them, then sends the summary e-mail and sleeps.
    #
    # NOTE(review): the source arrived with its indentation collapsed; the
    # statements after `else:` are assumed to run unconditionally rather
    # than inside the else-branch -- confirm against the original file.
    hasRunnedToday = os.path.isfile(checkfile)
    if not hasRunnedToday:  # the script has not been executed today
        with open(checkfile, 'w+') as f:  # create the marker file before running
            pass
    else:
        os.remove(checkfile)
    with open('Checkjiuyue/Checkjiuyue/domain.txt', 'r') as f1:
        moudle_list = []
        for r in f1.readlines():
            name = r.replace('.', '').replace('\n', '').strip()  # name is the domain without dots
            domain = r.replace('\n', '').strip()
            # Write a spider module that subclasses BaseBaiduSpider for
            # this domain.
            with open(
                    'Checkjiuyue/Checkjiuyue/spiders/{}_spider.py'.format(
                        name), 'w') as f:
                f.write(
                    'from Checkjiuyue.Checkjiuyue.spiders.base_baidu_spider import BaseBaiduSpider'
                    + '\n' + '\n')
                f.write('class {}Spider(BaseBaiduSpider):'.format(name) +
                        '\n')
                f.write('\t' + "name='{}Spider'".format(name) + '\n')
                f.write('\t' + "domain='{}'".format(domain) + '\n')
            # presumably `get` resolves the dotted path to the generated
            # spider class -- verify against the file's imports
            moudle = get(
                'Checkjiuyue.Checkjiuyue.spiders.{0}_spider.{1}Spider'.
                format(name, name))
            moudle_list.append(moudle)
    configure_logging()
    runner = CrawlerRunner(settings)
    for spider in moudle_list:
        runner.crawl(spider)
    d = runner.join()
    # Stop the reactor once every scheduled crawl has finished.
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    self.send_email_zip_summary()
    time.sleep(self.getsleeptime())
def run_scraper():
    """Crawl all news spiders in one ``CrawlerProcess``.

    Settings are loaded from the ``settings`` module; the spiders are
    scheduled in the order listed below and the process is then started.
    """
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    configure_logging()

    spider_classes = (
        OilCrossSpider,
        LngConSpider,
        CnpcNewsSpider,
        PetroTradingSpider,
        EnergyExpressSpider,
        HaiBeiSpider,
        WeiXinOffshoreEnergySpider,
        HaiBoSpider,
        CRSLSpider,
        OilCubicSpider,
        OilLinkSpider,
    )

    process = CrawlerProcess(settings=crawler_settings)
    for spider_cls in spider_classes:
        process.crawl(spider_cls)
    process.start()
class MySpider(CrawlSpider):
    """Link checker that logs the URL, parse time and body of each page."""

    start_urls = ['https://www.gmail.com']
    name = 'link_checker'
    configure_logging(install_root_handler=True)
    custom_settings = {
        'DEPTH_LIMIT': '1',
    }
    rules = (
        Rule(LinkExtractor(), callback='parse_url', follow=True),
    )

    def parse_url(self, response):
        """Log the response URL, the time of parsing and the page source."""
        logging.info("\n")

        url_line = "The Response URL is: {}".format(response.url)
        logging.info(url_line)

        time_line = "The Time of Parsing is: {}".format(
            now.strftime("%Y-%m-%d %H:%M"))
        logging.info(time_line)

        source_line = "Source Code is: \n {}".format(response.body)
        logging.info(source_line)
class emailSpider(scrapy.spiders.CrawlSpider):
    """Crawl spider that collects e-mail addresses found in page bodies."""

    name = 'email'

    # Logging is set up when the class is defined: Scrapy's logger does not
    # propagate and records are written to log.txt.
    logging.getLogger('scrapy').propagate = False
    configure_logging(install_root_handler=True)
    logging.basicConfig(filename='log.txt',
                        format='%(levelname)s:%(message)s',
                        level=logging.INFO)

    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj',
                  follow=True), )

    def parse_obj(self, response):
        """Scrape e-mail addresses from ``response`` into ``maillist``.

        Every address not yet in the global ``maillist`` is added and
        printed, so no duplicates are inserted.

        :raises CloseSpider: once ``maillist`` holds ``emailLimit`` entries.
        """
        # Use set instead of list to remove all duplicate occurrences
        try:
            email = set(
                re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+",
                           response.text))
        except Exception:
            # A response whose text cannot be read yields no addresses.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            email = set()

        # For emails found in the response html, see if the email already
        # exists in the current maillist, and if not, print it out and add
        # it to the maillist.
        for item in email:
            if len(maillist) >= emailLimit:
                raise CloseSpider('Email number limit have reached')
            elif item == '':
                continue
            # maillist is a dict, so this membership test is O(1).
            elif item in maillist:
                continue
            maillist[item] = 1
            print(item)
class ArtistSpider(scrapy.Spider):
    """Collect the Internal Soundcloud ID and then all necessary data from
    the /user endpoint on Soundcloud's API"""

    # $ scrapy crawl [name]
    name = 'artists'
    LOGGING = True

    def __init__(self, limit=None, *args, **kwargs):
        """Create the API client and fetch the start URLs.

        :param limit: maximum number of start URLs; may be a string (as
            passed by ``scrapy crawl -a limit=N``) or ``None`` for no
            explicit limit.
        """
        super(ArtistSpider, self).__init__(*args, **kwargs)
        self.api = Soundcloud(wait=0.25)
        # int(None) raises TypeError, which made the default argument
        # unusable; None is now passed through unchanged.
        # NOTE(review): assumes get_start_urls accepts url_limit=None --
        # confirm against the Soundcloud helper.
        self.limit = int(limit) if limit is not None else None
        self.start_urls = self.api.get_start_urls(spider=self.name,
                                                  url_limit=self.limit)

    # log all output
    if LOGGING:
        configure_logging(install_root_handler=False)
        logging.basicConfig(filename='log.txt',
                            format='%(levelname)s: %(message)s',
                            level=logging.INFO)

    custom_settings = {
        'ITEM_PIPELINES': {
            'sc_scraper.pipelines.ArtistPipeline': 0,
        }
    }

    def parse(self, response):
        """Yield an ``Artist`` item built from the API data for this profile."""
        # get internal ID from profile URL and call SC's /user endpoint
        internal_id = self.api.get_internal_sc_user_id(response.url)
        user = self.api.get_user(internal_id, db_data_only=True)

        # add to Artist Item
        artist = Artist()
        artist['item_type'] = 'artist'
        artist['dt_crawled'] = self.api.get_timestamp()
        artist['retrieved_tracks'] = False
        for k in user.keys():
            artist[k] = user[k]
        yield artist
class IkeaCategoriesSpider(scrapy.Spider):
    """Scrape the IKEA category list to CSV, then start the product crawl."""

    configure_logging(install_root_handler=False)
    logging.basicConfig(filename='log.txt',
                        format='%(levelname)s: %(message)s',
                        level=logging.INFO)
    name = 'ikea_categories'

    def start_requests(self):
        """Request the product overview page."""
        for url in ['https://www.ikea.com/ca/en/cat/products-products/']:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write category ids/names to ``categories_csv`` and crawl products."""
        category_ids = []
        category_names = []
        hrefs = response.css(
            '.vn-accordion__item > ul > li > a::attr(href)').getall()
        for category_url in hrefs:
            # The last path segment (trailing '/' stripped) looks like
            # "<name-words>-<id>": the final dash-separated token is the id,
            # the rest is the title-cased name.
            tokens = category_url.strip('/').split('/')[-1].split('-')
            category_names.append(' '.join(tokens[:-1]).title())
            category_ids.append(tokens[-1])

        # creating/overwriting the category list
        ikea_category_df = pd.DataFrame(
            data={'category_id': category_ids,
                  'category_name': category_names},
            dtype=str)
        ikea_category_df.to_csv(categories_csv, encoding='utf-8', index=False)

        print('Categories completed. Starting product crawl.')
        product_spider = IkeaProductsSpider(products_csv)
        product_spider.crawl_products()
        if not product_spider.completed:
            print('Unable to complete product crawl.')
class JobsSpider(scrapy.Spider):
    """Collect the description text of LinkedIn job cards for a fixed search."""

    name = "linkedinJobCard"
    configure_logging(install_root_handler=False)

    def start_requests(self):
        """Request the job search page for 'Python' jobs in 'Germany'."""
        query = 'Python'
        place = 'Germany'
        search_url = 'https://www.linkedin.com/jobs/search?keywords={}&location={}'.format(
            query.lower(), place.lower())
        self.jobs_to_scrape = 20
        yield SeleniumRequest(url=search_url, callback=self.get_job_urls)

    def get_job_urls(self, response):
        """Request the first ``self.jobs_to_scrape`` job links of the result list."""
        link_xpath = '//*[@id="main-content"]/div/section/ul/li[{}]/a/@href'
        for position in range(1, self.jobs_to_scrape + 1):
            job_url = response.xpath(link_xpath.format(position)).get()
            yield SeleniumRequest(url=job_url, callback=self.parse)

    def parse(self, response):
        """Yield the page URL together with the job description text."""
        description = response.xpath(
            '/html/body/main/section[1]/section[3]/div/section/div/text()'
        ).get()
        yield {
            'url': response.url,
            'text': description,
        }
class TestSpider(scrapy.Spider):
    """Quote spider that logs through a time-rotating file handler.

    Logging is configured when the class is defined: the root logger gets a
    basic configuration plus a ``TimedRotatingFileHandler`` writing to
    ``test_log.log``.
    """

    name = 'test_spider'
    start_urls = ['http://quotes.toscrape.com/']
    custom_settings = {
        'EXTENSIONS': {
            'scrapy.extensions.test_extension.CustomStats': 500,
        },
    }

    configure_logging(install_root_handler=False)
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    log_level = logging.INFO
    log_file = 'test_log.log'
    logging.basicConfig(format=log_format, level=log_level)

    # Rotate every second, keeping five backups.
    rotating_file_log = TimedRotatingFileHandler(
        log_file, when='S', interval=1, backupCount=5)
    rotating_file_log.setFormatter(logging.Formatter(log_format))
    root_logger = logging.getLogger()
    root_logger.addHandler(rotating_file_log)

    def parse(self, response):
        """Yield every quote on the page, then follow the 'next' link."""
        quote_rows = response.xpath(
            "//div[@class='row']/div[@class='col-md-8']/div")
        for row in quote_rows:
            quote_text = row.xpath(".//span[@class='text']/text()").get()
            yield {'quote': quote_text}

        next_page = response.xpath("//li[@class='next']/a/@href").get()
        if next_page is None:
            return
        yield scrapy.Request(url=response.urljoin(next_page),
                             callback=self.parse)
class ModSpider(scrapy.Spider):
    """Spider yielding the name of every ``.set`` element, following 'next' links.

    Defining the class configures logging and creates a class-level
    ``CrawlerRunner``.  The crawl is started explicitly with
    ``ModSpider.crawl()``; it used to be invoked from the class body, which
    failed with a ``TypeError`` because ``crawl`` was called without ``self``.
    """

    name = "spider"
    start_urls = ['http://www.google.com']

    # Configure Setting for main driver code
    configure_logging()
    runner = CrawlerRunner()

    @classmethod
    def crawl(cls):
        """Crawl this spider on the class-level runner and run the reactor.

        The reactor is stopped when the crawl ends; the call blocks until
        then.
        """
        finished = cls.runner.crawl(cls)
        finished.addBoth(lambda _: reactor.stop())
        reactor.run()  # the script will block here until the crawl call is finished

    def parse(self, response):
        """Yield one ``{'name': ...}`` dict per set, then follow the next page."""
        # SETTING_SELECTOR tells the spider which elements to iterate over;
        # NAME_SELECTOR is applied to each of them to read the text of its
        # <h1> tag.  Each matched element has its own css() method, so the
        # selector is passed to the child element.
        SETTING_SELECTOR = '.set'
        NAME_SELECTOR = 'h1 ::text'  # was 'h1:: text', an invalid selector
        for k in response.css(SETTING_SELECTOR):
            # The yield belongs inside the loop: previously only the last
            # element was used, and an empty page raised NameError.
            yield {
                'name': k.css(NAME_SELECTOR).extract_first(),
            }

        # Select the next-page link, take the first match and follow it if
        # it exists.
        NEXT_PAGE_SELECT = '.next a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECT).extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)
def set_logger(self, name: str = "COMMAND", level: str = "DEBUG"):
    """Create ``self.logger`` and apply the logging configuration.

    Args:
        name: name of the logger stored on ``self.logger``.
        level: level set on that logger.

    The "pika" logger level is taken from the ``PIKA_LOG_LEVEL`` project
    setting and falls back to "WARNING".
    """
    command_logger = logging.getLogger(name=name)
    self.logger = command_logger
    command_logger.setLevel(level)

    configure_logging()

    pika_logger = logging.getLogger("pika")
    pika_logger.setLevel(
        self.project_settings.get("PIKA_LOG_LEVEL", "WARNING"))
for k, v in os.environ.items() if k.startswith('SCRAPY_') } if env_overrides: settings.setdict(env_overrides, priority='project') return settings ENVVAR = 'SCRAPY_SETTINGS_MODULE' s = get_project_settings() if CUSTOM_LOGGING: logging.basicConfig(level=LOG_LEVEL) else: configure_logging(settings=s, install_root_handler=False) runner = CrawlerRunner(s) base_string = 'https://www.olx.pl/nieruchomosci' housing_types = ['mieszkania', 'stancje-pokoje'] business_types = ['sprzedaz', 'wynajem'] urls_flats_OLX = [] urls_rooms_OLX = [] cities_scope = [unidecode.unidecode(x) for x in cities_scope] for type in housing_types: for city in cities_scope: if type == 'mieszkania': for purpose in business_types: urls_flats_OLX.append('/'.join(
def run():
    """Run SasaSpider with the project settings and block until it ends."""
    configure_logging()
    project_settings = get_project_settings()
    crawl_runner = CrawlerRunner(project_settings)

    finished = crawl_runner.crawl(SasaSpider)
    # Stop the reactor whether the crawl succeeded or failed.
    finished.addBoth(lambda _: reactor.stop())

    # Blocks here until the crawling is finished.
    reactor.run()
# Scrapy building blocks used by the crawler
from scrapy import Request, Spider

# Item model that holds the scraped data
from uniScrapers.items import UniversityInfoItem

# Logging for this module
import logging
from scrapy.utils.log import configure_logging

# Logging configuration for the main crawler module:
#   * filename - target log file, change it here if needed
#   * level    - any standard level (DEBUG, INFO, WARNING, ERROR, ...)
#   * format   - layout of each record written to the file
configure_logging(install_root_handler=False)
logging.basicConfig(
    filename='log.txt',
    format='%(levelname)s: %(message)s',
    level=logging.ERROR,
)


# Main crawler class
class UniversityCrawler(Spider):
    # Name of the spider; it is the name used to run it:
    #   scrapy crawl <spider-name>   ('UniversityCrawler' in our case)
    name = 'UniversityCrawler'

    # Domains allowed during a scrape session; requests to other domains
    # are filtered out. This section does not need to be changed in our
    # case, so it can be left as it is.
def __init__(self, settings=None, install_root_handler=True):
    """Initialise the process: shutdown handlers, logging, startup info.

    Args:
        settings: forwarded unchanged to the parent initialiser.
        install_root_handler: forwarded to ``configure_logging``.
    """
    super(CrawlerProcess, self).__init__(settings)
    # Route shutdown signals to this process' own handler.
    install_shutdown_handlers(self._signal_shutdown)
    configure_logging(settings=self.settings,
                      install_root_handler=install_root_handler)
    log_scrapy_info(self.settings)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Run all enabled tophub spiders in a single reactor."""
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from tophub.spiders.github_spider import GitHubSpider
from tophub.spiders.juejin_spider import JueJinSpider
from tophub.spiders.douban_spider import DouBanSpiderFiction, \
    DouBanSpiderNonFiction
from tophub.spiders.reddit_spider import RedditSpider
from tophub.spiders.segmentfault_spider import SegmentFaultSpider
from tophub.spiders.hacker_news import HackerNewsSpider

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(get_project_settings())

# Spiders are scheduled in this order. RedditSpider is imported but
# intentionally left out of the list (its crawl is disabled).
for spider_cls in (
        GitHubSpider,
        JueJinSpider,
        HackerNewsSpider,
        DouBanSpiderFiction,
        DouBanSpiderNonFiction,
        SegmentFaultSpider,
):
    runner.crawl(spider_cls)

# Stop the reactor once every scheduled crawl has finished.
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from spiders.course_spider import CourseSpider

configure_logging()
runner = CrawlerRunner()


@defer.inlineCallbacks
def crawl():
    """Run CourseSpider and always stop the reactor afterwards.

    The ``finally`` clause matters: without it a failed crawl deferred
    raises at the ``yield``, ``reactor.stop()`` is never reached and the
    script hangs inside ``reactor.run()``.
    """
    try:
        yield runner.crawl(CourseSpider)
    finally:
        reactor.stop()


# Start the crawl only once the reactor is running, so that
# ``reactor.stop()`` is valid even when the crawl fails immediately.
reactor.callWhenRunning(crawl)
reactor.run()  # the script will block here until the last crawl call is finished
# Export pipeline and user agent for both crawlers
settings.set('ITEM_PIPELINES', {'__main__.JsonLinesExportPipeline': 100})
settings.set('USER_AGENT', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64)')

# One spider instance per site
ff_spider = FFSpider()
ao_spider = AOSpider()

# One crawler per spider, both sharing the same settings
ff_crawler = Crawler(ff_spider, settings)
ao_crawler = Crawler(ao_spider, settings)

# Invoke `callback` whenever one of the spiders closes
for _crawler in (ff_crawler, ao_crawler):
    _crawler.signals.connect(callback, signal=signals.spider_closed)

# Only the FF crawler is started; the AO crawler is prepared above but its
# crawl() call is currently disabled.
ff_crawler.crawl()

# Start logging
log.configure_logging()

# Start the reactor (blocks execution)
reactor.run()
def __init__(self, cfg_file_path, json_file_path, site_index,
             shall_resume, daemonize, library_mode=False):
    """Configure and start the crawl for a single site.

    Args:
        cfg_file_path: path handed to ``CrawlerConfig.setup``.
        json_file_path: path handed to ``JsonConfig.setup``; in library
            mode this value is used directly as the site information.
        site_index: index of the site to crawl (converted with ``int``).
        shall_resume: bool, or a value parsed with ``literal_eval``.
        daemonize: bool, or a value parsed with ``literal_eval``.
        library_mode: when true the job is started in a new thread via
            ``start_process`` instead of calling ``self.process.start()``.
    """
    # Logging is set up before the config file defines it; once the config
    # is applied this is overwritten and other levels are emitted as well.
    configure_logging({"LOG_LEVEL": "CRITICAL"})
    self.log = logging.getLogger(__name__)

    self.cfg_file_path = cfg_file_path
    self.json_file_path = json_file_path
    self.site_number = int(site_index)

    if isinstance(shall_resume, bool):
        self.shall_resume = shall_resume
    else:
        self.shall_resume = literal_eval(shall_resume)

    if isinstance(daemonize, bool):
        self.daemonize = daemonize
    else:
        self.daemonize = literal_eval(daemonize)

    # Set up the config file.
    self.cfg = CrawlerConfig.get_instance()
    self.cfg.setup(self.cfg_file_path)
    self.log.debug("Config initialized - Further initialisation.")

    self.cfg_crawler = self.cfg.section("Crawler")

    # In library mode json_file_path itself is the site information (kind
    # of hacky); otherwise the sites come from the URL-input JSON file.
    if library_mode:
        sites = [json_file_path]
        site = json_file_path
    else:
        self.json = JsonConfig.get_instance()
        self.json.setup(self.json_file_path)
        sites = self.json.get_site_objects()
        site = sites[self.site_number]

    # A site-specific ignore_regex takes precedence over the configured one.
    if "ignore_regex" in site:
        regex_source = site["ignore_regex"]
    else:
        regex_source = self.cfg.section('Crawler')['ignore_regex']
    ignore_regex = "(%s)" % regex_source

    # Pick the crawler name. The crawler can be overwritten by fallbacks.
    if "additional_rss_daemon" in site and self.daemonize:
        self.crawler_name = "RssCrawler"
    elif "crawler" in site:
        self.crawler_name = site["crawler"]
    else:
        self.crawler_name = self.cfg.section("Crawler")["default"]

    # The real crawler class (already "fallen back").
    crawler_class = self.get_crawler(self.crawler_name, site["url"])

    if self.cfg.section('Files')['relative_to_start_processes_file']:
        # Absolute directory this script is in.
        relative_to_path = os.path.dirname(__file__)
    else:
        relative_to_path = os.path.dirname(self.cfg_file_path)

    heuristics_cfg = self.cfg.section('Heuristics')
    local_data_directory = self.cfg.section("Files")["local_data_directory"]
    format_relative_path = self.cfg.section('Files')['format_relative_path']
    working_path = self.cfg.get_working_path()
    self.helper = Helper(heuristics_cfg,
                         local_data_directory,
                         relative_to_path,
                         format_relative_path,
                         sites,
                         crawler_class,
                         working_path)

    self.__scrapy_options = self.cfg.get_scrapy_options()

    self.update_jobdir(site)

    # Make sure the crawler does not resume crawling unless this was
    # requested in the arguments passed to this script.
    self.remove_jobdir_if_not_resume()

    self.load_crawler(crawler_class, site["url"], ignore_regex)

    # Start the job. In library mode the reactor must not be stopped after
    # this job so that further jobs can run. It also has to run in a thread
    # because reactor.run does not seem to return. Scrapy will try to start
    # a new reactor, which fails with an exception that is caught in
    # 'start_process', and the code continues to run.
    if library_mode:
        start_new_thread(start_process, (self.process, False,))
    else:
        self.process.start()
def test_injection_failure(settings):
    """Crawling UnressolvableProductPage for ProductHtml yields no items."""
    configure_logging(settings)
    spider_cls = spider_for(UnressolvableProductPage)
    # Only the items are checked; the URL and crawler are not used.
    items, _url, _crawler = yield crawl_items(spider_cls, ProductHtml,
                                              settings)
    assert items == []
def __init__(self, settings=None, install_root_handler=True):
    """Initialise the process: logging, startup info and reactor flag.

    Args:
        settings: forwarded unchanged to the parent initialiser.
        install_root_handler: forwarded to ``configure_logging``.
    """
    super().__init__(settings)
    configure_logging(settings=self.settings,
                      install_root_handler=install_root_handler)
    log_scrapy_info(self.settings)
    # Starts out False; nothing in this initialiser touches the reactor.
    self._initialized_reactor = False
def __init__(self, id_list, crawl_list, *args, **kwargs):
    """Store the id/crawl lists and create a runner on project settings.

    Extra positional and keyword arguments go to the parent initialiser.
    """
    super().__init__(*args, **kwargs)
    self.id_list, self.crawl_list = id_list, crawl_list

    configure_logging()
    project_settings = get_project_settings()
    self.runner = CrawlerRunner(project_settings)
def newscrape(requested, ticker):
    """Find past drops similar to the latest one and label them by news.

    Args:
        requested: price table with 'Percent', 'Close', 'Open' and 'Date'
            columns, read by integer label from 0 to ``len(requested) - 1``
            (assumes a default integer index - TODO confirm with callers).
            'Date' values are parsed with the format ``%Y-%m-%d``.
        ticker: ticker symbol, used in the headline and news file names.

    Returns:
        A DataFrame with the columns Date, Percent, 3 Day Percent and
        3 Day Open Percent, joined with the headline columns and a
        Category column - or a ``str`` explaining why no result was built.
    """
    logging.getLogger('scrapy').propagate = False  # turn off logging

    # Threshold: 90% of the most recent percent change.
    targetdrop = requested['Percent'][len(requested) - 1] * 0.9
    if targetdrop > 0:
        return 'Target percent was greater than 0- Bogged Again should only be used with dips.'

    datelist = []  # dates of interest
    percentlist = []
    finalpercentlist = []
    finalopenpercent = []
    for x in range(0, len(requested) - 3):
        if requested['Percent'][x] <= targetdrop:
            # Change three rows later, relative to the close of row x.
            final = truncate(
                (requested["Close"][x + 3] - requested["Close"][x]) /
                requested["Close"][x])
            final_open = truncate(
                (requested["Open"][x + 3] - requested["Close"][x]) /
                requested["Close"][x])
            datelist.append(requested["Date"][x])
            percentlist.append(requested["Percent"][x])
            finalpercentlist.append(final)
            finalopenpercent.append(final_open)

    # The latest row is added as well; its future change is not known yet,
    # so 0 is used as a placeholder.
    datelist.append(requested["Date"][len(requested) - 1])
    percentlist.append(requested['Percent'][len(requested) - 1])
    finalpercentlist.append(0)
    finalopenpercent.append(0)

    # Reverse all lists so that the latest row comes first.
    datelist.reverse()
    percentlist.reverse()
    finalpercentlist.reverse()
    finalopenpercent.reverse()

    # Convert all the dates into seconds (local time, via mktime).
    datesecs = [
        time.mktime(time.strptime(date, "%Y-%m-%d")) for date in datelist
    ]
    if len(datesecs) > 100:
        return "There are at least 100 similar drops in the past, which probably means it's not a big enough drop. "

    categorylist = ['Unknown'] * len(datesecs)
    finaloutput = pd.DataFrame({
        'Date': datelist,
        'Percent': percentlist,
        '3 Day Percent': finalpercentlist,
        '3 Day Open Percent': finalopenpercent
    })

    # Web spider scaffolding. The spider call itself is disabled ("while
    # the spider is still broken"), so these values are prepared but unused.
    stuff = {'datesecs': datesecs, 'ticker': ticker}
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(get_project_settings())
    # Days between the first and the last date. The divisor needs the
    # parentheses: "/ 60 * 60 * 24" divides by 60 and then multiplies.
    crawltime = abs(datesecs[0] - datesecs[-1]) / (60 * 60 * 24)

    def run_spider(spidername, thing):
        """Schedule ``spidername`` on a new CrawlerProcess (never started)."""
        process = CrawlerProcess({
            'USER_AGENT':
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })
        process.crawl(spidername, thing)

    # run_spider(QuotesSpider, stuff) is disabled; instead an empty
    # headline file with only a header row is written (the header makes the
    # file easier to read with pandas).
    headlines_path = f'{lpath}stocks/headlines-{ticker}.csv'
    with open(headlines_path, 'w', newline='', encoding="utf-8") as csv_file:
        csv.writer(csv_file).writerow(['Headline', 'URL'])

    # Scrapy checks the news page by page, so when the first and last date
    # are far apart the crawl may take a while: poll for the file.
    timeout = 0
    while not os.path.exists(headlines_path) and timeout < 1500:
        time.sleep(5)
        timeout += 5

    if not os.path.exists(headlines_path):
        # NOTE(review): the loop above waits up to 1500 seconds while the
        # message says 150 - confirm which value is intended.
        return ("QuoteSpider did not finish after 150 seconds. Either there are too many news per page for the crawler "
                "to parse through, or something went wrong with the spider and it hung. Please try again!")

    # The spider saves the headlines in a csv; read it and delete the file
    # once it has fulfilled its purpose.
    headlines = pd.read_csv(headlines_path, encoding='unicode_escape')
    os.remove(headlines_path)

    # The headline and category columns are added by the join and the
    # explicit assignment at the end. The original DataFrame.assign calls
    # discarded their result; making them effective would create columns
    # that clash with the join, so they are dropped.

    # Unpickle the pre-generated classifier used to classify the news.
    # SECURITY: pickle.load executes arbitrary code from the file; this
    # must only ever read a trusted, locally generated pickle.
    with open(f'{lpath}polls/boggedagain/my_classifier.pickle',
              'rb') as pickle_file:
        classifier = pickle.load(pickle_file)

    categorydecoder = {
        'AD': 'Analyst downgrade',
        'B': 'Bankruptcy',
        'CS': 'Company scandal',
        'LC': 'Leadership change',
        'LG': 'Lowered guidance',
        'LL': 'Lost lawsuit',
        'LS': 'Leadership scandal',
        'M': 'Merger',
        'NO': 'New options',
        'PO': 'Public offering',
        'R': 'Regulation',
        'RL': 'Restructuring/Layoff',
        'RM': 'Revenue miss',
        'SD': 'Sector dump',
        'SS': 'Stock split',
        'T': 'Trump',
        'TW': 'Trade war'
    }
    for x, headline in enumerate(headlines.Headline):
        if headline == 'No match':
            continue
        # NOTE(review): datesecs come from mktime (local time) but are
        # formatted with gmtime here - confirm the news files are named
        # with the same convention.
        news_name = f'{ticker}-{time.strftime("%Y-%m-%d", time.gmtime(datesecs[x]))}'
        news_path = f'{lpath}news/{news_name}.txt'
        with open(news_path, encoding="utf-8") as news_file:
            data = news_file.read()
        category = classifier.classify(cl.news_features(data))
        categorylist[x] = categorydecoder[category]
        # If this is not part of the training set yet, copy it over as
        # unlabeled so it can be labeled later.
        if not os.path.exists(
                f'{lpath}trainer/{category}/{news_name}_labeled.txt'):
            copyfile(news_path, f'{lpath}trainer/{news_name}.txt')
        os.remove(news_path)

    finalfinal = finaloutput.join(headlines)
    finalfinal['Category'] = pd.Series(categorylist, index=finalfinal.index)
    return finalfinal
def parse(self, response): pass ''' from scrapy.spiders import CrawlSpider, Rule from scrapy.selector import Selector #from scrapy.linkextractors.sgml import SgmlLinkExtractor from itbooks.items import ItbooksItem from scrapy.http import Request from scrapy.linkextractors import LinkExtractor import logging from scrapy.utils.log import configure_logging from scrapy.shell import inspect_response configure_logging(install_root_handler=False) logging.basicConfig( filename='log.txt', format='%(levelname)s: %(message)s', level=logging.WARNING ) class ItbooksSpider(CrawlSpider): name = "itbooks" allowed_domains = ["it-ebooks.info"] start_urls = ["http://it-ebooks.info/tag/programming/"] rules = ( # Rule(SgmlLinkExtractor(allow=(r"page\/\d+\/$",), ), callback="parse_start_url", follow=True ), # Rule(LinkExtractor(allow=(r"page\/\d+\/$",), ), callback="parse_start_url_a", follow=False ), Rule(LinkExtractor(allow=(r"page\/\d+\/$",), ), callback="parse_start_url_a", follow=True ), # Rule(LinkExtractor(allow=(r"page\/\d+\/$",), ),),
def __configure_logging(self):
    """Send INFO-and-above log records to ``<self.spider>_log.txt``."""
    # Scrapy's own root handler is skipped; basicConfig installs ours.
    configure_logging(install_root_handler=False)
    log_options = {
        'filename': self.spider + '_log.txt',
        'format': '%(levelname)s: %(message)s',
        'level': logging.INFO,
    }
    logging.basicConfig(**log_options)
from tests.mockserver import MockServer, MockDNSServer


class LocalhostSpider(Spider):
    """Spider that requests ``self.url`` and logs host and IP details."""

    name = "localhost_spider"

    def start_requests(self):
        """Issue the single request to the URL given to the spider."""
        yield Request(self.url)

    def parse(self, response):
        """Log the host, the ip_address type and the ip_address itself."""
        netloc = urlparse(response.url).netloc
        host = netloc.split(":")[0]  # drop the ":port" suffix
        self.logger.info("Host: %s" % host)
        self.logger.info("Type: %s" % type(response.ip_address))
        self.logger.info("IP address: %s" % response.ip_address)


if __name__ == "__main__":
    with MockServer() as mock_http_server:
        with MockDNSServer() as mock_dns_server:
            port = urlparse(mock_http_server.http_address).port
            url = "http://not.a.real.domain:{port}/echo".format(port=port)

            # Resolve names through the mock DNS server.
            servers = [(mock_dns_server.host, mock_dns_server.port)]
            reactor.installResolver(createResolver(servers=servers))

            configure_logging()
            runner = CrawlerRunner()
            d = runner.crawl(LocalhostSpider, url=url)
            # Stop the reactor whether the crawl succeeded or failed.
            d.addBoth(lambda _: reactor.stop())
            reactor.run()
import sys

# Python 2 only: make utf8 the default string encoding.
reload(sys)
sys.setdefaultencoding('utf8')

import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.spiders import Spider
from scrapy.selector import HtmlXPathSelector
from items import FaqscrapyItem
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.utils.project import get_project_settings
from spiders.FAQ_jingdong import JingdongSpider
from spiders.FAQ_suning import SuningSpider
import re

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)

    # Schedule both FAQ spiders on the same runner, in this order.
    for spider_cls in (JingdongSpider, SuningSpider):
        runner.crawl(spider_cls)

    # Stop the reactor once both crawls have finished.
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    # Blocks the process, so always keep this as the last statement.
    reactor.run()