# -*- coding: utf-8 -*-
#
# Scrapy settings for the aif project.
#
# Only the settings this project actually tunes are listed; the full
# reference is in the Scrapy documentation:
#
#   http://doc.scrapy.org/en/latest/topics/settings.html
#   http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#   http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

from bots import setup_django_env

# Bootstrap Django before any spider/pipeline code runs, since pipelines
# presumably persist items through the Django ORM — confirm against bots/.
setup_django_env()

BOT_NAME = 'aif'

SPIDER_MODULES = ['aif.spiders']
NEWSPIDER_MODULE = 'aif.spiders'

# Keep crawl state on disk so an interrupted crawl can be resumed.
JOBDIR = 'items/jobdir'

# Disable the S3 download handler; NOTE(review): presumably to avoid the
# optional boto dependency at startup — confirm.
DOWNLOAD_HANDLERS = {'s3': None}

# Crawl responsibly by identifying yourself (and your website) on the user-agent.
#USER_AGENT = 'aif (+http://www.yourdomain.com)'

# Maximum concurrent requests performed by Scrapy (default: 16).
#CONCURRENT_REQUESTS=32

# Per-website request delay (default: 0); see
# http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# -*- coding: utf-8 -*-
#
# Scrapy settings for the _39 project.
#
# Only the settings this project actually tunes are listed; the full
# reference is in the Scrapy documentation:
#
#   http://doc.scrapy.org/en/latest/topics/settings.html
#   http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#   http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

from bots import setup_django_env

# Bootstrap Django before any spider/pipeline code runs, since the item
# pipelines below presumably persist through the Django ORM — confirm.
setup_django_env()

BOT_NAME = '_39'

SPIDER_MODULES = ['_39.spiders']
NEWSPIDER_MODULE = '_39.spiders'

# Throttling: generous per-request timeout plus a fixed inter-request delay.
DOWNLOAD_TIMEOUT = 100
DOWNLOAD_DELAY = 1

DOWNLOADER_MIDDLEWARES = {
    # Turn off Scrapy's stock user-agent middleware and replace it with the
    # shared rotating user-agent middleware from bots.base.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'bots.base.middlewares.RotateUserAgentMiddleware': 543,
}

ITEM_PIPELINES = {
    # Deduplicate/persist items first, then persist their related records.
    '_39.pipelines.UniqueItemPersistencePipeline': 100,
    '_39.pipelines.RelatedItemPersistencePipeline': 200,
}
# -*- coding: utf-8 -*-
#
# Scrapy settings for the spiderbot project.
#
# Only the settings this project actually tunes are listed; the full
# reference is in the Scrapy documentation:
#
#   https://doc.scrapy.org/en/latest/topics/settings.html
#   https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#   https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import sys

# Make the repository root importable so the shared `bots` package resolves.
# NOTE(review): the relative path depends on the process working directory —
# confirm spiders are always launched from the project directory.
sys.path.append('../../')

import bots

# Bootstrap Django before any spider/pipeline code runs.
bots.setup_django_env()

BOT_NAME = 'spiderbot'

SPIDER_MODULES = ['spiderbot.spiders']
NEWSPIDER_MODULE = 'spiderbot.spiders'

# Disable the S3 download handler; NOTE(review): presumably to avoid the
# optional boto dependency at startup — confirm.
DOWNLOAD_HANDLERS = {'s3': None}

# Polite crawling: half-second delay, long timeout, one request per IP.
DOWNLOAD_DELAY = 0.5
DOWNLOAD_TIMEOUT = 100
CONCURRENT_REQUESTS_PER_IP = 1

# Crawl responsibly by identifying yourself (and your website) on the user-agent.
#USER_AGENT = 'spiderbot (+http://www.yourdomain.com)'