def from_crawler(cls, crawler): """Initialize the middleware with the crawler settings""" driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME') driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH') driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS') if not driver_name or not driver_executable_path: raise NotConfigured( 'SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set' ) middleware = cls( driver_name=driver_name, driver_executable_path=driver_executable_path, driver_arguments=driver_arguments ) crawler.signals.connect(middleware.spider_closed, signals.spider_closed) return middleware
def from_crawler(cls, crawler):
    s = crawler.settings
    proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
    if proxy_path is not None:
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
    else:
        proxy_list = s.getlist('ROTATING_PROXY_LIST')
    if not proxy_list:
        raise NotConfigured()
    mw = cls(
        proxy_list=proxy_list,
        stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
        max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
        max_order=s.getint("PROXY_MAX_ORDER", 1000000),
        timeout_if_no_proxy=s.getint("TIMEOUT_IF_NO_PROXY", 300),
        proxy_download_delay=s.getint("PROXY_DELAY", 3),
        randomize_download_delay=s.getbool("RANDOMIZE_DOWNLOAD_DELAY", True))
    return mw
def _parse(self, response, **kwargs): if not hasattr(self, "parse_node"): raise NotConfigured( "You must define parse_node method in order to scrape this XML feed" ) response = self.adapt_response(response) if self.iterator == "iternodes": nodes = self._iternodes(response) elif self.iterator == "xml": selector = Selector(response, type="xml") self._register_namespaces(selector) nodes = selector.xpath(f"//{self.itertag}") elif self.iterator == "html": selector = Selector(response, type="html") self._register_namespaces(selector) nodes = selector.xpath(f"//{self.itertag}") else: raise NotSupported("Unsupported node iterator") return self.parse_nodes(response, nodes)
def parse(self, response):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured(
            'You must define parse_node method in order to scrape this XML feed'
        )

    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = xmliter(response, self.itertag)
    elif self.iterator == 'xml':
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        nodes = selector.select('//%s' % self.itertag)
    elif self.iterator == 'html':
        selector = HtmlXPathSelector(response)
        self._register_namespaces(selector)
        nodes = selector.select('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')

    return self.parse_nodes(response, nodes)
def _parse(self, response, **kwargs):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured(
            'You must define parse_node method in order to scrape this XML feed'
        )

    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator == 'xml':
        selector = Selector(response, type='xml')
        self._register_namespaces(selector)
        nodes = selector.xpath(f'//{self.itertag}')
    elif self.iterator == 'html':
        selector = Selector(response, type='html')
        self._register_namespaces(selector)
        nodes = selector.xpath(f'//{self.itertag}')
    else:
        raise NotSupported('Unsupported node iterator')

    return self.parse_nodes(response, nodes)
def __init__(self, settings, *, crawler=None, aws_access_key_id=None,
             aws_secret_access_key=None, aws_session_token=None,
             httpdownloadhandler=HTTPDownloadHandler, **kw):
    if not is_botocore_available():
        raise NotConfigured('missing botocore library')

    if not aws_access_key_id:
        aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
    if not aws_secret_access_key:
        aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
    if not aws_session_token:
        aws_session_token = settings['AWS_SESSION_TOKEN']

    # If no credentials could be found anywhere,
    # consider this an anonymous connection request by default;
    # unless 'anon' was set explicitly (True/False).
    anon = kw.get('anon')
    if anon is None and not aws_access_key_id and not aws_secret_access_key:
        kw['anon'] = True
    self.anon = kw.get('anon')

    self._signer = None
    import botocore.auth
    import botocore.credentials
    kw.pop('anon', None)
    if kw:
        raise TypeError(f'Unexpected keyword arguments: {kw}')
    if not self.anon:
        SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
        self._signer = SignerCls(botocore.credentials.Credentials(
            aws_access_key_id, aws_secret_access_key, aws_session_token))

    _http_handler = create_instance(
        objcls=httpdownloadhandler,
        settings=settings,
        crawler=crawler,
    )
    self._download_http = _http_handler.download_request
def __init__(self, crawler):
    self.crawler = crawler
    settings = self.crawler.settings
    try:
        self.producer = AutoProducer(
            bootstrap_servers=settings.getlist(KAFKA_PRODUCER_BROKERS),
            configs=settings.get(KAFKA_PRODUCER_CONFIGS, None),
            topic=settings.get(KAFKA_PRODUCER_TOPIC, None),
            kafka_loglevel=loglevel(
                settings.get(KAFKA_PRODUCER_LOGLEVEL, "WARNING")),
        )
    except Exception as e:
        raise NotConfigured(f"init producer {e}")
    self.logger = logging.getLogger(self.__class__.__name__)
    crawler.signals.connect(self.spider_closed, signals.spider_closed)
    self.exporter = TextDictKeyPythonItemExporter(
        binary=False,
        ensure_base64=settings.getbool(KAFKA_VALUE_ENSURE_BASE64, False),
    )
    self.encoder = ScrapyJSNONBase64Encoder()
    self.field_filter = set(settings.getlist("KAFKA_EXPORT_FILTER", []))
    self.logger.debug(f"KAFKA_EXPORT_FILTER: {self.field_filter}")
def parse_epoch(epoch):
    if isinstance(epoch, bool) or isinstance(epoch, datetime):
        return epoch
    elif epoch == 'True':
        return True
    elif epoch == 'False':
        return False

    try:
        return datetime.strptime(epoch, EPOCH_DATE_FORMAT)
    except ValueError:
        pass

    parser = parsedatetime.Calendar(Constants())
    time_tuple = parser.parse(epoch)  # 'yesterday' => (time.struct_time, int)
    if not time_tuple[1]:
        raise NotConfigured('Could not parse epoch: %s' % epoch)
    time_struct = time_tuple[0]
    return datetime(*time_struct[:6])
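# Hedged usage sketch for parse_epoch above. It assumes parsedatetime is
# installed and that EPOCH_DATE_FORMAT is defined elsewhere in the module;
# the comments describe the expected behaviour, they are not captured output.
parse_epoch(True)                    # booleans pass through unchanged
parse_epoch(datetime(2024, 1, 1))    # datetimes pass through unchanged
parse_epoch('yesterday')             # natural-language dates go through parsedatetime
# parse_epoch('not a date') would raise NotConfigured.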
def get_env_variable(var_name, default=None):
    """
    Get the environment variable or raise an exception.

    Args:
        var_name (str): the name of the environment variable.

    Keyword Args:
        default (str): the default value to use if the environment variable
            is not set.

    Returns:
        str: the value of the specified environment variable.

    Raises:
        NotConfigured: if there is no environment variable with the
            specified name and a default value was not given.
    """
    value = os.environ.get(var_name, default)
    if value is None:
        from scrapy.exceptions import NotConfigured
        raise NotConfigured("Set the %s environment variable" % var_name)
    return value
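# Hedged usage sketch for get_env_variable above; MY_PROJECT_API_KEY and
# MY_PROJECT_REGION are made-up variable names used only for illustration.
import os

os.environ.setdefault('MY_PROJECT_API_KEY', 'dummy-value')
api_key = get_env_variable('MY_PROJECT_API_KEY')             # -> 'dummy-value'
region = get_env_variable('MY_PROJECT_REGION', 'eu-west-1')  # falls back to the default
# get_env_variable('SOME_UNSET_VAR') would raise NotConfigured.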
def from_crawler(cls, crawler, client=None, dsn=None):
    release = crawler.settings.get("RELEASE", get_release(crawler))
    additional_opts = crawler.settings.get("SENTRY_CLIENT_OPTIONS", {})
    dsn = os.environ.get("SENTRY_DSN", crawler.settings.get("SENTRY_DSN", None))
    if dsn is None:
        raise NotConfigured('No SENTRY_DSN configured')
    o = cls(dsn=dsn, release=release, **additional_opts)
    sentry_signals = crawler.settings.get("SENTRY_SIGNALS", [])
    if len(sentry_signals) > 0:
        receiver = o.spider_error
        for signalpath in sentry_signals:
            signalmodule, signalname = signalpath.rsplit('.', 1)
            _m = importlib.import_module(signalmodule)
            signal = getattr(_m, signalname)
            crawler.signals.connect(receiver, signal=signal)
    else:
        crawler.signals.connect(o.spider_error, signal=signals.spider_error)
    return o
def __init__(self, settings):
    self.google_cloud_enabled = settings.getbool('GOOGLE_CLOUD_ENABLED')
    if self.google_cloud_enabled:
        credentials_json = settings.get(
            'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON')
        if credentials_json:
            if not os.path.isfile(self.credentials_json_path):
                with open(self.credentials_json_path, 'w') as outfile:
                    outfile.write(credentials_json)
            os.environ[
                'GOOGLE_APPLICATION_CREDENTIALS'] = self.credentials_json_path
            logger.info('Google Cloud extension initialised successfully')
        else:
            settings.set('GOOGLE_CLOUD_ENABLED', False)
            raise NotConfigured(
                'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON '
                'is not set in settings')
    else:
        logger.info('GOOGLE_CLOUD_ENABLED is False')
def __init__(self):
    # Open database connection
    self.db = mysql.connect(host=ROJAK_DB_HOST, port=ROJAK_DB_PORT,
                            user=ROJAK_DB_USER, passwd=ROJAK_DB_PASS,
                            db=ROJAK_DB_NAME)
    self.cursor = self.db.cursor()
    self.cursor_urls = self.db.cursor()

    # Use UTF-8 encoding
    self.db.set_character_set('utf8')
    self.cursor.execute('SET NAMES utf8;')
    self.cursor.execute('SET CHARACTER SET utf8;')
    self.cursor.execute('SET character_set_connection=utf8;')

    self.media = {}
    self.media['name'] = 'detikcom'
    try:
        # Get media information from the database
        self.logger.info('Fetching media information')
        self.cursor.execute(sql_get_media, [self.media['name']])
        row = self.cursor.fetchone()
        self.media['id'] = row[0]
        self.media['last_scraped_at'] = row[1]
    except mysql.Error as err:
        self.logger.error('Unable to fetch media data: %s', err)
        raise NotConfigured('Unable to fetch media data: %s' % err)

    if ROJAK_SLACK_TOKEN != '':
        self.is_slack = True
        self.slack = Slacker(ROJAK_SLACK_TOKEN)
    else:
        self.is_slack = False
        self.logger.info('Posting errors to #rojak-pantau-errors is disabled')

    # Execute the cursor here and fetch the rows one by one
    self.cursor_urls.execute(sql_get_urls, [self.media['id']])
def from_crawler(cls, crawler):
    settings = crawler.settings

    if crawler.settings.getbool('BROWSER_ENGINE_COOKIES_ENABLED', False):
        if crawler.settings.getbool('COOKIES_ENABLED'):
            logger.warning("Default cookies middleware enabled together "
                           "with browser engine aware cookies middleware. "
                           "Set COOKIES_ENABLED to False.")
        cookies_mw = RemotelyAccessbileCookiesMiddleware(
            debug=crawler.settings.getbool('COOKIES_DEBUG'))
    else:
        cookies_mw = None

    server = settings.get('BROWSER_ENGINE_SERVER')
    if server:
        endpoint = clientFromString(reactor, server)
    else:
        if settings.getbool('BROWSER_ENGINE_START_SERVER', False):
            # Twisted logs the process's stderr with INFO level.
            logging.getLogger("twisted").setLevel(logging.INFO)
            argv = [
                sys.executable,
                "-m", "scrapy_qtwebkit.browser_engine", "stdio",
            ]
            endpoint = ProcessEndpoint(reactor, argv[0], argv, env=None)
        else:
            raise NotConfigured(
                "Must provide either BROWSER_ENGINE_SERVER "
                "or BROWSER_ENGINE_START_SERVER")

    ext = cls(
        crawler, endpoint,
        page_limit=settings.getint('BROWSER_ENGINE_PAGE_LIMIT', 4),
        cookies_middleware=cookies_mw,
    )

    return ext
def from_crawler(cls, crawler):
    s = crawler.settings
    proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
    proxy_url = s.get('ROTATING_PROXY_URL', None)
    if proxy_path is not None:
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
    elif proxy_url is not None:
        http = urllib3.PoolManager()
        request = http.request('GET', proxy_url)
        if request.status == 200:
            proxies_str = request.data.decode('utf-8').strip()
            proxies = proxies_str.split('\r\n')
            logger.info(f"Fetched proxy list with {len(proxies)} entries")
            proxy_list = proxies
        else:
            logger.error("Proxy list request returned a non-200 HTTP status")
            proxy_list = []
    else:
        proxy_list = s.getlist('ROTATING_PROXY_LIST')
    if not proxy_list:
        raise NotConfigured()
    mw = cls(
        proxy_list=proxy_list,
        logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
        stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
        max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
        backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
        backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
        crawler=crawler,
    )
    crawler.signals.connect(mw.engine_started, signal=signals.engine_started)
    crawler.signals.connect(mw.engine_stopped, signal=signals.engine_stopped)
    return mw
def from_crawler(cls, crawler):
    s = crawler.settings
    ninja_key = s.get('ROTATING_NINJA_KEY', None)
    proxy_list = s.getlist('ROTATING_PROXY_LIST', [])
    if ninja_key is None and len(proxy_list) == 0:
        raise NotConfigured()
    mw = cls(
        ninja_key=ninja_key,
        proxy_list=proxy_list,
        logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
        renew_interval=s.getfloat('ROTATING_NINJA_RENEW_INTERVAL', 1200),
        stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
        max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
        backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
        backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
        crawler=crawler,
    )
    crawler.signals.connect(mw.engine_started, signal=signals.engine_started)
    crawler.signals.connect(mw.engine_stopped, signal=signals.engine_stopped)
    return mw
def __init__(self, crawler): if not crawler.settings.getbool("TELNETCONSOLE_ENABLED"): raise NotConfigured if not TWISTED_CONCH_AVAILABLE: raise NotConfigured( "TELNETCONSOLE_ENABLED setting is True but required twisted " "modules failed to import:\n" + _TWISTED_CONCH_TRACEBACK ) self.crawler = crawler self.noisy = False self.portrange = [ int(x) for x in crawler.settings.getlist("TELNETCONSOLE_PORT") ] self.host = crawler.settings["TELNETCONSOLE_HOST"] self.username = crawler.settings["TELNETCONSOLE_USERNAME"] self.password = crawler.settings["TELNETCONSOLE_PASSWORD"] if not self.password: self.password = binascii.hexlify(os.urandom(8)).decode("utf8") logger.info("Telnet Password: %s", self.password) self.crawler.signals.connect(self.start_listening, signals.engine_started) self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)
def from_crawler(cls, crawler): """Get the settings and initialize""" algolia_api_id = crawler.settings['ALGOLIA_API_ID'] algolia_api_key = crawler.settings['ALGOLIA_API_KEY'] algolia_index_name = crawler.settings['ALGOLIA_INDEX_NAME'] if not algolia_api_id or not algolia_api_key or not algolia_index_name: raise NotConfigured( 'Missing configuration for the AlgoliaItemPipeline' ) algolia_item_bulk_nbr = crawler.settings.get( 'ALGOLIA_ITEM_BULK_NBR', 100 ) return cls( algolia_api_id, algolia_api_key, algolia_index_name, algolia_item_bulk_nbr )
def makeRequest(self, url, referenceUrl=None, callBackFunctionName=None,
                meta={}, urlId=None, priority=1, **kw):
    '''Create a Request.'''
    if not urlId:
        raise NotConfigured(
            'Spider %s created a Request for url %s without an id, '
            'so the url status cannot be updated' % (self.name, url))
    if callBackFunctionName is not None:
        print('Warning: a Request created during scrapyApt initialisation '
              'only has a callback function name')
        kw.setdefault('callback', callBackFunctionName)
    meta['urlId'] = urlId
    meta['download_timeout'] = 180
    meta['depth'] = 0
    kw.setdefault('meta', meta)
    kw.setdefault('priority', priority)
    return Request(url, **kw)
def __init__(self, uri, access_key=None, secret_key=None, acl=None,
             endpoint_url=None, *, feed_options=None, session_token=None):
    if not is_botocore_available():
        raise NotConfigured('missing botocore library')
    u = urlparse(uri)
    self.bucketname = u.hostname
    self.access_key = u.username or access_key
    self.secret_key = u.password or secret_key
    self.session_token = session_token
    self.keyname = u.path[1:]  # remove first "/"
    self.acl = acl
    self.endpoint_url = endpoint_url
    import botocore.session
    session = botocore.session.get_session()
    self.s3_client = session.create_client(
        's3',
        aws_access_key_id=self.access_key,
        aws_secret_access_key=self.secret_key,
        aws_session_token=self.session_token,
        endpoint_url=self.endpoint_url,
    )
    if feed_options and feed_options.get('overwrite', True) is False:
        logger.warning('S3 does not support appending to files. To '
                       'suppress this warning, remove the overwrite '
                       'option from your FEEDS setting or set it to True.')
def from_crawler(cls, crawler):
    # This method is used by Scrapy to create the middleware.
    # Initialize the selenium driver settings from the crawler settings.
    driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME')
    driver_executable_path = crawler.settings.get(
        'SELENIUM_DRIVER_EXECUTABLE_PATH')
    driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')
    if not driver_name or not driver_executable_path:
        raise NotConfigured(
            'SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set'
        )
    middleware_settings = cls(
        driver_name=driver_name,
        driver_executable_path=driver_executable_path,
        driver_arguments=driver_arguments)
    crawler.signals.connect(middleware_settings.spider_opened,
                            signal=signals.spider_opened)
    crawler.signals.connect(middleware_settings.spider_closed,
                            signal=signals.spider_closed)
    return middleware_settings
def __init__(self, settings, aws_access_key_id=None, aws_secret_access_key=None,
             httpdownloadhandler=HTTPDownloadHandler, **kw):
    if not aws_access_key_id:
        aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
    if not aws_secret_access_key:
        aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']

    # If no credentials could be found anywhere,
    # consider this an anonymous connection request by default;
    # unless 'anon' was set explicitly (True/False).
    anon = kw.get('anon')
    if anon is None and not aws_access_key_id and not aws_secret_access_key:
        kw['anon'] = True
    self.anon = kw.get('anon')

    self._signer = None
    if is_botocore():
        import botocore.auth
        import botocore.credentials
        kw.pop('anon', None)
        if kw:
            raise TypeError('Unexpected keyword arguments: %s' % kw)
        if not self.anon:
            SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
            self._signer = SignerCls(
                botocore.credentials.Credentials(aws_access_key_id,
                                                 aws_secret_access_key))
    else:
        _S3Connection = _get_boto_connection()
        try:
            self.conn = _S3Connection(aws_access_key_id,
                                      aws_secret_access_key, **kw)
        except Exception as ex:
            raise NotConfigured(str(ex))

    self._download_http = httpdownloadhandler(settings).download_request
def __init__(self):
    # Open database connection
    self.db = mysql.connect(host=ROJAK_DB_HOST, port=ROJAK_DB_PORT,
                            user=ROJAK_DB_USER, passwd=ROJAK_DB_PASS,
                            db=ROJAK_DB_NAME)
    self.cursor = self.db.cursor()
    self.media = {}
    try:
        # Get media information from the database
        self.logger.info('Fetching media information')
        self.cursor.execute(sql_get_media, [self.name])
        row = self.cursor.fetchone()
        self.media['id'] = row[0]
        self.media['last_scraped_at'] = row[1]
    except mysql.Error as err:
        self.logger.error('Unable to fetch media data: {}'.format(err))
        raise NotConfigured('Unable to fetch media data: {}'.format(err))

    if ROJAK_SLACK_TOKEN != '':
        self.is_slack = True
        self.slack = Slacker(ROJAK_SLACK_TOKEN)
    else:
        self.is_slack = False
        self.logger.info('Posting errors to #rojak-pantau-errors is disabled')
def from_crawler(cls, crawler):
    s = crawler.settings
    proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
    if proxy_path is not None:
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
    else:
        proxy_list = s.getlist('ROTATING_PROXY_LIST')
    if not proxy_list:
        raise NotConfigured()
    mw = cls(
        proxy_list=proxy_list,
        logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
        stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
        max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
        backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
        backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
        crawler=crawler,
    )
    crawler.signals.connect(mw.engine_started, signal=signals.engine_started)
    crawler.signals.connect(mw.engine_stopped, signal=signals.engine_stopped)
    return mw
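# Hedged example: a settings.py sketch for the rotating-proxy from_crawler
# above. The setting names are taken from that snippet; the proxy addresses,
# the file path and the numeric values are illustrative placeholders.
ROTATING_PROXY_LIST = [
    'proxy1.example.com:8000',
    'proxy2.example.com:8031',
]
# Alternatively, point at a file containing one proxy per line:
# ROTATING_PROXY_LIST_PATH = '/path/to/proxies.txt'
ROTATING_PROXY_PAGE_RETRY_TIMES = 5    # proxies to try for a page before giving up
ROTATING_PROXY_BACKOFF_BASE = 300      # seconds
ROTATING_PROXY_BACKOFF_CAP = 3600      # seconds
ROTATING_PROXY_CLOSE_SPIDER = False    # close the spider when no proxies are left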
def __init__(self, crawler):
    if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):  # 1
        raise NotConfigured
    if not TWISTED_CONCH_AVAILABLE:
        raise NotConfigured(
            'TELNETCONSOLE_ENABLED setting is True but required twisted '
            'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
    self.crawler = crawler
    self.noisy = False
    self.portrange = [
        int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')
    ]  # [6023, 6073]
    self.host = crawler.settings['TELNETCONSOLE_HOST']  # 127.0.0.1
    self.username = crawler.settings['TELNETCONSOLE_USERNAME']  # scrapy
    self.password = crawler.settings['TELNETCONSOLE_PASSWORD']  # None
    if not self.password:
        self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
        logger.info('Telnet Password: %s', self.password)
    self.crawler.signals.connect(self.start_listening, signals.engine_started)
    self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)
def from_crawler(cls, crawler): """Called by Scrapy to create an instance of this middleware. :param crawler: Current crawler :type crawler: Crawler object :raises NotConfigured: Issue with middleware settings :return: Instance of the middleware :rtype: RandomProxyMiddelware """ # Get all the settings s = crawler.settings # Check if eabled if not s.getbool('SSP_ENABLED', default=False): raise NotConfigured( 'scrapy_scylla_proxies middleware is not enabled') # Fetch my settings scylla_uri = s.get('SSP_SCYLLA_URI', default='http://localhost:8899') timeout = s.getint('SSP_PROXY_TIMEOUT', default=60) https = s.getbool('SSP_HTTPS', default=True) splash_request_enabled = s.getbool( 'SSP_SPLASH_REQUEST_ENABLED', default=False) # Create a a Scylla object scylla = Scylla(scylla_uri) # Create an instance of this middleware mw = cls(scylla, timeout, https, crawler, splash_request_enabled) # Connect to signals crawler.signals.connect( mw.spider_closed, signal=signals.spider_closed) return mw
def _yield(self, items, response, data):
    if not items.get('yield'):
        return
    info = items.get('yield')
    if isinstance(info, six.string_types):
        if info != 'item':
            raise TypeError("unknown yield type, expecting `item`.")
        item_cls = self.create_item('ScraproItem', data.keys())
        item = item_cls(data)
        return item
    elif isinstance(info, dict):
        info = copy.deepcopy(info)
        url_key = self._find_key(info, 'url')
        if not url_key:
            raise NotConfigured("need param url.")
        v = info.pop(url_key)
        url = self._extract(url_key, v, response, data=data)
        callback = info.pop('callback')
        kwargs = self._parse_yield(info, data)
        return Request(url, getattr(self, callback), **kwargs)
    else:
        raise TypeError("Unexpected yield type, expecting string or dict.")
def from_crawler(cls, crawler):
    # First check whether the extension should be enabled;
    # otherwise raise NotConfigured.
    if not crawler.settings.getbool('MYEXT_ENABLED'):
        raise NotConfigured
    if 'redis_key' not in crawler.spidercls.__dict__.keys():
        raise NotConfigured('Only supports RedisSpider')

    # Read the number of idle time slices from the settings,
    # defaulting to 360 (30 minutes).
    idle_number = crawler.settings.getint('IDLE_NUMBER', 360)

    # Instantiate the extension object
    ext = cls(idle_number, crawler)

    # Connect the extension object to signals, e.g. associate
    # signals.spider_idle with the spider_idle() method.
    crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)

    # return the extension object
    return ext
def __init__(self, settings):
    if not any(self.__class__.__name__ in s
               for s in settings.getwithbase('SPIDER_MIDDLEWARES').keys()):
        raise ValueError('%s must be in SPIDER_MIDDLEWARES' % (
            self.__class__.__name__,))
    if not settings.getbool('AUTOUNIT_ENABLED'):
        raise NotConfigured('scrapy-autounit is not enabled')
    if settings.getint('CONCURRENT_REQUESTS') > 1:
        logger.warning(
            'Recording with concurrency > 1! '
            'Data races in shared object modification may create broken '
            'tests.')

    self.max_fixtures = settings.getint(
        'AUTOUNIT_MAX_FIXTURES_PER_CALLBACK', default=10)
    self.max_fixtures = \
        self.max_fixtures if self.max_fixtures >= 10 else 10

    self.base_path = settings.get(
        'AUTOUNIT_BASE_PATH',
        default=os.path.join(get_project_dir(), 'autounit'))
    create_dir(self.base_path, exist_ok=True)

    self.fixture_counters = {}
def from_crawler(cls, crawler):
    settings = crawler.settings
    if not settings.get('DATABASE_URI'):
        raise NotConfigured('Database settings are not configured.')
    return cls()
def from_crawler(cls, crawler): dsn = crawler.settings.get("SENTRY_DSN", None) if dsn is None: raise NotConfigured('No SENTRY_DSN configured') o = cls(dsn=dsn) return o