Code example #1
    def from_crawler(cls, crawler):
        """Initialize the middleware with the crawler settings"""

        driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME')
        driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
        driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')

        if not driver_name or not driver_executable_path:
            raise NotConfigured(
                'SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set'
            )

        middleware = cls(
            driver_name=driver_name,
            driver_executable_path=driver_executable_path,
            driver_arguments=driver_arguments
        )

        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)

        return middleware
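
A minimal settings sketch that would satisfy the check above; only the setting names come from the example, while the driver name, executable path, and arguments are illustrative assumptions:

# settings.py (hypothetical values; only the setting names appear in the example above)
SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/usr/local/bin/geckodriver'
SELENIUM_DRIVER_ARGUMENTS = ['-headless']

If either SELENIUM_DRIVER_NAME or SELENIUM_DRIVER_EXECUTABLE_PATH is missing, the middleware raises NotConfigured and Scrapy disables it.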
Code example #2
 def from_crawler(cls, crawler):
     s = crawler.settings
     proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
     if proxy_path is not None:
         with codecs.open(proxy_path, 'r', encoding='utf8') as f:
             proxy_list = [line.strip() for line in f if line.strip()]
     else:
         proxy_list = s.getlist('ROTATING_PROXY_LIST')
     if not proxy_list:
         raise NotConfigured()
     mw = cls(proxy_list=proxy_list,
              stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER',
                                           False),
              max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES',
                                          5),
              max_order=s.getint("PROXY_MAX_ORDER", 1000000),
              timeout_if_no_proxy=s.getint("TIMEOUT_IF_NO_PROXY", 300),
              proxy_download_delay=s.getint("PROXY_DELAY", 3),
              randomize_download_delay=s.getbool("RANDOMIZE_DOWNLOAD_DELAY",
                                                 True))
     return mw
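
A matching settings sketch for this rotating-proxy middleware, with placeholder proxy addresses; only the setting names appear in the example above:

# settings.py (placeholder proxies)
ROTATING_PROXY_LIST = [
    'proxy1.example.com:8000',
    'proxy2.example.com:8031',
]
# Alternatively, load proxies from a file with one entry per line:
# ROTATING_PROXY_LIST_PATH = '/path/to/proxies.txt'

If neither setting yields a non-empty list, the middleware raises NotConfigured and is disabled.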
Code example #3
    def _parse(self, response, **kwargs):
        if not hasattr(self, "parse_node"):
            raise NotConfigured(
                "You must define parse_node method in order to scrape this XML feed"
            )

        response = self.adapt_response(response)
        if self.iterator == "iternodes":
            nodes = self._iternodes(response)
        elif self.iterator == "xml":
            selector = Selector(response, type="xml")
            self._register_namespaces(selector)
            nodes = selector.xpath(f"//{self.itertag}")
        elif self.iterator == "html":
            selector = Selector(response, type="html")
            self._register_namespaces(selector)
            nodes = selector.xpath(f"//{self.itertag}")
        else:
            raise NotSupported("Unsupported node iterator")

        return self.parse_nodes(response, nodes)
Code example #4
    def parse(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured(
                'You must define parse_node method in order to scrape this XML feed'
            )

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = xmliter(response, self.itertag)
        elif self.iterator == 'xml':
            selector = XmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = HtmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
Code example #5
File: feed.py Project: sernakos/scrapy
    def _parse(self, response, **kwargs):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured(
                'You must define parse_node method in order to scrape this XML feed'
            )

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = Selector(response, type='xml')
            self._register_namespaces(selector)
            nodes = selector.xpath(f'//{self.itertag}')
        elif self.iterator == 'html':
            selector = Selector(response, type='html')
            self._register_namespaces(selector)
            nodes = selector.xpath(f'//{self.itertag}')
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
Code example #6
    def __init__(self, settings, *,
                 crawler=None,
                 aws_access_key_id=None, aws_secret_access_key=None,
                 aws_session_token=None,
                 httpdownloadhandler=HTTPDownloadHandler, **kw):
        if not is_botocore_available():
            raise NotConfigured('missing botocore library')

        if not aws_access_key_id:
            aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
        if not aws_secret_access_key:
            aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
        if not aws_session_token:
            aws_session_token = settings['AWS_SESSION_TOKEN']

        # If no credentials could be found anywhere,
        # consider this an anonymous connection request by default;
        # unless 'anon' was set explicitly (True/False).
        anon = kw.get('anon')
        if anon is None and not aws_access_key_id and not aws_secret_access_key:
            kw['anon'] = True
        self.anon = kw.get('anon')

        self._signer = None
        import botocore.auth
        import botocore.credentials
        kw.pop('anon', None)
        if kw:
            raise TypeError(f'Unexpected keyword arguments: {kw}')
        if not self.anon:
            SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
            self._signer = SignerCls(botocore.credentials.Credentials(
                aws_access_key_id, aws_secret_access_key, aws_session_token))

        _http_handler = create_instance(
            objcls=httpdownloadhandler,
            settings=settings,
            crawler=crawler,
        )
        self._download_http = _http_handler.download_request
Code example #7
 def __init__(self, crawler):
     self.crawler = crawler
     settings = self.crawler.settings
     try:
         self.producer = AutoProducer(
             bootstrap_servers=settings.getlist(KAFKA_PRODUCER_BROKERS),
             configs=settings.get(KAFKA_PRODUCER_CONFIGS, None),
             topic=settings.get(KAFKA_PRODUCER_TOPIC, None),
             kafka_loglevel=loglevel(
                 settings.get(KAFKA_PRODUCER_LOGLEVEL, "WARNING")),
         )
     except Exception as e:
         raise NotConfigured(f"init producer {e}")
     self.logger = logging.getLogger(self.__class__.__name__)
     crawler.signals.connect(self.spider_closed, signals.spider_closed)
     self.exporter = TextDictKeyPythonItemExporter(
         binary=False,
         ensure_base64=settings.getbool(KAFKA_VALUE_ENSURE_BASE64, False),
     )
     self.encoder = ScrapyJSNONBase64Encoder()
     self.field_filter = set(settings.getlist("KAFKA_EXPORT_FILTER", []))
     self.logger.debug(f"KAFKA_EXPORT_FILTER: {self.field_filter}")
Code example #8
    def parse_epoch(epoch):
        if isinstance(epoch, bool) or isinstance(epoch, datetime):
            return epoch
        elif epoch == 'True':
            return True
        elif epoch == 'False':
            return False

        try:
            return datetime.strptime(epoch, EPOCH_DATE_FORMAT)
        except ValueError:
            pass

        parser = parsedatetime.Calendar(Constants())
        time_tuple = parser.parse(
            epoch)  # e.g. 'yesterday' => (time.struct_time, int)
        if not time_tuple[1]:
            raise NotConfigured('Could not parse epoch: %s' % epoch)

        time_struct = time_tuple[0]

        return datetime(*time_struct[:6])
Code example #9
def get_env_variable(var_name, default=None):
    """ Get the environment variable or raise an exception.

    Args:
        var_name (str): the name of the environment variable.

    Keyword Args:
        default (str): the default value to use if the environment variable
            is not set.

    Returns:
        str: the value for the specified environment variable.

    Raises:
        NotConfigured: if the environment variable with the specified name
            is not set and no default value was given.
    """
    value = os.environ.get(var_name, default)
    if value is None:
        from scrapy.exceptions import NotConfigured
        raise NotConfigured("Set the %s environment variable" % var_name)
    return value
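
A short usage sketch for get_env_variable; the variable names are hypothetical:

# Hypothetical usage in a settings module
DB_HOST = get_env_variable('DB_HOST', default='localhost')  # falls back to the default
API_TOKEN = get_env_variable('API_TOKEN')  # raises NotConfigured if the variable is unset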
Code example #10
    def from_crawler(cls, crawler, client=None, dsn=None):
        release = crawler.settings.get("RELEASE", get_release(crawler))
        additional_opts = crawler.settings.get("SENTRY_CLIENT_OPTIONS", {})

        dsn = os.environ.get("SENTRY_DSN",
                             crawler.settings.get("SENTRY_DSN", None))
        if dsn is None:
            raise NotConfigured('No SENTRY_DSN configured')
        o = cls(dsn=dsn, release=release, **additional_opts)

        sentry_signals = crawler.settings.get("SENTRY_SIGNALS", [])
        if len(sentry_signals) > 0:
            receiver = o.spider_error
            for signalpath in sentry_signals:
                signalmodule, signalname = signalpath.rsplit('.', 1)
                _m = importlib.import_module(signalmodule)
                signal = getattr(_m, signalname)
                crawler.signals.connect(receiver, signal=signal)
        else:
            crawler.signals.connect(o.spider_error,
                                    signal=signals.spider_error)
        return o
Code example #11
    def __init__(self, settings):
        self.google_cloud_enabled = settings.getbool('GOOGLE_CLOUD_ENABLED')

        if self.google_cloud_enabled:
            credentials_json = settings.get(
                'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON')
            if credentials_json:
                if not os.path.isfile(self.credentials_json_path):
                    with open(self.credentials_json_path, 'w') as outfile:
                        outfile.write(credentials_json)

                os.environ[
                    'GOOGLE_APPLICATION_CREDENTIALS'] = self.credentials_json_path
                logger.info('Google Cloud extension initialized successfully')
            else:
                settings.set('GOOGLE_CLOUD_ENABLED', False)
                raise NotConfigured(
                    'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON '
                    'is not set in settings')

        else:
            logger.info('GOOGLE_CLOUD_ENABLED is False')
Code example #12
    def __init__(self):
        # Open database connection
        self.db = mysql.connect(host=ROJAK_DB_HOST,
                                port=ROJAK_DB_PORT,
                                user=ROJAK_DB_USER,
                                passwd=ROJAK_DB_PASS,
                                db=ROJAK_DB_NAME)
        self.cursor = self.db.cursor()
        self.cursor_urls = self.db.cursor()

        # Using UTF-8 Encoding
        self.db.set_character_set('utf8')
        self.cursor.execute('SET NAMES utf8;')
        self.cursor.execute('SET CHARACTER SET utf8;')
        self.cursor.execute('SET character_set_connection=utf8;')

        self.media = {}
        self.media['name'] = 'detikcom'
        try:
            # Get media information from the database
            self.logger.info('Fetching media information')
            self.cursor.execute(sql_get_media, [self.media['name']])
            row = self.cursor.fetchone()
            self.media['id'] = row[0]
            self.media['last_scraped_at'] = row[1]
        except mysql.Error as err:
            self.logger.error('Unable to fetch media data: %s', err)
            raise NotConfigured('Unable to fetch media data: %s' % err)

        if ROJAK_SLACK_TOKEN != '':
            self.is_slack = True
            self.slack = Slacker(ROJAK_SLACK_TOKEN)
        else:
            self.is_slack = False
            self.logger.info('Post error to #rojak-pantau-errors is disabled')

        # We execute the cursor here and we fetch one by one
        self.cursor_urls.execute(sql_get_urls, [self.media['id']])
Code example #13
    def from_crawler(cls, crawler):
        settings = crawler.settings

        if crawler.settings.getbool('BROWSER_ENGINE_COOKIES_ENABLED', False):
            if crawler.settings.getbool('COOKIES_ENABLED'):
                logger.warning("Default cookies middleware enabled together "
                               "with browser engine aware cookies middleware. "
                               "Set COOKIES_ENABLED to False.")
            cookies_mw = RemotelyAccessbileCookiesMiddleware(
                debug=crawler.settings.getbool('COOKIES_DEBUG'))
        else:
            cookies_mw = None

        server = settings.get('BROWSER_ENGINE_SERVER')
        if server:
            endpoint = clientFromString(reactor, server)
        else:
            if settings.getbool('BROWSER_ENGINE_START_SERVER', False):
                # Twisted logs the process's stderr with INFO level.
                logging.getLogger("twisted").setLevel(logging.INFO)
                argv = [
                    sys.executable, "-m", "scrapy_qtwebkit.browser_engine",
                    "stdio"
                ]
                endpoint = ProcessEndpoint(reactor, argv[0], argv, env=None)
            else:
                raise NotConfigured(
                    "Must provide either BROWSER_ENGINE_SERVER "
                    "or BROWSER_ENGINE_START_SERVER")

        ext = cls(
            crawler,
            endpoint,
            page_limit=settings.getint('BROWSER_ENGINE_PAGE_LIMIT', 4),
            cookies_middleware=cookies_mw,
        )

        return ext
Code example #14
    def from_crawler(cls, crawler):
        s = crawler.settings
        proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
        proxy_url = s.get('ROTATING_PROXY_URL', None)

        if proxy_path is not None:
            with codecs.open(proxy_path, 'r', encoding='utf8') as f:
                proxy_list = [line.strip() for line in f if line.strip()]

        elif proxy_url is not None:
            http = urllib3.PoolManager()
            request = http.request('GET', proxy_url)
            if request.status == 200:
                proxies_str = request.data.decode('utf-8').strip()
                proxies = proxies_str.split('\r\n')
                logger.info(f"Fetched proxy list with {len(proxies)} entries")
                proxy_list = proxies
            else:
                logger.error("Proxy list request did not return HTTP 200")
                proxy_list = []  # avoid a NameError below; NotConfigured is raised instead

        else:
            proxy_list = s.getlist('ROTATING_PROXY_LIST')
        if not proxy_list:
            raise NotConfigured()
        mw = cls(
            proxy_list=proxy_list,
            logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
            stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
            max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
            backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
            backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
            crawler=crawler,
        )
        crawler.signals.connect(mw.engine_started,
                                signal=signals.engine_started)
        crawler.signals.connect(mw.engine_stopped,
                                signal=signals.engine_stopped)
        return mw
Code example #15
 def from_crawler(cls, crawler):
     s = crawler.settings
     ninja_key = s.get('ROTATING_NINJA_KEY', None)
     proxy_list = s.getlist('ROTATING_PROXY_LIST', [])
     if ninja_key is None and len(proxy_list) == 0:
         raise NotConfigured()
     mw = cls(
         ninja_key=ninja_key,
         proxy_list=proxy_list,
         logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL',
                                      30),
         renew_interval=s.getfloat('ROTATING_NINJA_RENEW_INTERVAL', 1200),
         stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
         max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
         backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
         backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
         crawler=crawler,
     )
     crawler.signals.connect(mw.engine_started,
                             signal=signals.engine_started)
     crawler.signals.connect(mw.engine_stopped,
                             signal=signals.engine_stopped)
     return mw
Code example #16
    def __init__(self, crawler):
        if not crawler.settings.getbool("TELNETCONSOLE_ENABLED"):
            raise NotConfigured
        if not TWISTED_CONCH_AVAILABLE:
            raise NotConfigured(
                "TELNETCONSOLE_ENABLED setting is True but required twisted "
                "modules failed to import:\n" + _TWISTED_CONCH_TRACEBACK
            )
        self.crawler = crawler
        self.noisy = False
        self.portrange = [
            int(x) for x in crawler.settings.getlist("TELNETCONSOLE_PORT")
        ]
        self.host = crawler.settings["TELNETCONSOLE_HOST"]
        self.username = crawler.settings["TELNETCONSOLE_USERNAME"]
        self.password = crawler.settings["TELNETCONSOLE_PASSWORD"]

        if not self.password:
            self.password = binascii.hexlify(os.urandom(8)).decode("utf8")
            logger.info("Telnet Password: %s", self.password)

        self.crawler.signals.connect(self.start_listening, signals.engine_started)
        self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)
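
A settings sketch for the telnet console; the values mirror the inline annotations in code example #24 below and may differ between Scrapy versions:

# settings.py (defaults as annotated in code example #24)
TELNETCONSOLE_ENABLED = True
TELNETCONSOLE_PORT = [6023, 6073]
TELNETCONSOLE_HOST = '127.0.0.1'
TELNETCONSOLE_USERNAME = 'scrapy'
TELNETCONSOLE_PASSWORD = None  # when unset, a random password is generated and logged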
Code example #17
    def from_crawler(cls, crawler):
        """Get the settings and initialize"""

        algolia_api_id = crawler.settings['ALGOLIA_API_ID']
        algolia_api_key = crawler.settings['ALGOLIA_API_KEY']
        algolia_index_name = crawler.settings['ALGOLIA_INDEX_NAME']

        if not algolia_api_id or not algolia_api_key or not algolia_index_name:
            raise NotConfigured(
                'Missing configuration for the AlgoliaItemPipeline'
            )

        algolia_item_bulk_nbr = crawler.settings.get(
            'ALGOLIA_ITEM_BULK_NBR',
            100
        )

        return cls(
            algolia_api_id,
            algolia_api_key,
            algolia_index_name,
            algolia_item_bulk_nbr
        )
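
A settings sketch for this pipeline; only the setting names come from the example, and the values are placeholders:

# settings.py (placeholder credentials)
ALGOLIA_API_ID = 'your-app-id'
ALGOLIA_API_KEY = 'your-admin-api-key'
ALGOLIA_INDEX_NAME = 'your-index'
ALGOLIA_ITEM_BULK_NBR = 100  # optional; 100 is the default used above

If any of the three required settings is falsy, the pipeline raises NotConfigured.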
Code example #18
 def makeRequest(self,
                 url,
                 referenceUrl=None,
                 callBackFunctionName=None,
                 meta=None,
                 urlId=None,
                 priority=1,
                 **kw):
     '''
     Create a Request.
     '''
     if meta is None:
         meta = {}
     if not urlId:
         raise NotConfigured('Spider %s created a Request for url %s without an id; '
                             'the url status cannot be updated' % (self.name, url))
     if callBackFunctionName is not None:
         # Risky: requests initialized from scrapyApt only carry the callback name.
         print('Warning: request initialized from scrapyApt only has the callback function name')
         kw.setdefault('callback', callBackFunctionName)
     meta['urlId'] = urlId
     meta['download_timeout'] = 180
     meta['depth'] = 0
     kw.setdefault('meta', meta)
     kw.setdefault('priority', priority)
     return Request(url, **kw)
Code example #19
File: feedexport.py Project: shenyafeng/scrapy
 def __init__(self, uri, access_key=None, secret_key=None, acl=None, endpoint_url=None, *,
              feed_options=None, session_token=None):
     if not is_botocore_available():
         raise NotConfigured('missing botocore library')
     u = urlparse(uri)
     self.bucketname = u.hostname
     self.access_key = u.username or access_key
     self.secret_key = u.password or secret_key
     self.session_token = session_token
     self.keyname = u.path[1:]  # remove first "/"
     self.acl = acl
     self.endpoint_url = endpoint_url
     import botocore.session
     session = botocore.session.get_session()
     self.s3_client = session.create_client(
         's3', aws_access_key_id=self.access_key,
         aws_secret_access_key=self.secret_key,
         aws_session_token=self.session_token,
         endpoint_url=self.endpoint_url)
     if feed_options and feed_options.get('overwrite', True) is False:
         logger.warning('S3 does not support appending to files. To '
                        'suppress this warning, remove the overwrite '
                        'option from your FEEDS setting or set it to True.')
Code example #20
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create the middleware instance.
        # Initialize settings for selenium driver with the crawler settings

        driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME')
        driver_executable_path = crawler.settings.get(
            'SELENIUM_DRIVER_EXECUTABLE_PATH')
        driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')

        if not driver_name or not driver_executable_path:
            raise NotConfigured(
                'SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set'
            )

        middleware_settings = cls(
            driver_name=driver_name,
            driver_executable_path=driver_executable_path,
            driver_arguments=driver_arguments)
        crawler.signals.connect(middleware_settings.spider_opened,
                                signal=signals.spider_opened)
        crawler.signals.connect(middleware_settings.spider_closed,
                                signal=signals.spider_closed)
        return middleware_settings
Code example #21
File: s3.py Project: yashagarwal1999/scrapy
    def __init__(self, settings, aws_access_key_id=None, aws_secret_access_key=None, \
            httpdownloadhandler=HTTPDownloadHandler, **kw):

        if not aws_access_key_id:
            aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
        if not aws_secret_access_key:
            aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']

        # If no credentials could be found anywhere,
        # consider this an anonymous connection request by default;
        # unless 'anon' was set explicitly (True/False).
        anon = kw.get('anon')
        if anon is None and not aws_access_key_id and not aws_secret_access_key:
            kw['anon'] = True
        self.anon = kw.get('anon')

        self._signer = None
        if is_botocore():
            import botocore.auth
            import botocore.credentials
            kw.pop('anon', None)
            if kw:
                raise TypeError('Unexpected keyword arguments: %s' % kw)
            if not self.anon:
                SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
                self._signer = SignerCls(
                    botocore.credentials.Credentials(aws_access_key_id,
                                                     aws_secret_access_key))
        else:
            _S3Connection = _get_boto_connection()
            try:
                self.conn = _S3Connection(aws_access_key_id,
                                          aws_secret_access_key, **kw)
            except Exception as ex:
                raise NotConfigured(str(ex))

        self._download_http = httpdownloadhandler(settings).download_request
Code example #22
File: sindonews.py Project: rrmdn/rojak
    def __init__(self):
        # Open database connection
        self.db = mysql.connect(host=ROJAK_DB_HOST, port=ROJAK_DB_PORT,
            user=ROJAK_DB_USER, passwd=ROJAK_DB_PASS, db=ROJAK_DB_NAME)
        self.cursor = self.db.cursor()

        self.media = {}
        try:
            # Get media information from the database
            self.logger.info('Fetching media information')
            self.cursor.execute(sql_get_media, [self.name])
            row = self.cursor.fetchone()
            self.media['id'] = row[0]
            self.media['last_scraped_at'] = row[1]
        except mysql.Error as err:
            self.logger.error('Unable to fetch media data: {}'.format(err))
            raise NotConfigured('Unable to fetch media data: {}'.format(err))

        if ROJAK_SLACK_TOKEN != '':
            self.is_slack = True
            self.slack = Slacker(ROJAK_SLACK_TOKEN)
        else:
            self.is_slack = False
            self.logger.info('Post error to #rojak-pantau-errors is disabled')
Code example #23
 def from_crawler(cls, crawler):
     s = crawler.settings
     proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
     if proxy_path is not None:
         with codecs.open(proxy_path, 'r', encoding='utf8') as f:
             proxy_list = [line.strip() for line in f if line.strip()]
     else:
         proxy_list = s.getlist('ROTATING_PROXY_LIST')
     if not proxy_list:
         raise NotConfigured()
     mw = cls(
         proxy_list=proxy_list,
         logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
         stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
         max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
         backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
         backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
         crawler=crawler,
     )
     crawler.signals.connect(mw.engine_started,
                             signal=signals.engine_started)
     crawler.signals.connect(mw.engine_stopped,
                             signal=signals.engine_stopped)
     return mw
Code example #24
    def __init__(self, crawler):
        if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):  # 1
            raise NotConfigured
        if not TWISTED_CONCH_AVAILABLE:
            raise NotConfigured(
                'TELNETCONSOLE_ENABLED setting is True but required twisted '
                'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
        self.crawler = crawler
        self.noisy = False
        self.portrange = [
            int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')
        ]  # [6023, 6073]
        self.host = crawler.settings['TELNETCONSOLE_HOST']  # 127.0.0.1
        self.username = crawler.settings['TELNETCONSOLE_USERNAME']  # scrapy
        self.password = crawler.settings['TELNETCONSOLE_PASSWORD']  # None

        if not self.password:
            self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
            logger.info('Telnet Password: %s', self.password)

        self.crawler.signals.connect(self.start_listening,
                                     signals.engine_started)
        self.crawler.signals.connect(self.stop_listening,
                                     signals.engine_stopped)
Code example #25
    def from_crawler(cls, crawler):
        """Called by Scrapy to create an instance of this middleware.

        :param crawler: Current crawler
        :type crawler: Crawler object
        :raises NotConfigured: Issue with middleware settings
        :return: Instance of the middleware
        :rtype: RandomProxyMiddelware
        """

        # Get all the settings
        s = crawler.settings

        # Check if enabled
        if not s.getbool('SSP_ENABLED', default=False):
            raise NotConfigured(
                'scrapy_scylla_proxies middleware is not enabled')

        # Fetch my settings
        scylla_uri = s.get('SSP_SCYLLA_URI', default='http://localhost:8899')
        timeout = s.getint('SSP_PROXY_TIMEOUT', default=60)
        https = s.getbool('SSP_HTTPS', default=True)
        splash_request_enabled = s.getbool(
            'SSP_SPLASH_REQUEST_ENABLED', default=False)

        # Create a Scylla object
        scylla = Scylla(scylla_uri)

        # Create an instance of this middleware
        mw = cls(scylla, timeout, https, crawler, splash_request_enabled)

        # Connect to signals
        crawler.signals.connect(
            mw.spider_closed, signal=signals.spider_closed)

        return mw
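
A settings sketch for this middleware; the setting names and defaults are the ones read in the example above:

# settings.py (defaults as read above)
SSP_ENABLED = True
SSP_SCYLLA_URI = 'http://localhost:8899'
SSP_PROXY_TIMEOUT = 60
SSP_HTTPS = True
SSP_SPLASH_REQUEST_ENABLED = False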
Code example #26
    def _yield(self, items, response, data):
        if not items.get('yield'):
            return
        info = items.get('yield')
        if isinstance(info, six.string_types):
            if info != 'item':
                raise TypeError("unkwnown yield type, excepting `item`.")

            item_cls = self.create_item('ScraproItem', data.keys())
            item = item_cls(data)
            return item

        elif isinstance(info, dict):
            info = copy.deepcopy(info)
            url_key = self._find_key(info, 'url')
            if not url_key:
                raise NotConfigured("need param url.")
            v = info.pop(url_key)
            url = self._extract(url_key, v, response, data=data)
            callback = info.pop('callback')
            kwargs = self._parse_yield(info, data)
            return Request(url, getattr(self, callback), **kwargs)
        else:
            raise TypeError("Unexcepted yield type, excepting string or dict.")
Code example #27
    def from_crawler(cls, crawler):
        # First check whether the extension should be enabled;
        # raise NotConfigured otherwise.

        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured

        if 'redis_key' not in crawler.spidercls.__dict__.keys():
            raise NotConfigured('Only supports RedisSpider')

        # Get the number of idle time slices from the settings
        # (default 360, roughly 30 minutes).
        idle_number = crawler.settings.getint('IDLE_NUMBER', 360)

        # Instantiate the extension object.
        ext = cls(idle_number, crawler)

        # Connect the extension object to the signals, tying
        # signals.spider_idle to the spider_idle() method.
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)

        # Return the extension object.
        return ext
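
A settings sketch for this idle-spider extension; the setting names come from the example, and the extension still has to be registered in the project's EXTENSIONS setting under whatever module path it actually lives at:

# settings.py (illustrative)
MYEXT_ENABLED = True
IDLE_NUMBER = 360  # number of idle time slices to wait, roughly 30 minutes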
Code example #28
    def __init__(self, settings):
        if not any(self.__class__.__name__ in s
                   for s in settings.getwithbase('SPIDER_MIDDLEWARES').keys()):
            raise ValueError('%s must be in SPIDER_MIDDLEWARES' %
                             (self.__class__.__name__, ))
        if not settings.getbool('AUTOUNIT_ENABLED'):
            raise NotConfigured('scrapy-autounit is not enabled')
        if settings.getint('CONCURRENT_REQUESTS') > 1:
            logger.warning(
                'Recording with concurrency > 1! '
                'Data races in shared object modification may create broken '
                'tests.')

        self.max_fixtures = settings.getint(
            'AUTOUNIT_MAX_FIXTURES_PER_CALLBACK', default=10)
        self.max_fixtures = \
            self.max_fixtures if self.max_fixtures >= 10 else 10

        self.base_path = settings.get('AUTOUNIT_BASE_PATH',
                                      default=os.path.join(
                                          get_project_dir(), 'autounit'))
        create_dir(self.base_path, exist_ok=True)

        self.fixture_counters = {}
Code example #29
 def from_crawler(cls, crawler):
     settings = crawler.settings
     if not settings.get('DATABASE_URI'):
         raise NotConfigured('Database settings are not configured.')
     return cls()
Code example #30
 def from_crawler(cls, crawler):
     dsn = crawler.settings.get("SENTRY_DSN", None)
     if dsn is None:
         raise NotConfigured('No SENTRY_DSN configured')
     o = cls(dsn=dsn)
     return o