def __init__(self, *arg, **argdict): """ 初始化对象属性 """ self.rule = '' self.titleXpath = '' self.descriptionXpath = '' self.descriptionLenght = 0 self.contentXpath = '' self.linkXpath = '' self.imgUrlXpath = '' self.imageNum = 0 self.goodRemarkXpath = '' self.badRemarkXpath = '' self.zhunfaRemarkXpath = '' self.videoUrlXpath = '' self.pubDateXpath = '' self.guidXpath = '' self.rule_id = '' self.checkTxtXpath = '' self.max_deepth = 0 self.is_duplicate = False self.last_md5 = '' self.next_request_url = '' self.next_page_url_prefix = '' Spider.__init__(self, *arg, **argdict) self.currentNode = None self.isDone = False self.isFirstListPage = True
def __init__(self, **kwargs):
    Spider.__init__(self, **kwargs)
    self.config_file = kwargs.get('config_file', None)
    config = kwargs.get('config', None)
    if self.config_file:
        with open(self.config_file) as f:
            jconfig = jsonload(f)
    elif config:
        jconfig = jsonloads(config)
    else:
        logger.critical('config_file or config is expected')
        raise Exception('config_file or config is expected')
    self.template = config_parse(jconfig)
    # A single entry URL to crawl; useful for testing or for crawling one page in isolation.
    self.test_url = kwargs.get('test_url', None)
    # Maximum number of pages to fetch.
    self.max_pages = kwargs.get('max_pages', None)
    self.max_pages = int(self.max_pages) if self.max_pages is not None else None
    # Extractor test flag.
    self.test_extractor = kwargs.get('test_extractor', None)
    # Entity test flag.
    self.test_entity = kwargs.get('test_entity', None)
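For reference, a minimal sketch of how a config-driven spider like the one above might be launched from a crawl script. The spider class name and the JSON keys are illustrative assumptions; the real schema is whatever config_parse expects, and the spider class must still define a name attribute as Scrapy requires.

# Usage sketch (assumed class name and config keys, for illustration only).
from scrapy.crawler import CrawlerProcess

config = '{"name": "example", "start_urls": ["http://example.com"]}'  # hypothetical config
process = CrawlerProcess()
process.crawl(ConfigurableSpider,               # hypothetical spider class
              config=config,
              test_url='http://example.com/page/1',
              max_pages=10)
process.start()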
def __init__(self, userName='', password='', *args, **kwargs):
    Spider.__init__(self, *args, **kwargs)
    if (userName.replace(' ', '') == '') or (password.replace(' ', '') == ''):
        print('Please provide a username and password')
        self.closed('quit')
    else:
        self.userName = userName
        self.password = password
def __init__(self, crawler, *args, **kwargs):
    print("wwj debug in scrapy spider init")
    Spider.__init__(self, name=None, **kwargs)
    self.seed_mode = crawler.settings.get('SEED_MODE')
    if self.seed_mode == 'seeds':
        seeds_file = crawler.settings.get('SEEDS_FILE')
        self.load_seeds(seeds_file)
    elif self.seed_mode == 'redis':
        redis_client_path = crawler.settings.get('REDIS_CLIENT')
def __init__(self, *arg, **argdict):
    self.titleXpath = ''
    self.descriptionXpath = ''
    self.descriptionLenght = 0
    self.linkXpath = ''
    self.imgUrlXpath = ''
    self.imageNum = 1
    self.videoUrlXpath = ''
    self.pubDateXpath = ''
    self.guidXpath = ''
    self.rule_id = ''
    self.checkTxtXpath = ''
    self.is_remove_namespaces = False
    Spider.__init__(self, *arg, **argdict)
    self.currentNode = None
def test_store_load(self): jobdir = self.mktemp() os.mkdir(jobdir) spider = Spider(name="default") dt = datetime.now() ss = SpiderState(jobdir) ss.spider_opened(spider) spider.state["one"] = 1 spider.state["dt"] = dt ss.spider_closed(spider) spider2 = Spider(name="default") ss2 = SpiderState(jobdir) ss2.spider_opened(spider2) self.assertEqual(spider.state, {"one": 1, "dt": dt}) ss2.spider_closed(spider2)
def __init__(self):
    Spider.__init__(self)
    # Yesterday's date; timedelta (assumed imported from datetime) avoids the
    # original's "day - 1" arithmetic, which breaks on the first day of a month.
    today = datetime.now() - timedelta(days=1)

    # Convert a YYYY/MM/DD Gregorian date into the ROC ("民國") calendar form
    # the site expects (ROC year = Gregorian year - 1911).
    def date_trans(date_):
        year = int(date_.split('/')[0]) - 1911
        return '/'.join([str(year)] + date_.split('/')[1:])

    self.temp = get_viewstate()
    self.formdata = {
        "ctl00$ScriptManager_Master": "ctl00$contentPlaceHolder$updatePanelMain|ctl00$contentPlaceHolder$btnQuery",
        "ctl00$ucLogin$txtMemberID": "",
        "ctl00$ucLogin$txtPassword": "",
        "ctl00$ucLogin$txtValCode": "",
        "ctl00$contentPlaceHolder$ucSolarLunar$radlSolarLunar": "S",
        "ctl00$contentPlaceHolder$txtSTransDate": date_trans(today.strftime('%Y/%m/%d')),
        "ctl00$contentPlaceHolder$txtETransDate": date_trans(today.strftime('%Y/%m/%d')),
        "ctl00$contentPlaceHolder$txtMarket": "全部市場",
        "ctl00$contentPlaceHolder$hfldMarketNo": "ALL",
        "ctl00$contentPlaceHolder$txtProduct": "全部產品",
        "ctl00$contentPlaceHolder$hfldProductNo": "ALL",
        "ctl00$contentPlaceHolder$hfldProductType": "A",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": self.temp[0],
        "__EVENTVALIDATION": self.temp[1],
        "__ASYNCPOST": "true",
        "ctl00$contentPlaceHolder$btnQuery": "查詢"}
    self.headers = {
        'Referer': self.start_urls,
        'Accept': 'application/json, text/javascript, */*',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"}
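A sketch of how this ASP.NET postback payload would typically be submitted with scrapy.http.FormRequest; the use of start_urls[0] as the target and parse as the callback are assumptions, since the original class body is not shown.

# Usage sketch (assumed target URL and callback).
from scrapy.http import FormRequest

def start_requests(self):
    yield FormRequest(
        url=self.start_urls[0],     # assumes start_urls is defined on the class
        formdata=self.formdata,
        headers=self.headers,
        callback=self.parse,
    )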
def __init__(self, *arg, **argdict): """ 初始化对象属性 """ self.rule = '' self.titleXpath = '' self.descriptionXpath = '' self.descriptionLenght = 0 self.linkXpath = '' self.imgUrlXpath = '' self.imageNum = 1 self.videoUrlXpath = '' self.pubDateXpath = '' self.guidXpath = '' self.rule_id = '' self.checkTxtXpath = '' self.is_remove_namespaces = False self.last_md5 = '' self.next_request_url = '' Spider.__init__(self, *arg, **argdict) self.currentNode = None self.isDone = False self.isFirstListPage = True
def test_connect_request(self):
    request = Request(self.getURL('file'), method='CONNECT')
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b'')
    return d
def setUp(self):
    self.spider = Spider('myspider', start_urls=["http://example.com"])
    self.response = HtmlResponse(body=b"<html></html>", url="http://www.example.com")
def setUp(self):
    self.spider = Spider('foo')
    self.settings = {'CRAWLERA_APIKEY': 'apikey'}
def setUp(self):
    self.spider = Spider('media.com')
    self.pipe = self.pipeline_class(download_func=_mocked_download_func,
                                    settings=Settings(self.settings))
    self.pipe.open_spider(self.spider)
    self.info = self.pipe.spiderinfo
def setUp(self):
    self.spider = Spider('foo')
    self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
    self.req = Request('http://scrapytest.org')
    self.res200, self.res404, self.res402 = _responses(self.req, [200, 404, 402])
def test_download(self):
    request = Request(self.getURL('file'))
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b"0123456789")
    return d
def __init__(self, *args, **kwargs):
    Spider.__init__(self, *args, **kwargs)
    self.redis_conn = None
def setUp(self):
    self.download_handler = DataURIDownloadHandler(Settings())
    self.download_request = self.download_handler.download_request
    self.spider = Spider('foo')
def setUp(self):
    self.spider = Spider('df_tests')
    self.temp_dir = tempfile.gettempdir()
    self.db_path = os.path.join(self.temp_dir, 'df_tests.db')
    crawler = get_crawler(Spider)
    self.stats = StatsCollector(crawler)
def create_scheduler(self):
    self.mock_crawler = MockCrawler(self.priority_queue_cls, self.jobdir)
    self.scheduler = Scheduler.from_crawler(self.mock_crawler)
    self.spider = Spider(name='spider')
    self.scheduler.open(self.spider)
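A hedged sketch of a test that could build on the helper above, exercising the standard Scrapy scheduler round trip. enqueue_request, next_request, and close are the scheduler's actual entry points; the test name and URL are assumptions.

def test_enqueue_dequeue(self):   # hypothetical test built on create_scheduler
    self.create_scheduler()
    request = Request('http://example.com')  # illustrative URL
    # enqueue_request returns True when the request is accepted (not filtered).
    self.assertTrue(self.scheduler.enqueue_request(request))
    self.assertEqual(self.scheduler.next_request().url, request.url)
    self.scheduler.close('finished')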
def test_download_head(self):
    request = Request(self.getURL('file'), method='HEAD')
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b'')  # assertEquals is a deprecated alias
    return d
def setUp(self):
    s3reqh = S3DownloadHandler(Settings(),
                               self.AWS_ACCESS_KEY_ID,
                               self.AWS_SECRET_ACCESS_KEY,
                               httpdownloadhandler=HttpDownloadHandlerMock)
    self.download_request = s3reqh.download_request
    self.spider = Spider('foo')
def setUp(self):
    self.spider = Spider('foo')
    self.mw = RefererMiddleware()
def setUp(self):
    self.spider = Spider('foo')
    self.settings = {'CRAWLERA_USER': '******', 'CRAWLERA_PASS': '******'}
def __init__(self, name=None, **kwargs):
    Spider.__init__(self, name, **kwargs)
def setup_module(module):
    global spider
    spider = Spider('spidr')
def test_download_chunked_content(self):
    request = Request(self.getURL('chunked'))
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b"chunked content\n")
    return d
def setUp(self):
    self.mw = DecompressionMiddleware()
    self.spider = Spider('foo')
def parse(self, response):
    # Note: Scrapy's base Spider.parse raises NotImplementedError,
    # so delegating to it like this is almost certainly unintended.
    Spider.parse(self, response)
def _get_spider(self):
    return Spider('foo')
def setUp(self):
    crawler = get_crawler(Spider)
    self.spider = Spider.from_crawler(crawler, name='foo')
    self.mw = HttpErrorMiddleware(Settings({}))
    self.req = Request('http://scrapytest.org')
    self.res200, self.res404 = _responses(self.req, [200, 404])
def setUp(self):
    crawler = get_crawler()
    self.download_handler = create_instance(DataURIDownloadHandler,
                                            crawler.settings, crawler)
    self.download_request = self.download_handler.download_request
    self.spider = Spider('foo')
def setUp(self):
    self.formatter = LogFormatter()
    self.spider = Spider('default')
def __init__(self):
    Spider.__init__(self)
    self.browser = webdriver.Chrome(
        '/Users/liulizhe/Desktop/python_file/chromedriver')
def __init__(self, *args, **kwargs):
    Spider.__init__(self)
    # The original called SpiderBase.__init__ without passing self.
    SpiderBase.__init__(self, *args, **kwargs)
def test_redirect_status_head(self):
    request = Request(self.getURL('redirect'), method='HEAD')
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.status)
    d.addCallback(self.assertEqual, 302)
    return d
from contextlib import contextmanager
from copy import deepcopy
from functools import partial
from typing import Dict
from urllib.parse import urlparse

from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.spiders import Spider
from twisted.trial.unittest import TestCase

from scrapy_proxy_management.downloadermiddlewares.httpproxy import \
    HttpProxyMiddleware

_spider = Spider('foo')


@contextmanager
def _open_spider(
        spider: Spider,
        settings: Settings,
        auth_encoding: str = 'latin-1'
):
    crawler = Crawler(spider, settings)
    middleware = HttpProxyMiddleware(
        crawler=crawler,
        auth_encoding=auth_encoding
    )
    middleware.open_spider(spider)
    # A @contextmanager generator must yield; the original snippet was
    # truncated here, so the yield and cleanup below are assumptions.
    try:
        yield middleware
    finally:
        middleware.close_spider(spider)
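A hedged usage sketch for the helper above; the test name and the settings key are placeholders, not the middleware's documented options.

def test_middleware_opens(self):        # hypothetical test using _open_spider
    settings = Settings({'HTTPPROXY_ENABLED': True})  # placeholder setting
    with _open_spider(_spider, settings) as middleware:
        # Inside the block the middleware is open for _spider;
        # it is cleaned up on exit.
        assert middleware is not None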
def test_download_with_maxsize_per_req(self):
    meta = {'download_maxsize': 2}
    request = Request(self.getURL('file'), meta=meta)
    d = self.download_request(request, Spider('foo'))
    yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)
def __init__(self, *args, **kwargs):
    Spider.__init__(self)
    self._set_config(**kwargs)
def test_download_with_small_maxsize_per_spider(self):
    request = Request(self.getURL('file'))
    d = self.download_request(request, Spider('foo', download_maxsize=2))
    yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)
def test_download_with_large_maxsize_per_spider(self):
    request = Request(self.getURL('file'))
    d = self.download_request(request, Spider('foo', download_maxsize=100))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b"0123456789")
    return d
def test_non_existent(self):
    request = Request('file://%s' % self.mktemp())
    d = self.download_request(request, Spider('foo'))
    return self.assertFailure(d, IOError)
def __del__(self):
    self.selenium.close()
    print(self.verificationErrors)
    # Assumes the Spider base class defines __del__; Scrapy's base Spider does not.
    Spider.__del__(self)