Example #1
    def __init__(self, *arg, **argdict):
        """ 初始化对象属性 """

        self.rule = ''
        self.titleXpath = ''
        self.descriptionXpath = ''
        self.descriptionLenght = 0
        self.contentXpath = ''
        self.linkXpath = ''
        self.imgUrlXpath = ''
        self.imageNum = 0
        self.goodRemarkXpath = ''
        self.badRemarkXpath = ''
        self.zhunfaRemarkXpath = ''

        self.videoUrlXpath = ''
        self.pubDateXpath = ''
        self.guidXpath = ''
        self.rule_id = ''
        self.checkTxtXpath = ''
        self.max_deepth = 0
        self.is_duplicate = False
        self.last_md5 = ''
        self.next_request_url = ''
        self.next_page_url_prefix = ''
        Spider.__init__(self, *arg, **argdict)
        self.currentNode = None
        self.isDone = False
        self.isFirstListPage = True
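
Scrapy's Spider.__init__ copies any leftover keyword arguments onto the instance, so defaults assigned before the base call, as above, can be overridden per run (e.g. via scrapy crawl ... -a rule_id=42). A minimal sketch of the pattern, assuming only that scrapy is installed (the RuleSpider name is illustrative):

from scrapy import Spider

class RuleSpider(Spider):
    name = 'rule_spider'

    def __init__(self, *args, **kwargs):
        # Defaults first; Spider.__init__ then applies any -a arguments
        # over them via self.__dict__.update(kwargs).
        self.rule_id = ''
        super().__init__(*args, **kwargs)
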
Example #2
    def __init__(self, **kwargs):
        Spider.__init__(self, **kwargs)

        self.config_file = kwargs.get('config_file', None)
        config = kwargs.get('config', None)
        if self.config_file:
            with open(self.config_file) as fh:
                jconfig = jsonload(fh)
        elif config:
            jconfig = jsonloads(config)
        else:
            logger.critical('config_file or config is expected')
            raise Exception('config_file or config is expected')

        self.template = config_parse(jconfig)

        # Specify a single entry URL to crawl; useful for testing, or for crawling one page on its own
        self.test_url = kwargs.get('test_url', None)

        # Specify the number of pages to fetch
        self.max_pages = kwargs.get('max_pages', None)
        self.max_pages = int(self.max_pages) if self.max_pages is not None else None

        # extractor test
        self.test_extractor = kwargs.get('test_extractor', None)

        # entity test
        self.test_entity = kwargs.get('test_entity', None)
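
A hedged sketch of the same file-or-string loading logic, assuming jsonload/jsonloads above are aliases for json.load/json.loads (the load_config name is illustrative):

import json

def load_config(config_file=None, config=None):
    # Prefer a config file on disk; fall back to an inline JSON string.
    if config_file:
        with open(config_file) as fh:
            return json.load(fh)
    if config:
        return json.loads(config)
    raise ValueError('config_file or config is expected')
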
Example #3
	def __init__(self, userName='', password='', *args, **kwargs):
		Spider.__init__(self, *args, **kwargs)
		if userName.replace(' ', '') == '' or password.replace(' ', '') == '':
			print('Please provide a username and password')
			self.closed('quit')
		else:
			self.userName = userName
			self.password = password
Example #4
 def __init__(self, crawler, *args, **kwargs):
     print("wwj debug in scrapy spider init")
     Spider.__init__(self, name=None, **kwargs)
     self.seed_mode = crawler.settings.get('SEED_MODE')
     if self.seed_mode == 'seeds':
         seeds_file = crawler.settings.get('SEEDS_FILE')
         self.load_seeds(seeds_file)
     elif self.seed_mode == 'redis':
         redis_client_path = crawler.settings.get('REDIS_CLIENT')
Example #5
    def __init__(self, *arg, **argdict):

        self.titleXpath = ''
        self.descriptionXpath = ''
        self.descriptionLenght = 0
        self.linkXpath = ''
        self.imgUrlXpath = ''
        self.imageNum = 1
        self.videoUrlXpath = ''
        self.pubDateXpath = ''
        self.guidXpath = ''
        self.rule_id = ''
        self.checkTxtXpath = ''
        self.is_remove_namespaces = False
        Spider.__init__(self, *arg, **argdict)
        self.currentNode = None
Example #6
    def test_store_load(self):
        jobdir = self.mktemp()
        os.mkdir(jobdir)
        spider = Spider(name="default")
        dt = datetime.now()

        ss = SpiderState(jobdir)
        ss.spider_opened(spider)
        spider.state["one"] = 1
        spider.state["dt"] = dt
        ss.spider_closed(spider)

        spider2 = Spider(name="default")
        ss2 = SpiderState(jobdir)
        ss2.spider_opened(spider2)
        self.assertEqual(spider2.state, {"one": 1, "dt": dt})
        ss2.spider_closed(spider2)
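
The test above exercises scrapy.extensions.spiderstate.SpiderState, which pickles spider.state into the job directory on spider_closed and restores it on spider_opened. A minimal standalone sketch, assuming scrapy is installed:

import tempfile
from scrapy import Spider
from scrapy.extensions.spiderstate import SpiderState

jobdir = tempfile.mkdtemp()
state = SpiderState(jobdir)
spider = Spider(name='default')
state.spider_opened(spider)      # restores state from a previous run, if any
spider.state['pages_seen'] = 42
state.spider_closed(spider)      # persists spider.state to disk
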
Example #8
    def __init__(self):
        Spider.__init__(self)
        # Query yesterday's data; timedelta avoids an invalid "day 0"
        # on the first day of a month.
        today = datetime.now() - timedelta(days=1)

        def date_trans(date_):
            # Convert a Gregorian YYYY/MM/DD date into ROC (民國) calendar form.
            year = int(date_.split('/')[0]) - 1911
            return '/'.join([str(year)] + date_.split('/')[1:])

        self.temp = get_viewstate()
        self.formdata = {
            "ctl00$ScriptManager_Master":
            "ctl00$contentPlaceHolder$updatePanelMain|ctl00$contentPlaceHolder$btnQuery",
            "ctl00$ucLogin$txtMemberID": "",
            "ctl00$ucLogin$txtPassword": "",
            "ctl00$ucLogin$txtValCode": "",
            "ctl00$contentPlaceHolder$ucSolarLunar$radlSolarLunar": "S",
            "ctl00$contentPlaceHolder$txtSTransDate":
            date_trans(today.strftime('%Y/%m/%d')),
            "ctl00$contentPlaceHolder$txtETransDate":
            date_trans(today.strftime('%Y/%m/%d')),
            "ctl00$contentPlaceHolder$txtMarket": "全部市場",
            "ctl00$contentPlaceHolder$hfldMarketNo": "ALL",
            "ctl00$contentPlaceHolder$txtProduct": "全部產品",
            "ctl00$contentPlaceHolder$hfldProductNo": "ALL",
            "ctl00$contentPlaceHolder$hfldProductType": "A",
            "__EVENTTARGET": "",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": self.temp[0],
            "__EVENTVALIDATION": self.temp[1],
            "__ASYNCPOST": "true",
            "ctl00$contentPlaceHolder$btnQuery": "查詢"}
        self.headers = {'Referer': self.start_urls[0],
                        'Accept': 'application/json, text/javascript, */*',
                        'Content-Type':
                        'application/x-www-form-urlencoded; charset=UTF-8',
                        'X-Requested-With': 'XMLHttpRequest',
                        'User-Agent':
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
                        }
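
Form data and headers prepared like this are typically posted with FormRequest; a hedged sketch of how the spider above might issue its query (this method is an assumption, not shown in the source):

    def start_requests(self):
        # FormRequest (from scrapy.http) posts the prepared ASP.NET form,
        # __VIEWSTATE fields included.
        yield FormRequest(self.start_urls[0], formdata=self.formdata,
                          headers=self.headers, callback=self.parse)
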
Example #9
    def __init__(self, *arg, **argdict):
        """ 初始化对象属性 """

        self.rule = ''
        self.titleXpath = ''
        self.descriptionXpath = ''
        self.descriptionLenght = 0
        self.linkXpath = ''
        self.imgUrlXpath = ''
        self.imageNum = 1
        self.videoUrlXpath = ''
        self.pubDateXpath = ''
        self.guidXpath = ''
        self.rule_id = ''
        self.checkTxtXpath = ''
        self.is_remove_namespaces = False
        self.last_md5 = ''
        self.next_request_url = ''
        Spider.__init__(self, *arg, **argdict)
        self.currentNode = None
        self.isDone = False
        self.isFirstListPage = True
Example #10
 def test_connect_request(self):
     request = Request(self.getURL('file'), method='CONNECT')
     d = self.download_request(request, Spider('foo'))
     d.addCallback(lambda r: r.body)
     d.addCallback(self.assertEqual, b'')
     return d
Example #11
 def setUp(self):
     self.spider = Spider('myspider', start_urls=["http://example.com"])
     self.response = HtmlResponse(body=b"<html></html>",
                                  url="http://www.example.com")
Example #12
 def setUp(self):
     self.spider = Spider('foo')
     self.settings = {'CRAWLERA_APIKEY': 'apikey'}
Example #13
 def setUp(self):
     self.spider = Spider('media.com')
     self.pipe = self.pipeline_class(download_func=_mocked_download_func,
                                     settings=Settings(self.settings))
     self.pipe.open_spider(self.spider)
     self.info = self.pipe.spiderinfo
Example #14
 def setUp(self):
     self.spider = Spider('foo')
     self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
     self.req = Request('http://scrapytest.org')
     self.res200, self.res404, self.res402 = _responses(
         self.req, [200, 404, 402])
Example #15
 def test_download(self):
     request = Request(self.getURL('file'))
     d = self.download_request(request, Spider('foo'))
     d.addCallback(lambda r: r.body)
     d.addCallback(self.assertEqual, b"0123456789")
     return d
Example #16
 def __init__(self, *args, **kwargs):
     Spider.__init__(self, *args, **kwargs)
     self.redis_conn = None
Example #17
 def setUp(self):
     self.download_handler = DataURIDownloadHandler(Settings())
     self.download_request = self.download_handler.download_request
     self.spider = Spider('foo')
Example #18
 def setUp(self):
     self.spider = Spider('df_tests')
     self.temp_dir = tempfile.gettempdir()
     self.db_path = os.path.join(self.temp_dir, 'df_tests.db')
     crawler = get_crawler(Spider)
     self.stats = StatsCollector(crawler)
Example #19
 def create_scheduler(self):
     self.mock_crawler = MockCrawler(self.priority_queue_cls, self.jobdir)
     self.scheduler = Scheduler.from_crawler(self.mock_crawler)
     self.spider = Spider(name='spider')
     self.scheduler.open(self.spider)
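
A hedged follow-on to the create_scheduler fixture above: once opened, the scheduler accepts requests via enqueue_request and hands them back via next_request (the test method itself is hypothetical; Request is assumed imported from scrapy.http):

 def test_enqueue_dequeue(self):
     self.create_scheduler()
     self.scheduler.enqueue_request(Request('http://example.com'))
     assert self.scheduler.has_pending_requests()
     assert self.scheduler.next_request().url == 'http://example.com'
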
Example #20
 def test_download_head(self):
     request = Request(self.getURL('file'), method='HEAD')
     d = self.download_request(request, Spider('foo'))
     d.addCallback(lambda r: r.body)
     d.addCallback(self.assertEqual, b'')
     return d
Example #21
 def setUp(self):
     s3reqh = S3DownloadHandler(
         Settings(), self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY,
         httpdownloadhandler=HttpDownloadHandlerMock)
     self.download_request = s3reqh.download_request
     self.spider = Spider('foo')
Example #22
 def setUp(self):
     self.spider = Spider('foo')
     self.mw = RefererMiddleware()
Example #23
 def setUp(self):
     self.spider = Spider('foo')
     self.settings = {'CRAWLERA_USER': '******', 'CRAWLERA_PASS': '******'}
Example #24
 def __init__(self, name=None, **kwargs):
     Spider.__init__(self, name, **kwargs)
Example #25
def setup_module(module):
    global spider
    spider = Spider('spidr')
Example #26
 def test_download_chunked_content(self):
     request = Request(self.getURL('chunked'))
     d = self.download_request(request, Spider('foo'))
     d.addCallback(lambda r: r.body)
     d.addCallback(self.assertEqual, b"chunked content\n")
     return d
Example #27
 def setUp(self):
     self.mw = DecompressionMiddleware()
     self.spider = Spider('foo')
Example #28
 def parse(self, response):
     Spider.parse(self, response)
Example #29
 def _get_spider(self):
     return Spider('foo')
Example #30
 def setUp(self):
     crawler = get_crawler(Spider)
     self.spider = Spider.from_crawler(crawler, name='foo')
     self.mw = HttpErrorMiddleware(Settings({}))
     self.req = Request('http://scrapytest.org')
     self.res200, self.res404 = _responses(self.req, [200, 404])
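
For context, HttpErrorMiddleware.process_spider_input raises HttpError for responses outside the 2xx range unless they are explicitly allowed; a minimal sketch, assuming scrapy is installed:

from scrapy import Spider
from scrapy.http import Request, Response
from scrapy.settings import Settings
from scrapy.spidermiddlewares.httperror import HttpError, HttpErrorMiddleware

mw = HttpErrorMiddleware(Settings({}))
spider = Spider('foo')
req = Request('http://scrapytest.org')
res404 = Response('http://scrapytest.org', status=404, request=req)
try:
    mw.process_spider_input(res404, spider)
except HttpError:
    pass  # non-2xx responses are dropped by default
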
Example #31
 def setUp(self):
     crawler = get_crawler()
     self.download_handler = create_instance(DataURIDownloadHandler,
                                             crawler.settings, crawler)
     self.download_request = self.download_handler.download_request
     self.spider = Spider('foo')
Example #32
 def setUp(self):
     self.formatter = LogFormatter()
     self.spider = Spider('default')
Example #33
 def __init__(self):
     Spider.__init__(self)
     self.browser = webdriver.Chrome(
         '/Users/liulizhe/Desktop/python_file/chromedriver')
Example #34
 def __init__(self, *args, **kwargs):
     Spider.__init__(self)
     SpiderBase.__init__(self, *args, **kwargs)
Example #35
 def test_redirect_status_head(self):
     request = Request(self.getURL('redirect'), method='HEAD')
     d = self.download_request(request, Spider('foo'))
     d.addCallback(lambda r: r.status)
     d.addCallback(self.assertEqual, 302)
     return d
Example #36
from contextlib import contextmanager
from copy import deepcopy
from functools import partial
from typing import Dict
from urllib.parse import urlparse

from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.spiders import Spider
from twisted.trial.unittest import TestCase

from scrapy_proxy_management.downloadermiddlewares.httpproxy import \
    HttpProxyMiddleware

_spider = Spider('foo')


@contextmanager
def _open_spider(
        spider: Spider,
        settings: Settings,
        auth_encoding: str = 'latin-1'
):
    crawler = Crawler(spider, settings)
    middleware = HttpProxyMiddleware(
        crawler=crawler, auth_encoding=auth_encoding
    )

    middleware.open_spider(spider)
    # The @contextmanager generator hands the configured middleware to the caller.
    yield middleware
Example #37
 def test_download_with_maxsize_per_req(self):
     meta = {'download_maxsize': 2}
     request = Request(self.getURL('file'), meta=meta)
     d = self.download_request(request, Spider('foo'))
     yield self.assertFailure(d, defer.CancelledError,
                              error.ConnectionAborted)
Example #38
    def __init__(self, *args, **kwargs):
        Spider.__init__(self)

        self._set_config(**kwargs)
Example #39
 def test_download_with_small_maxsize_per_spider(self):
     request = Request(self.getURL('file'))
     d = self.download_request(request, Spider('foo', download_maxsize=2))
     yield self.assertFailure(d, defer.CancelledError,
                              error.ConnectionAborted)
Example #40
 def setUp(self):
     crawler = get_crawler(Spider)
     self.spider = Spider.from_crawler(crawler, name='foo')
     self.mw = HttpErrorMiddleware(Settings({}))
     self.req = Request('http://scrapytest.org')
     self.res200, self.res404 = _responses(self.req, [200, 404])
Example #41
 def test_download_with_large_maxsize_per_spider(self):
     request = Request(self.getURL('file'))
     d = self.download_request(request, Spider('foo', download_maxsize=100))
     d.addCallback(lambda r: r.body)
     d.addCallback(self.assertEqual, b"0123456789")
     return d
Example #42
 def test_non_existent(self):
     request = Request('file://%s' % self.mktemp())
     d = self.download_request(request, Spider('foo'))
     return self.assertFailure(d, IOError)
Example #43
 def __del__(self):
     self.selenium.close()
     print(self.verificationErrors)
     Spider.__del__(self)