def get_defaults_spider_mw(self):
     crawler = get_crawler()
     spider = BaseSpider('foo')
     spider.set_crawler(crawler)
     defaults = dict([(k, [v]) for k, v in \
         crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
     return defaults, spider, DefaultHeadersMiddleware()
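
The helper above wires a fresh spider to a crawler and snapshots DEFAULT_REQUEST_HEADERS as {header: [value]}. A minimal sketch of how a test might consume the returned tuple, assuming the standard downloader-middleware interface process_request(request, spider); the test name is illustrative, not from the source:

    def test_process_request(self):
        defaults, spider, mw = self.get_defaults_spider_mw()
        req = Request('http://www.scrapytest.org')
        # DefaultHeadersMiddleware should fill in every header the request lacks
        mw.process_request(req, spider)
        self.assertEquals(req.headers, defaults)
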
Example #2
    def __init__(self):
        BaseSpider.__init__(self)

        # settings
        settings.overrides["DOWNLOAD_DELAY"] = 0
        settings.overrides["LOG_FILE"] = "scrapy.log"
        settings.overrides["LOG_STDOUT"] = True
        settings.overrides["DOWNLOAD_TIMEOUT"] = 180
        settings.overrides["RETRY_TIMES"] = 10

        # base url of all the pages
        self.base_url = "http://www.365zn.com/fyc/"

        # regex objects

        # example: <a href="fyc_h.htm"
        self.reobj_word_list_page = re.compile(r"fyc_\w+.htm")

        # example: <a href=htm/11474.htm title='把持'>
        self.reobj_word_and_page = re.compile(r"href=\S+\s+title='[^']+'")

        # 【同义词】 <font color=blue>胸有成竹&nbsp;&nbsp;心中有数&nbsp;&nbsp;稳操胜券</font>
        self.reobj_synonym = re.compile(r"【同义词】\W+<font color=blue>([^<]*)</font>")

        # 【反义词】 <font color=red>心中无数&nbsp;&nbsp;手忙脚乱</font>
        self.reobj_antonym = re.compile(r"【反义词】\W+<font color=red>([^<]*)</font>")

        # chinese character(s)
        #        self.reobj_chinese = re.compile(r"[\u4e00-\u9fa5]+")
        self.reobj_chinese = re.compile(r"[\x80-\xff]+")
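
A quick sketch of how the synonym regex above pulls the word list out of a fetched page chunk; the sample HTML line is illustrative:

        body = "【同义词】 <font color=blue>胸有成竹&nbsp;&nbsp;心中有数&nbsp;&nbsp;稳操胜券</font>"
        match = self.reobj_synonym.search(body)
        if match:
            synonyms = match.group(1).split("&nbsp;&nbsp;")
            # -> ['胸有成竹', '心中有数', '稳操胜券']
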
def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/': self.outDir += '/'
        startYear = int(kwargs['startYear'])
        endYear = int(kwargs['endYear'])
        assert startYear <= endYear
    except:
        print >>sys.stderr, "eventSpider needs 3 arguments: outDir, startYear, endYear"
        exit(1)
    startingAdd = "https://en.wikipedia.org/wiki/"
    self.start_urls = []
    if startYear < -500:
        for i in range(startYear, min(-499, endYear), 10):
            add = startingAdd + str(-i) + "_BC"
            self.start_urls.append(add)
            path = self.outDir + str(-i) + "_BC/"
            if not os.path.exists(path): os.makedirs(path)
        if endYear > -500: startYear = -499
    if startYear > -500 and startYear < 0:
        for i in range(max(startYear, -499), min(0, endYear), 1):
            add = startingAdd + str(-i) + "_BC"
            self.start_urls.append(add)
            path = self.outDir + str(-i) + "_BC/"
            if not os.path.exists(path): os.makedirs(path)
        if endYear > 0: startYear = 1
    if startYear > 0:
        for i in range(startYear, endYear + 1):
            add = startingAdd + str(i)
            self.start_urls.append(add)
            path = self.outDir + str(i) + "/"
            if not os.path.exists(path): os.makedirs(path)
 def __init__(self, **kwargs):
     BaseSpider.__init__(self)
     try:
         self.outDir = kwargs['outDir']
         if self.outDir[-1] != '/': self.outDir += '/'
         startYear = int(kwargs['startYear'])
         endYear = int(kwargs['endYear'])
         assert startYear <= endYear
     except:
         print >> sys.stderr, "eventSpider needs 3 arguments: outDir, startYear, endYear"
         exit(1)
     startingAdd = "https://en.wikipedia.org/wiki/"
     self.start_urls = []
     if startYear < -500:
         for i in range(startYear, min(-499, endYear), 10):
             add = startingAdd + str(-i) + "_BC"
             self.start_urls.append(add)
             path = self.outDir + str(-i) + "_BC/"
             if not os.path.exists(path): os.makedirs(path)
         if endYear > -500: startYear = -499
     if startYear > -500 and startYear < 0:
         for i in range(max(startYear, -499), min(0, endYear), 1):
             add = startingAdd + str(-i) + "_BC"
             self.start_urls.append(add)
             path = self.outDir + str(-i) + "_BC/"
             if not os.path.exists(path): os.makedirs(path)
         if endYear > 0: startYear = 1
     if startYear > 0:
         for i in range(startYear, endYear + 1):
             add = startingAdd + str(i)
             self.start_urls.append(add)
             path = self.outDir + str(i) + "/"
             if not os.path.exists(path): os.makedirs(path)
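
The three branches above encode a single naming convention: years before 1 CE map to "<year>_BC" page titles and output folders, later years use the bare number. A standalone sketch of that mapping; the function name is illustrative:

    def wiki_title_for_year(year):
        # negative years are BC pages, e.g. -44 -> "44_BC"; 1066 -> "1066"
        if year < 0:
            return str(-year) + "_BC"
        return str(year)

    # "https://en.wikipedia.org/wiki/" + wiki_title_for_year(-44)
    # -> https://en.wikipedia.org/wiki/44_BC
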
    def test_rules_manager_callbacks(self):
        mycallback = lambda: True

        spider = BaseSpider('foo')
        spider.parse_item = lambda: True

        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        rulesman = RulesManager([
            Rule('example', mycallback),
            Rule('othersite', 'parse_item'),
                ], spider, default_matcher=UrlRegexMatcher)

        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)

        self.failUnlessEqual(rule1.callback, mycallback)
        self.failUnlessEqual(rule2.callback, spider.parse_item)

        # fail unknown callback
        self.assertRaises(AttributeError, RulesManager, [
                            Rule(BaseMatcher(), 'mycallback')
                            ], spider)
        # fail not callable
        spider.not_callable = True
        self.assertRaises(AttributeError, RulesManager, [
                            Rule(BaseMatcher(), 'not_callable')
                            ], spider)
 def get_defaults_spider_mw(self):
     crawler = get_crawler()
     spider = BaseSpider('foo')
     spider.set_crawler(crawler)
     defaults = dict([(k, [v]) for k, v in \
         crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
     return defaults, spider, DefaultHeadersMiddleware()
def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/': self.outDir += '/'
        self.endYear = int(kwargs['endYear'])
    except:
        print >>sys.stderr, "eventSpider needs 2 arguments: outDir, endYear"
        exit(1)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.start_urls = []
#    self.start_urls = [startingAdd+"2011"]
#    if not os.path.exists(self.outDir+"2011"):   os.makedirs(self.outDir+"2011")
    for i in range(1500, 499, -10):
        add = startingAdd + str(i) + "_BC"
        self.start_urls.append(add)
        path = self.outDir + str(i) + "_BC/"
        if not os.path.exists(path): os.makedirs(path)
    for i in range(499, 0, -1):
        add = startingAdd + str(i) + "_BC"
        self.start_urls.append(add)
        path = self.outDir + str(i) + "_BC/"
        if not os.path.exists(path): os.makedirs(path)
    for i in range(1, self.endYear + 1):
        add = startingAdd + str(i)
        self.start_urls.append(add)
        path = self.outDir + str(i) + "/"
        if not os.path.exists(path): os.makedirs(path)
Example #8
    def test_scheduler_persistent(self):
        messages = []
        spider = BaseSpider('myspider')
        spider.log = lambda *args, **kwargs: messages.append([args, kwargs])

        self.scheduler.persist = True
        self.scheduler.open(spider)

        self.assertEqual(messages, [])

        self.scheduler.enqueue_request(Request('http://example.com/page1'))
        self.scheduler.enqueue_request(Request('http://example.com/page2'))

        self.assertTrue(self.scheduler.has_pending_requests())
        self.scheduler.close('finish')

        self.scheduler.open(spider)
        self.assertEqual(messages, [
            [('Resuming crawl (2 requests scheduled)', ), {}],
        ])
        self.assertEqual(len(self.scheduler), 2)

        self.scheduler.persist = False
        self.scheduler.close('finish')

        self.assertEqual(len(self.scheduler), 0)
    def __init__(self):
        BaseSpider.__init__(self)
        
        self.handle_httpstatus_list = range(0,1000)
        self.requestCount = 0
        
        print 'Opening Alexa URL CSV, please wait.'
        maxSites = 200000
        selectionInterval = 5   #Include every nth site
        skipSites = 861010      #Skip the first n sites
        
        csv_file = open('top-1m.csv','r') 
        alexaReader = csv.reader(csv_file)
        
        rank=1
        queuedCount = 0
        for line in alexaReader :
            domain = line[1]
            if (rank % selectionInterval) == 0 and rank > skipSites:
                self.allowed_domains.append( domain )
                self.start_urls.append(domain)
                queuedCount = queuedCount + 1
                if (queuedCount >= maxSites) :
                    break

            rank += 1
        
        csv_file.close()
        print 'Done opening URLs, starting crawler....'
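
The loop above keeps every selectionInterval-th ranked domain after skipping the first skipSites rows, stopping at maxSites. The same filter as a standalone generator; the names are illustrative:

    def select_domains(rows, interval=5, skip=861010, limit=200000):
        # rows are [rank, domain] CSV records, already ordered by rank
        picked = 0
        for rank, row in enumerate(rows, start=1):
            if rank > skip and rank % interval == 0:
                yield row[1]
                picked += 1
                if picked >= limit:
                    break
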
Example #10
    def __init__(self, **kwargs):
        BaseSpider.__init__(self)
        self.driver = webdriver.Remote(
            command_executor='http://127.0.0.1:4444/wd/hub',
            desired_capabilities=DesiredCapabilities.FIREFOX)
        url = self.default
        if 'from' in kwargs:
            self.start_urls[0] += kwargs['from']
            if 'to' in kwargs:
                self.start_urls[0] += '/' + kwargs['to']
                if 'date' in kwargs:
                    dat = kwargs['date']
                    if re.match('\d{6}', dat):
                        self.monthly = False
                        self.start_urls[0] += '/' + dat
                    elif re.match('\d{4}', dat):
                        self.monthly = True
                        self.start_urls[0] += ('/blah.html?oym=' + dat +
                                               '&charttype=1')
                    else:
                        self.this_month()
                else:
                    self.this_month()
                if self.monthly and 'rtn' in kwargs:
                    self.rtn = kwargs['rtn']
                    self.start_urls[0] += '&rtn=' + self.rtn
                else:
                    self.rtn = '0'
                url = self.start_urls[0]

        self.driver.get(url)
Example #11
    def test_scheduler_persistent(self):
        messages = []
        spider = BaseSpider('myspider')
        spider.log = lambda *args, **kwargs: messages.append([args, kwargs])

        self.scheduler.persist = True
        self.scheduler.open(spider)

        self.assertEqual(messages, [])

        self.scheduler.enqueue_request(Request('http://example.com/page1'))
        self.scheduler.enqueue_request(Request('http://example.com/page2'))

        self.assertTrue(self.scheduler.has_pending_requests())
        self.scheduler.close('finish')

        self.scheduler.open(spider)
        self.assertEqual(messages, [
            [('Resuming crawl (2 requests scheduled)',), {}],
        ])
        self.assertEqual(len(self.scheduler), 2)

        self.scheduler.persist = False
        self.scheduler.close('finish')

        self.assertEqual(len(self.scheduler), 0)
    def __init__(self):
        BaseSpider.__init__(self)

        # settings 
        settings.overrides['DOWNLOAD_DELAY'] = 0.1

        # regex object for extracting image url
        self.reobj_image = re.compile(r"http://\S+.gstatic.com[^\"\s]+")

        self.num_images_per_page = 20
        self.num_images = 200

        # initialize word searching url list
        self.base_url = "http://images.google.com/search?tbm=isch&safe=off"

        f_word_dict = file(r'SogouLabDic_tab_utf8_linux.dic')
#        f_word_dict = file(r'test_dict') 
        word_lines = f_word_dict.readlines()

        print "initialize image searching urls"
        for word_line in word_lines:
            word = word_line[ : word_line.index("\t")]

            start = 0 
            while start < self.num_images:
                self.start_urls.append( self.base_url + 
                                        "&q=" + word + 
                                        "&start=" + str(start)
                                      )
                start += self.num_images_per_page
        print "created " + str( len(self.start_urls) ) + " image searching urls."
Example #13
    def test_rules_manager_callbacks(self):
        mycallback = lambda: True

        spider = BaseSpider('foo')
        spider.parse_item = lambda: True

        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        rulesman = RulesManager([
            Rule('example', mycallback),
            Rule('othersite', 'parse_item'),
        ],
                                spider,
                                default_matcher=UrlRegexMatcher)

        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)

        self.failUnlessEqual(rule1.callback, mycallback)
        self.failUnlessEqual(rule2.callback, spider.parse_item)

        # fail unknown callback
        self.assertRaises(AttributeError, RulesManager,
                          [Rule(BaseMatcher(), 'mycallback')], spider)
        # fail not callable
        spider.not_callable = True
        self.assertRaises(AttributeError, RulesManager,
                          [Rule(BaseMatcher(), 'not_callable')], spider)
Example #14
    def test_url_is_from_spider_with_allowed_domains(self):
        spider = BaseSpider(name='example.com',
                            allowed_domains=['example.org', 'example.net'])
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               spider))
        self.assertTrue(
            url_is_from_spider('http://sub.example.com/some/page.html',
                               spider))
        self.assertTrue(
            url_is_from_spider('http://example.com/some/page.html', spider))
        self.assertTrue(
            url_is_from_spider('http://www.example.org/some/page.html',
                               spider))
        self.assertTrue(
            url_is_from_spider('http://www.example.net/some/page.html',
                               spider))
        self.assertFalse(
            url_is_from_spider('http://www.example.us/some/page.html', spider))

        spider = BaseSpider(name='example.com',
                            allowed_domains=set(
                                ('example.com', 'example.net')))
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               spider))

        spider = BaseSpider(name='example.com',
                            allowed_domains=('example.com', 'example.net'))
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               spider))
	def __init__(self, *arg1, **arg2):
		log.msg(message="man_spider, __init__", _level = log.INFO)
		BaseSpider.__init__(self, *arg1, **arg2)
		self.man_spider_callback = {}
		self.man_spider_callback['list'] = self.callback_list
		self.man_spider_callback['parse'] = self.callback_parse
		self.man_spider_callback['all'] = self.callback_all
Example #16
 def _assert_stores(self, storage, path):
     yield storage.store(StringIO("content"), BaseSpider("default"))
     self.failUnless(os.path.exists(path))
     self.failUnlessEqual(open(path).read(), "content")
     # again, to check files are overwritten properly
     yield storage.store(StringIO("new content"), BaseSpider("default"))
     self.failUnlessEqual(open(path).read(), "new content")
Example #17
    def __init__(self):
        BaseSpider.__init__(self)
        # starting virtual display
        # comment this line if you are using desktop
        display.start()

        # estabilishing browser
        self.browser = webdriver.Firefox() 
 def setUp(self):
     self.crawler = get_crawler(self.settings_dict)
     self.spider = BaseSpider('foo')
     self.spider.set_crawler(self.crawler)
     self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
     # some mw depends on stats collector
     self.crawler.stats.open_spider(self.spider)
     return self.mwman.open_spider(self.spider)
Example #19
 def setUp(self):
     self.spider1 = BaseSpider('name1')
     self.spider2 = BaseSpider('name2')
     open_spiders = set([self.spider1, self.spider2])
     crawler = CrawlerMock(open_spiders)
     self.spref = SpiderReferencer(crawler)
     self.encoder = ScrapyJSONEncoder(spref=self.spref)
     self.decoder = ScrapyJSONDecoder(spref=self.spref)
Example #20
 def __init__(self):
     BaseSpider.__init__(self)
     self.verificationErrors = []
     self.profile = webdriver.FirefoxProfile("C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme.")
     self.browser = webdriver.Firefox(self.profile)
             
     self.duplicatesurl = {}
     dispatcher.connect(self.spider_opened, signals.spider_opened)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
 def __init__(self, **kwargs):
     BaseSpider.__init__(self)
     startingAdd = "https://en.wikipedia.org/wiki/"
     self.inFile = kwargs['infile']
     self.outFile = kwargs['outfile']
     self.start_urls = []
     self.url2locDic = {}
     self.readFile(self.inFile)
     fout = codecs.open(self.outFile, "w", encoding='utf-8')
     fout.close()
Example #22
    def __init__(self, domain_name=None):
        BaseSpider.__init__(self, domain_name)

        consumer_key = config.get('yammer', 'consumer_key')
        consumer_secret = config.get('yammer', 'consumer_secret')
        app_token = config.get('yammer', 'app_token')

        self.consumer = OAuthConsumer(consumer_key, consumer_secret)
        self.signature = OAuthSignatureMethod_PLAINTEXT()
        self.token = OAuthToken.from_string(app_token)
Example #23
    def __init__(self):
        BaseSpider.__init__(self)
        self.verificationErrors = []
        # self.profile = webdriver.FirefoxProfile("C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme")
        # self.browser = webdriver.Firefox(self.profile)
        self.browser = webdriver.Chrome(r'C:\Users\ZERO\AppData\Local\Google\Chrome\Application\chromedriver.exe')

        self.duplicatesurl = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #24
    def __init__(self, domain_name=None):
        BaseSpider.__init__(self, domain_name)

        consumer_key    = config.get('yammer', 'consumer_key')
        consumer_secret = config.get('yammer', 'consumer_secret')
        app_token       = config.get('yammer', 'app_token')

        self.consumer  = OAuthConsumer(consumer_key, consumer_secret)
        self.signature = OAuthSignatureMethod_PLAINTEXT()
        self.token     = OAuthToken.from_string(app_token)
	def __init__(self, **kwargs):
		BaseSpider.__init__(self)
		startingAdd = "http://en.wikipedia.org/wiki/"
		self.inFile=kwargs['infile']
		self.outFile=kwargs['outfile']
		self.start_urls = []
		self.url2locDic = {}
		self.readFile(self.inFile)
		fout = codecs.open(self.outFile,"w", encoding='utf-8')
		fout.close()
 def __init__(self, name=None, **kwargs):
     if not hasattr(self, 'start_urls'):
         self.start_urls = []
         file_list = [i for i in os.listdir(
             self.result_path) if i.endswith('.html')]
         for i in file_list:
             path = os.path.join(self.result_path, i).replace('?', '%3F')
             url = 'file://%s' % (path)
             self.start_urls.append(url)
     BaseSpider.__init__(self, kwargs=kwargs)
     self.item = Commonshop()
 def __init__(self):
    BaseSpider.__init__(self)
    self.verificationErrors = []
    with open(self.contactsDataFile, 'rb') as csvfile:
      csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
      self.log('Initialing with contact urls from file : ' + self.contactsDataFile + ' ...')
      for row in csvreader:
         if row[1].startswith('https') == True:
            self.start_urls.append(row[1])
            
    self.log('Total contacts loaded : %d' % len(self.start_urls))
Example #28
	def __init__(self):
		BaseSpider.__init__(self)	# initialize the BaseSpider with its original method (we are overriding '__init__()')
		self.verificationErrors = []

		# --- Disable opening the browser ---------------------------------------------
		# Only works on Linux, because of the graphical dependencies...
		# self.display = Display(visible=0,backend ='xvnb', size=(800, 600))
		# self.display = Display(visible=0, size=(800, 600))
		# self.display.start()
		# ----------------------------------------------------------------------------
		self.driver = webdriver.Firefox(self.disableImages()) # load the webdriver with the profile created by the 'disableImages()' function
    def test_host_header_seted_in_request_headers(self):
        def _test(response):
            self.assertEquals(response.body, 'example.com')
            self.assertEquals(request.headers.get('Host'), 'example.com')

        request = Request(self.getURL('host'), headers={'Host': 'example.com'})
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)

	def __init__(self, **kwargs):
		BaseSpider.__init__(self)
		startingAdd = "http://en.wikipedia.org/wiki/"
		self.inFile=kwargs['infile']
		self.outFileLoc=kwargs['outfileLoc']
		self.outFilePer=kwargs['outfilePer']
		self.start_urls = []
		self.url2locDic = {}
		self.url2urlDic = {}
		self.readFile(self.inFile)
		fout = open(self.outFileLoc, "w")
		fout.close()
		fout = open(self.outFilePer, "w")
		fout.close()
Example #31
    def __init__(self):
        BaseSpider.__init__(self)
        # use any browser you wish
        display.start()

        profile = webdriver.FirefoxProfile()
        # setup configuration for browser driver to download file properly
        profile.set_preference("browser.download.folderList", 2)
        profile.set_preference("browser.download.manager.showWhenStarting", False)
        profile.set_preference("browser.download.dir", ROOT_DIR)
        profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/csv")

        self.browser = webdriver.Firefox(firefox_profile=profile) 
Example #32
 def test_store(self):
     out = StringIO()
     storage = StdoutFeedStorage('stdout:', _stdout=out)
     file = storage.open(BaseSpider("default"))
     file.write("content")
     yield storage.store(file)
     self.assertEqual(out.getvalue(), "content")
Example #33
 def test_payload(self):
     body = '1' * 100  # PayloadResource requires body length to be 100
     request = Request(self.getURL('payload'), method='POST', body=body)
     d = self.download_request(request, BaseSpider('foo'))
     d.addCallback(lambda r: r.body)
     d.addCallback(self.assertEquals, body)
     return d
    def __init__(self, username=None):
        BaseSpider.__init__(self)
        self.username, self.pwd = username.split(":")
        self.db = MySQLdb.connect(
            host="localhost", port=3306, user="******", passwd="pw", db="weibosearch2", charset="utf8", use_unicode=True
        )
        self.cursor = self.db.cursor()
        self.logined = False

        log.msg("login with %s" % self.username, level=log.INFO)
        login_url = self.weibo.login(self.username, self.pwd)
        if login_url:
            log.msg("login successful, start crawling.", level=log.INFO)
            self.start_urls.append(login_url)
        else:
            log.msg("login failed", level=log.ERROR)
    def test_host_header_not_in_request_headers(self):
        def _test(response):
            self.assertEquals(response.body, '127.0.0.1:%d' % self.portno)
            self.assertEquals(request.headers, {})

        request = Request(self.getURL('host'))
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
Example #36
    def test_rules_manager_callback_with_arguments(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')

        kwargs = {'a': 1}

        def myfunc(**mykwargs):
            return mykwargs

        # verify return validation
        self.failUnlessEquals(kwargs, myfunc(**kwargs))

        # test callback w/o arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc),
        ], spider)
        rule = rulesman.get_rule_from_response(response)

        # without arguments should return same callback
        self.failUnlessEqual(rule.callback, myfunc)

        # test callback w/ arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc, **kwargs),
        ], spider)
        rule = rulesman.get_rule_from_response(response)

        # with argument should return partial applied callback
        self.failUnless(isinstance(rule.callback, partial))
        self.failUnlessEquals(kwargs, rule.callback())
Example #37
    def setUp(self):
        self.spider = BaseSpider('scrapytest.org')

        self.stats = StatsCollector()
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats, True)
Example #38
 def _assert_stores(self, storage, path):
     spider = BaseSpider("default")
     file = storage.open(spider)
     file.write("content")
     yield storage.store(file)
     self.failUnless(os.path.exists(path))
     self.failUnlessEqual(open(path).read(), "content")
    def test_store_load(self):
        jobdir = self.mktemp()
        os.mkdir(jobdir)
        spider = BaseSpider(name='default')
        dt = datetime.now()

        ss = SpiderState(jobdir)
        ss.spider_opened(spider)
        spider.state['one'] = 1
        spider.state['dt'] = dt
        ss.spider_closed(spider)

        spider2 = BaseSpider(name='default')
        ss2 = SpiderState(jobdir)
        ss2.spider_opened(spider2)
        self.assertEqual(spider2.state, {'one': 1, 'dt': dt})
        ss2.spider_closed(spider2)
    def test_download_without_proxy(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, '/path/to/resource')

        request = Request(self.getURL('path/to/resource'))
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
    def setUp(self):
        self.spider = BaseSpider('scrapytest.org')
        self.mw = DownloaderStats()

        stats.open_spider(self.spider)

        self.req = Request('http://scrapytest.org')
        self.res = Response('scrapytest.org', status=400)
 def setUp(self):
     self.crawler = get_crawler(self.settings_dict)
     self.spider = BaseSpider('foo')
     self.spider.set_crawler(self.crawler)
     self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
     # some mw depends on stats collector
     self.crawler.stats.open_spider(self.spider)
     return self.mwman.open_spider(self.spider)
    def test_store_load(self):
        jobdir = self.mktemp()
        os.mkdir(jobdir)
        spider = BaseSpider(name='default')
        dt = datetime.now()

        ss = SpiderState(jobdir)
        ss.spider_opened(spider)
        spider.state['one'] = 1
        spider.state['dt'] = dt
        ss.spider_closed(spider)

        spider2 = BaseSpider(name='default')
        ss2 = SpiderState(jobdir)
        ss2.spider_opened(spider2)
        self.assertEqual(spider2.state, {'one': 1, 'dt': dt})
        ss2.spider_closed(spider2)
Example #44
    def setUp(self):
        self.spider = BaseSpider('scrapytest.org')

        self.stats = StatsCollector()
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats)
        self.assertEquals(self.stats.get_value('envinfo/request_depth_limit'), 1)
    def test_filter(self):
        spider = BaseSpider('foo')
        filter = NullDupeFilter()
        filter.open_spider(spider)

        r1 = Request('http://scrapytest.org/1')
        assert not filter.request_seen(spider, r1)
        filter.close_spider(spider)
 def test_state_attribute(self):
     # state attribute must be present if jobdir is not set, to provide a
     # consistent interface
     spider = BaseSpider(name='default')
     ss = SpiderState()
     ss.spider_opened(spider)
     self.assertEqual(spider.state, {})
     ss.spider_closed(spider)
    def test_download_with_proxy(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, 'https://example.com')

        http_proxy = self.getURL('')
        request = Request('https://example.com', meta={'proxy': http_proxy})
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
    def test_download(self):
        def _test(response):
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.status, 200)
            self.assertEquals(response.body, '0123456789')

        request = Request(path_to_file_uri(self.tmpname + '^'))
        assert request.url.upper().endswith('%5E')
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = HttpErrorMiddleware()
        self.req = Request('http://scrapytest.org')

        self.res200 = Response('http://scrapytest.org', status=200)
        self.res200.request = self.req
        self.res404 = Response('http://scrapytest.org', status=404)
        self.res404.request = self.req
Example #50
 def _schedule(self, request, spider):
     if spider is None:
         spider = create_spider_for_request(self.crawler.spiders, request, \
             BaseSpider('default'), log_multiple=True)
     spider.set_crawler(self.crawler)
     self.crawler.engine.open_spider(spider)
     d = self.crawler.engine.schedule(request, spider)
     d.addCallback(lambda x: (x, spider))
     return d
  def __init__(self, username=None):
    BaseSpider.__init__(self)
    self.username, self.pwd = username.split(':')
    self.db = MySQLdb.connect(host="localhost", port=3306, user="******", passwd="pw", db="weibosearch2",
      charset='utf8', use_unicode=True)
    self.cursor = self.db.cursor()
    self.logined = False

    host = settings.get('REDIS_HOST', REDIS_HOST)
    port = settings.get('REDIS_PORT', REDIS_PORT)

    log.msg('login with %s' % self.username, level=log.INFO)
    login_url = self.weibo.login(self.username, self.pwd)
    if login_url:
      log.msg('login successful, start crawling.', level=log.INFO)
      self.start_urls.append(login_url)
    else:
      log.msg('login failed', level=log.ERROR)
Example #52
    def __init__(self):
        BaseSpider.__init__(self)

        # settings 
        settings.overrides['DOWNLOAD_DELAY'] = 0
        settings.overrides['LOG_FILE'] = "scrapy.log"
        settings.overrides['LOG_STDOUT'] = True
        settings.overrides['DOWNLOAD_TIMEOUT'] = 180
        settings.overrides['RETRY_TIMES'] = 10

        self.num_images_per_page = 20
        self.num_images = 60

        # base url for image searching
        self.base_url = "http://images.google.com/search?tbm=isch&safe=off"

        # regex object for extracting image url
        self.reobj_image = re.compile(r"http://\S+.gstatic.com[^\"\s]+")

        # initialize start_urls
        self.fill_start_urls()
Example #53
	def __init__(self, **kwargs):
		BaseSpider.__init__(self)
		try:
			self.outFile=kwargs['outfile']
			self.endYear=int(kwargs['endYear'])
		except:
			print >>sys.stderr, "eventSpider needs 2 arguments: outfile, endYear"
			exit(1)
		startingAdd = "http://en.wikipedia.org/wiki/"
		self.start_urls = []
		for i in range(1500, 499, -10):
			add = startingAdd+str(i)+"_BC"
			self.start_urls.append(add)
		for i in range(499, 0, -1):
			add = startingAdd+str(i)+"_BC"
			self.start_urls.append(add)
		for i in range(1, self.endYear+1):
			add = startingAdd+str(i)
			self.start_urls.append(add)
		
		fout = open(self.outFile,"w")
		fout.close()
    def __init__(self, name=None, **kwargs):
        if not hasattr(self, 'start_urls'):
            self.start_urls = []
            skiplist = []
            skip_file = '%s/skip.txt' % self.result_path
            if not os.path.isdir(self.result_path):
                os.makedirs(self.result_path)
            if os.path.isfile(skip_file):
                with open(skip_file, 'r') as fp:
                    for eachline in fp:
                        shopid = eachline.replace('\n', '')
                        if shopid.isdigit():
                            skiplist.append(int(shopid))
            id_range = self.id_range
            if len(id_range) == 2:
                for i in xrange(id_range[0], id_range[1]):
                    if i not in skiplist:
                        url = '%s%s' % (self.start_url, i)
                        self.start_urls.append(url)
            elif len(id_range) == 3:
                for i in xrange(id_range[0], id_range[1], id_range[2]):
                    if i not in skiplist:
                        url = '%s%s' % (self.start_url, i)
                        self.start_urls.append(url)
            else:
                for i in id_range:
                    if i not in skiplist:
                        url = '%s%s' % (self.start_url, i)
                        self.start_urls.append(url)

        if 1:
            file_list = [i for i in os.listdir(
                self.result_path) if i.endswith('.html')]
            exist_urls = [i.replace('.html', '') for i in file_list]
            self.start_urls = [
                i for i in self.start_urls
                if i.split('/')[-1] not in exist_urls]
        BaseSpider.__init__(self, kwargs=kwargs)
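
The constructor above turns id_range into start_urls, honouring a skip list and later dropping ids whose result page was already saved. The range expansion alone, as a sketch under the same conventions ((start, stop) or (start, stop, step) tuples are range specs, anything else is an explicit id list); the function name is illustrative:

    def expand_ids(id_range, skiplist):
        # mirror the branches above: 2- or 3-element tuples are xrange args
        if len(id_range) in (2, 3):
            ids = xrange(*id_range)
        else:
            ids = id_range
        return [i for i in ids if i not in skiplist]
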
class ManagerTestCase(TestCase):

    settings_dict = None

    def setUp(self):
        self.crawler = get_crawler(self.settings_dict)
        self.spider = BaseSpider('foo')
        self.spider.set_crawler(self.crawler)
        self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
        # some mw depends on stats collector
        self.crawler.stats.open_spider(self.spider)
        return self.mwman.open_spider(self.spider)

    def tearDown(self):
        self.crawler.stats.close_spider(self.spider, '')
        return self.mwman.close_spider(self.spider)

    def _download(self, request, response=None):
        """Executes downloader mw manager's download method and returns
        the result (Request or Response) or raise exception in case of
        failure.
        """
        if not response:
            response = Response(request.url)

        def download_func(**kwargs):
            return response

        dfd = self.mwman.download(download_func, request, self.spider)
        # catch deferred result and return the value
        results = []
        dfd.addBoth(results.append)
        self._wait(dfd)
        ret = results[0]
        if isinstance(ret, Failure):
            ret.raiseException()
        return ret
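
A minimal sketch of a concrete test built on the base class above; the subclass name and assertions are illustrative. _download feeds the request through the full middleware chain and hands back whatever comes out:

    class PassthroughTest(ManagerTestCase):

        def test_stub_response_is_returned(self):
            req = Request('http://example.com/')
            resp = self._download(req)
            # with no middleware rewriting the result, the stubbed Response
            # built inside _download comes straight back
            self.assertEqual(resp.url, req.url)
            self.assertEqual(resp.status, 200)
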
 def __init__(self):
     BaseSpider.__init__(self)
     DeputadoSpider.start_urls = self.get_start_urls()
 def __init__(self):
     BaseSpider.__init__(self)
     self.counter = 1
Example #58
 def __init__(self):
     BaseSpider.__init__(self)