def get_defaults_spider_mw(self):
    crawler = get_crawler()
    spider = BaseSpider('foo')
    spider.set_crawler(crawler)
    defaults = dict([(k, [v]) for k, v in
                     crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
    return defaults, spider, DefaultHeadersMiddleware()

def __init__(self): BaseSpider.__init__(self) # settings settings.overrides["DOWNLOAD_DELAY"] = 0 settings.overrides["LOG_FILE"] = "scrapy.log" settings.overrides["LOG_STDOUT"] = True settings.overrides["DOWNLOAD_TIMEOUT"] = 180 settings.overrides["RETRY_TIMES"] = 10 # base url of all the pages self.base_url = "http://www.365zn.com/fyc/" # regex objects # example: <a href="fyc_h.htm" self.reobj_word_list_page = re.compile(r"fyc_\w+.htm") # example: <a href=htm/11474.htm title='把持'> self.reobj_word_and_page = re.compile(r"href=\S+\s+title='[^']+'") # 【同义词】 <font color=blue>胸有成竹 心中有数 稳操胜券</font> self.reobj_synonym = re.compile(r"【同义词】\W+<font color=blue>([^<]*)</font>") # 【反义词】 <font color=red>心中无数 手忙脚乱</font> self.reobj_antonym = re.compile(r"【反义词】\W+<font color=red>([^<]*)</font>") # chinese character(s) # self.reobj_chinese = re.compile(r"[\u4e00-\u9fa5]+") self.reobj_chinese = re.compile(r"[\x80-\xff]+")
def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/':
            self.outDir += '/'
        startYear = int(kwargs['startYear'])
        endYear = int(kwargs['endYear'])
        assert startYear <= endYear
    except:
        print >> sys.stderr, "eventSpider needs 3 arguments: outDir, startYear, endYear"
        exit(1)
    startingAdd = "https://en.wikipedia.org/wiki/"
    self.start_urls = []
    if startYear < -500:
        for i in range(startYear, min(-499, endYear), 10):
            add = startingAdd + str(-i) + "_BC"
            self.start_urls.append(add)
            path = self.outDir + str(-i) + "_BC/"
            if not os.path.exists(path):
                os.makedirs(path)
        if endYear > -500:
            startYear = -499
    if startYear > -500 and startYear < 0:
        for i in range(max(startYear, -499), min(0, endYear), 1):
            add = startingAdd + str(-i) + "_BC"
            self.start_urls.append(add)
            path = self.outDir + str(-i) + "_BC/"
            if not os.path.exists(path):
                os.makedirs(path)
        if endYear > 0:
            startYear = 1
    if startYear > 0:
        for i in range(startYear, endYear + 1):
            add = startingAdd + str(i)
            self.start_urls.append(add)
            path = self.outDir + str(i) + "/"
            if not os.path.exists(path):
                os.makedirs(path)

def test_rules_manager_callbacks(self):
    mycallback = lambda: True
    spider = BaseSpider('foo')
    spider.parse_item = lambda: True
    response1 = HtmlResponse('http://example.org')
    response2 = HtmlResponse('http://othersite.org')
    rulesman = RulesManager([
        Rule('example', mycallback),
        Rule('othersite', 'parse_item'),
    ], spider, default_matcher=UrlRegexMatcher)
    rule1 = rulesman.get_rule_from_response(response1)
    rule2 = rulesman.get_rule_from_response(response2)
    self.failUnlessEqual(rule1.callback, mycallback)
    self.failUnlessEqual(rule2.callback, spider.parse_item)
    # fail unknown callback
    self.assertRaises(AttributeError, RulesManager,
                      [Rule(BaseMatcher(), 'mycallback')], spider)
    # fail not callable
    spider.not_callable = True
    self.assertRaises(AttributeError, RulesManager,
                      [Rule(BaseMatcher(), 'not_callable')], spider)

def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/':
            self.outDir += '/'
        self.endYear = int(kwargs['endYear'])
    except:
        print >> sys.stderr, "eventSpider needs 3 arguments: outDir, outFile, endYear"
        exit(1)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.start_urls = []
    # self.start_urls = [startingAdd+"2011"]
    # if not os.path.exists(self.outDir+"2011"): os.makedirs(self.outDir+"2011")
    for i in range(1500, 499, -10):
        add = startingAdd + str(i) + "_BC"
        self.start_urls.append(add)
        path = self.outDir + str(i) + "_BC/"
        if not os.path.exists(path):
            os.makedirs(path)
    for i in range(499, 0, -1):
        add = startingAdd + str(i) + "_BC"
        self.start_urls.append(add)
        path = self.outDir + str(i) + "_BC/"
        if not os.path.exists(path):
            os.makedirs(path)
    for i in range(1, self.endYear + 1):
        add = startingAdd + str(i)
        self.start_urls.append(add)
        path = self.outDir + str(i) + "/"
        if not os.path.exists(path):
            os.makedirs(path)

def test_scheduler_persistent(self):
    messages = []
    spider = BaseSpider('myspider')
    spider.log = lambda *args, **kwargs: messages.append([args, kwargs])

    self.scheduler.persist = True
    self.scheduler.open(spider)
    self.assertEqual(messages, [])

    self.scheduler.enqueue_request(Request('http://example.com/page1'))
    self.scheduler.enqueue_request(Request('http://example.com/page2'))
    self.assertTrue(self.scheduler.has_pending_requests())
    self.scheduler.close('finish')

    self.scheduler.open(spider)
    self.assertEqual(messages, [
        [('Resuming crawl (2 requests scheduled)',), {}],
    ])
    self.assertEqual(len(self.scheduler), 2)

    self.scheduler.persist = False
    self.scheduler.close('finish')
    self.assertEqual(len(self.scheduler), 0)

def __init__(self):
    BaseSpider.__init__(self)
    self.handle_httpstatus_list = range(0, 1000)
    self.requestCount = 0
    print 'Opening Alexa URL CSV, please wait.'
    maxSites = 200000
    selectionInterval = 5    # include every nth site
    skipSites = 861010       # skip the first n sites
    csv_file = open('top-1m.csv', 'r')
    alexaReader = csv.reader(csv_file)
    rank = 1
    queuedCount = 0
    for line in alexaReader:
        domain = line[1]
        if (rank % selectionInterval) == 0 and rank > skipSites:
            self.allowed_domains.append(domain)
            self.start_urls.append(domain)
            queuedCount = queuedCount + 1
            if queuedCount >= maxSites:
                break
        rank += 1
    csv_file.close()
    print 'Done opening URLs, starting crawler....'

def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    self.driver = webdriver.Remote(
        command_executor='http://127.0.0.1:4444/wd/hub',
        desired_capabilities=DesiredCapabilities.FIREFOX)
    url = self.default
    if 'from' in kwargs:
        self.start_urls[0] += kwargs['from']
    if 'to' in kwargs:
        self.start_urls[0] += '/' + kwargs['to']
    if 'date' in kwargs:
        dat = kwargs['date']
        if re.match('\d{6}', dat):
            self.monthly = False
            self.start_urls[0] += '/' + dat
        elif re.match('\d{4}', dat):
            self.monthly = True
            self.start_urls[0] += ('/blah.html?oym=' + dat + '&charttype=1')
        else:
            self.this_month()
    else:
        self.this_month()
    if self.monthly and 'rtn' in kwargs:
        self.rtn = kwargs['rtn']
        self.start_urls[0] += '&rtn=' + self.rtn
    else:
        self.rtn = '0'
    url = self.start_urls[0]
    self.driver.get(url)

def __init__(self):
    BaseSpider.__init__(self)
    # settings
    settings.overrides['DOWNLOAD_DELAY'] = 0.1
    # regex object for extracting image url
    self.reobj_image = re.compile(r"http://\S+.gstatic.com[^\"\s]+")
    self.num_images_per_page = 20
    self.num_images = 200
    # initialize word searching url list
    self.base_url = "http://images.google.com/search?tbm=isch&safe=off"
    f_word_dict = file(r'SogouLabDic_tab_utf8_linux.dic')
    # f_word_dict = file(r'test_dict')
    word_lines = f_word_dict.readlines()
    print "initialize image searching urls"
    for word_line in word_lines:
        word = word_line[:word_line.index("\t")]
        start = 0
        while start < self.num_images:
            self.start_urls.append(
                self.base_url + "&q=" + word + "&start=" + str(start))
            start += self.num_images_per_page
    print "created " + str(len(self.start_urls)) + " image searching urls."

def test_url_is_from_spider_with_allowed_domains(self):
    spider = BaseSpider(name='example.com',
                        allowed_domains=['example.org', 'example.net'])
    self.assertTrue(
        url_is_from_spider('http://www.example.com/some/page.html', spider))
    self.assertTrue(
        url_is_from_spider('http://sub.example.com/some/page.html', spider))
    self.assertTrue(
        url_is_from_spider('http://example.com/some/page.html', spider))
    self.assertTrue(
        url_is_from_spider('http://www.example.org/some/page.html', spider))
    self.assertTrue(
        url_is_from_spider('http://www.example.net/some/page.html', spider))
    self.assertFalse(
        url_is_from_spider('http://www.example.us/some/page.html', spider))

    spider = BaseSpider(name='example.com',
                        allowed_domains=set(('example.com', 'example.net')))
    self.assertTrue(
        url_is_from_spider('http://www.example.com/some/page.html', spider))

    spider = BaseSpider(name='example.com',
                        allowed_domains=('example.com', 'example.net'))
    self.assertTrue(
        url_is_from_spider('http://www.example.com/some/page.html', spider))

def __init__(self, *arg1, **arg2):
    log.msg(message="man_spider, __init__", _level=log.INFO)
    BaseSpider.__init__(self, *arg1, **arg2)
    self.man_spider_callback = {}
    self.man_spider_callback['list'] = self.callback_list
    self.man_spider_callback['parse'] = self.callback_parse
    self.man_spider_callback['all'] = self.callback_all

def _assert_stores(self, storage, path): yield storage.store(StringIO("content"), BaseSpider("default")) self.failUnless(os.path.exists(path)) self.failUnlessEqual(open(path).read(), "content") # again, to check files are overwritten properly yield storage.store(StringIO("new content"), BaseSpider("default")) self.failUnlessEqual(open(path).read(), "new content")
def __init__(self):
    BaseSpider.__init__(self)
    # start the virtual display
    # comment this line out if you are using a desktop display
    display.start()
    # establish the browser
    self.browser = webdriver.Firefox()

def setUp(self):
    self.crawler = get_crawler(self.settings_dict)
    self.spider = BaseSpider('foo')
    self.spider.set_crawler(self.crawler)
    self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
    # some mw depends on stats collector
    self.crawler.stats.open_spider(self.spider)
    return self.mwman.open_spider(self.spider)

def setUp(self):
    self.spider1 = BaseSpider('name1')
    self.spider2 = BaseSpider('name2')
    open_spiders = set([self.spider1, self.spider2])
    crawler = CrawlerMock(open_spiders)
    self.spref = SpiderReferencer(crawler)
    self.encoder = ScrapyJSONEncoder(spref=self.spref)
    self.decoder = ScrapyJSONDecoder(spref=self.spref)

def __init__(self): BaseSpider.__init__(self) self.verificationErrors = [] self.profile = webdriver.FirefoxProfile("C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme.") self.browser = webdriver.Firefox(self.profile) self.duplicatesurl = {} dispatcher.connect(self.spider_opened, signals.spider_opened) dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, **kwargs): BaseSpider.__init__(self) startingAdd = "https://en.wikipedia.org/wiki/" self.inFile = kwargs['infile'] self.outFile = kwargs['outfile'] self.start_urls = [] self.url2locDic = {} self.readFile(self.inFile) fout = codecs.open(self.outFile, "w", encoding='utf-8') fout.close()
def __init__(self, domain_name=None):
    BaseSpider.__init__(self, domain_name)
    consumer_key = config.get('yammer', 'consumer_key')
    consumer_secret = config.get('yammer', 'consumer_secret')
    app_token = config.get('yammer', 'app_token')
    self.consumer = OAuthConsumer(consumer_key, consumer_secret)
    self.signature = OAuthSignatureMethod_PLAINTEXT()
    self.token = OAuthToken.from_string(app_token)

def __init__(self):
    BaseSpider.__init__(self)
    self.verificationErrors = []
    # self.profile = webdriver.FirefoxProfile("C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme")
    # self.browser = webdriver.Firefox(self.profile)
    # use a raw string so the backslashes in the Windows path are not treated as escapes
    self.browser = webdriver.Chrome(
        r'C:\Users\ZERO\AppData\Local\Google\Chrome\Application\chromedriver.exe')
    self.duplicatesurl = {}
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def __init__(self, **kwargs): BaseSpider.__init__(self) startingAdd = "http://en.wikipedia.org/wiki/" self.inFile=kwargs['infile'] self.outFile=kwargs['outfile'] self.start_urls = [] self.url2locDic = {} self.readFile(self.inFile) fout = codecs.open(self.outFile,"w", encoding='utf-8') fout.close
def __init__(self, name=None, **kwargs):
    if not hasattr(self, 'start_urls'):
        self.start_urls = []
    file_list = [i for i in os.listdir(self.result_path) if i.endswith('.html')]
    for i in file_list:
        path = os.path.join(self.result_path, i).replace('?', '%3F')
        url = 'file://%s' % (path)
        self.start_urls.append(url)
    BaseSpider.__init__(self, kwargs=kwargs)
    self.item = Commonshop()

def __init__(self):
    BaseSpider.__init__(self)
    self.verificationErrors = []
    with open(self.contactsDataFile, 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        self.log('Initializing with contact urls from file : ' + self.contactsDataFile + ' ...')
        for row in csvreader:
            if row[1].startswith('https'):
                self.start_urls.append(row[1])
    self.log('Total contacts loaded : %d' % len(self.start_urls))

def __init__(self):
    # initialize BaseSpider with the original method (we are overriding '__init__()')
    BaseSpider.__init__(self)
    self.verificationErrors = []
    # --- Disable opening the browser window -------------------------------------
    # Only works on Linux, because of the graphical dependencies...
    # self.display = Display(visible=0, backend='xvnb', size=(800, 600))
    # self.display = Display(visible=0, size=(800, 600))
    # self.display.start()
    # ----------------------------------------------------------------------------
    # load the webdriver with the profile created by 'disableImages()'
    self.driver = webdriver.Firefox(self.disableImages())

def test_host_header_seted_in_request_headers(self):
    def _test(response):
        self.assertEquals(response.body, 'example.com')
        self.assertEquals(request.headers.get('Host'), 'example.com')

    request = Request(self.getURL('host'), headers={'Host': 'example.com'})
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)

    d = self.download_request(request, BaseSpider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEquals, 'example.com')
    return d

def __init__(self, **kwargs): BaseSpider.__init__(self) startingAdd = "http://en.wikipedia.org/wiki/" self.inFile=kwargs['infile'] self.outFileLoc=kwargs['outfileLoc'] self.outFilePer=kwargs['outfilePer'] self.start_urls = [] self.url2locDic = {} self.url2urlDic = {} self.readFile(self.inFile) fout = open(self.outFileLoc,"w") fout = open(self.outFilePer,"w") fout.close
def __init__(self):
    BaseSpider.__init__(self)
    # use any browser you wish
    display.start()
    profile = webdriver.FirefoxProfile()
    # set up the browser driver configuration to download files properly
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.download.dir", ROOT_DIR)
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/csv")
    self.browser = webdriver.Firefox(firefox_profile=profile)

def test_store(self):
    out = StringIO()
    storage = StdoutFeedStorage('stdout:', _stdout=out)
    file = storage.open(BaseSpider("default"))
    file.write("content")
    yield storage.store(file)
    self.assertEqual(out.getvalue(), "content")

def test_payload(self):
    body = '1' * 100  # PayloadResource requires body length to be 100
    request = Request(self.getURL('payload'), method='POST', body=body)
    d = self.download_request(request, BaseSpider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEquals, body)
    return d

def __init__(self, username=None): BaseSpider.__init__(self) self.username, self.pwd = username.split(":") self.db = MySQLdb.connect( host="localhost", port=3306, user="******", passwd="pw", db="weibosearch2", charset="utf8", use_unicode=True ) self.cursor = self.db.cursor() self.logined = False log.msg("login with %s" % self.username, level=log.INFO) login_url = self.weibo.login(self.username, self.pwd) if login_url: log.msg("login successful, start crawling.", level=log.INFO) self.start_urls.append(login_url) else: log.msg("login failed", level=log.ERROR)
def test_host_header_not_in_request_headers(self):
    def _test(response):
        self.assertEquals(response.body, '127.0.0.1:%d' % self.portno)
        self.assertEquals(request.headers, {})

    request = Request(self.getURL('host'))
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)

def test_rules_manager_callback_with_arguments(self):
    spider = BaseSpider('foo')
    response = HtmlResponse('http://example.org')
    kwargs = {'a': 1}

    def myfunc(**mykwargs):
        return mykwargs

    # verify return validation
    self.failUnlessEquals(kwargs, myfunc(**kwargs))

    # test callback w/o arguments
    rulesman = RulesManager([
        Rule(BaseMatcher(), myfunc),
    ], spider)
    rule = rulesman.get_rule_from_response(response)
    # without arguments it should return the same callback
    self.failUnlessEqual(rule.callback, myfunc)

    # test callback w/ arguments
    rulesman = RulesManager([
        Rule(BaseMatcher(), myfunc, **kwargs),
    ], spider)
    rule = rulesman.get_rule_from_response(response)
    # with arguments it should return a partially applied callback
    self.failUnless(isinstance(rule.callback, partial))
    self.failUnlessEquals(kwargs, rule.callback())

def setUp(self):
    self.spider = BaseSpider('scrapytest.org')
    self.stats = StatsCollector()
    self.stats.open_spider(self.spider)
    self.mw = DepthMiddleware(1, self.stats, True)

def _assert_stores(self, storage, path): spider = BaseSpider("default") file = storage.open(spider) file.write("content") yield storage.store(file) self.failUnless(os.path.exists(path)) self.failUnlessEqual(open(path).read(), "content")
def test_store_load(self):
    jobdir = self.mktemp()
    os.mkdir(jobdir)
    spider = BaseSpider(name='default')
    dt = datetime.now()

    ss = SpiderState(jobdir)
    ss.spider_opened(spider)
    spider.state['one'] = 1
    spider.state['dt'] = dt
    ss.spider_closed(spider)

    spider2 = BaseSpider(name='default')
    ss2 = SpiderState(jobdir)
    ss2.spider_opened(spider2)
    # the state persisted by the first spider should be loaded into spider2
    self.assertEqual(spider2.state, {'one': 1, 'dt': dt})
    ss2.spider_closed(spider2)

def test_download_without_proxy(self):
    def _test(response):
        self.assertEquals(response.status, 200)
        self.assertEquals(response.url, request.url)
        self.assertEquals(response.body, '/path/to/resource')

    request = Request(self.getURL('path/to/resource'))
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)

def setUp(self):
    self.spider = BaseSpider('scrapytest.org')
    self.mw = DownloaderStats()
    stats.open_spider(self.spider)
    self.req = Request('http://scrapytest.org')
    self.res = Response('scrapytest.org', status=400)

def setUp(self):
    self.spider = BaseSpider('scrapytest.org')
    self.stats = StatsCollector()
    self.stats.open_spider(self.spider)
    self.mw = DepthMiddleware(1, self.stats)
    self.assertEquals(self.stats.get_value('envinfo/request_depth_limit'), 1)

def test_filter(self):
    spider = BaseSpider('foo')
    filter = NullDupeFilter()
    filter.open_spider(spider)
    r1 = Request('http://scrapytest.org/1')
    assert not filter.request_seen(spider, r1)
    filter.close_spider(spider)

def test_state_attribute(self):
    # state attribute must be present if jobdir is not set, to provide a
    # consistent interface
    spider = BaseSpider(name='default')
    ss = SpiderState()
    ss.spider_opened(spider)
    self.assertEqual(spider.state, {})
    ss.spider_closed(spider)

def test_download_with_proxy(self):
    def _test(response):
        self.assertEquals(response.status, 200)
        self.assertEquals(response.url, request.url)
        self.assertEquals(response.body, 'https://example.com')

    http_proxy = self.getURL('')
    request = Request('https://example.com', meta={'proxy': http_proxy})
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)

def test_download(self):
    def _test(response):
        self.assertEquals(response.url, request.url)
        self.assertEquals(response.status, 200)
        self.assertEquals(response.body, '0123456789')

    request = Request(path_to_file_uri(self.tmpname + '^'))
    assert request.url.upper().endswith('%5E')
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)

def setUp(self):
    self.spider = BaseSpider('foo')
    self.mw = HttpErrorMiddleware()
    self.req = Request('http://scrapytest.org')
    self.res200 = Response('http://scrapytest.org', status=200)
    self.res200.request = self.req
    self.res404 = Response('http://scrapytest.org', status=404)
    self.res404.request = self.req

def _schedule(self, request, spider):
    if spider is None:
        spider = create_spider_for_request(self.crawler.spiders, request,
                                           BaseSpider('default'),
                                           log_multiple=True)
    spider.set_crawler(self.crawler)
    self.crawler.engine.open_spider(spider)
    d = self.crawler.engine.schedule(request, spider)
    d.addCallback(lambda x: (x, spider))
    return d

def __init__(self, username=None):
    BaseSpider.__init__(self)
    self.username, self.pwd = username.split(':')
    self.db = MySQLdb.connect(host="localhost", port=3306, user="******",
                              passwd="pw", db="weibosearch2",
                              charset='utf8', use_unicode=True)
    self.cursor = self.db.cursor()
    self.logined = False
    host = settings.get('REDIS_HOST', REDIS_HOST)
    port = settings.get('REDIS_PORT', REDIS_PORT)
    log.msg('login with %s' % self.username, level=log.INFO)
    login_url = self.weibo.login(self.username, self.pwd)
    if login_url:
        log.msg('login successful, start crawling.', level=log.INFO)
        self.start_urls.append(login_url)
    else:
        log.msg('login failed', level=log.ERROR)

def __init__(self):
    BaseSpider.__init__(self)
    # settings
    settings.overrides['DOWNLOAD_DELAY'] = 0
    settings.overrides['LOG_FILE'] = "scrapy.log"
    settings.overrides['LOG_STDOUT'] = True
    settings.overrides['DOWNLOAD_TIMEOUT'] = 180
    settings.overrides['RETRY_TIMES'] = 10
    self.num_images_per_page = 20
    self.num_images = 60
    # base url for image searching
    self.base_url = "http://images.google.com/search?tbm=isch&safe=off"
    # regex object for extracting image url
    self.reobj_image = re.compile(r"http://\S+.gstatic.com[^\"\s]+")
    # initialize start_urls
    self.fill_start_urls()

def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    try:
        self.outFile = kwargs['outfile']
        self.endYear = int(kwargs['endYear'])
    except:
        print >> sys.stderr, "eventSpider needs 2 arguments: outfile, endYear"
        exit(1)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.start_urls = []
    for i in range(1500, 499, -10):
        add = startingAdd + str(i) + "_BC"
        self.start_urls.append(add)
    for i in range(499, 0, -1):
        add = startingAdd + str(i) + "_BC"
        self.start_urls.append(add)
    for i in range(1, self.endYear + 1):
        add = startingAdd + str(i)
        self.start_urls.append(add)
    # create/truncate the output file
    fout = open(self.outFile, "w")
    fout.close()

def __init__(self, name=None, **kwargs):
    if not hasattr(self, 'start_urls'):
        self.start_urls = []
    skiplist = []
    skip_file = '%s/skip.txt' % self.result_path
    if not os.path.isdir(self.result_path):
        os.makedirs(self.result_path)
    if os.path.isfile(skip_file):
        with open(skip_file, 'r') as fp:
            for eachline in fp:
                shopid = eachline.replace('\n', '')
                if shopid.isdigit():
                    skiplist.append(int(shopid))
    id_range = self.id_range
    if len(id_range) == 2:
        for i in xrange(id_range[0], id_range[1]):
            if i not in skiplist:
                url = '%s%s' % (self.start_url, i)
                self.start_urls.append(url)
    elif len(id_range) == 3:
        for i in xrange(id_range[0], id_range[1], id_range[2]):
            if i not in skiplist:
                url = '%s%s' % (self.start_url, i)
                self.start_urls.append(url)
    else:
        for i in id_range:
            if i not in skiplist:
                url = '%s%s' % (self.start_url, i)
                self.start_urls.append(url)
    if 1:
        # skip pages that have already been saved as html files
        file_list = [i for i in os.listdir(self.result_path) if i.endswith('.html')]
        exist_urls = [i.replace('.html', '') for i in file_list]
        self.start_urls = [i for i in self.start_urls
                           if i.split('/')[-1] not in exist_urls]
    BaseSpider.__init__(self, kwargs=kwargs)

class ManagerTestCase(TestCase):

    settings_dict = None

    def setUp(self):
        self.crawler = get_crawler(self.settings_dict)
        self.spider = BaseSpider('foo')
        self.spider.set_crawler(self.crawler)
        self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
        # some mw depends on stats collector
        self.crawler.stats.open_spider(self.spider)
        return self.mwman.open_spider(self.spider)

    def tearDown(self):
        self.crawler.stats.close_spider(self.spider, '')
        return self.mwman.close_spider(self.spider)

    def _download(self, request, response=None):
        """Executes the downloader mw manager's download method and returns
        the result (Request or Response), or raises an exception in case of
        failure.
        """
        if not response:
            response = Response(request.url)

        def download_func(**kwargs):
            return response

        dfd = self.mwman.download(download_func, request, self.spider)
        # catch the deferred result and return the value
        results = []
        dfd.addBoth(results.append)
        self._wait(dfd)
        ret = results[0]
        if isinstance(ret, Failure):
            ret.raiseException()
        return ret

def __init__(self):
    BaseSpider.__init__(self)
    DeputadoSpider.start_urls = self.get_start_urls()

def __init__(self):
    BaseSpider.__init__(self)
    self.counter = 1

def __init__(self):
    BaseSpider.__init__(self)