def _construct_query(self, page_num, query):
    url = 'http://www.innojoy.com/client/interface.aspx'
    data = {
        "requestModule": "PatentSearch",
        "userId": "",
        "patentSearchConfig": {
            "Query": query,
            "TreeQuery": "",
            "Database": "idpat,mypat,phpat,sgpat,itpat,inpat,inapp,chpat,frpat,gbpat,depat,jpapp,eppat,wopat,usapp,usdes,uspp,usre,uspat,fmsq,wgzl,syxx,fmzl",
            "Action": "Search",
            "Page": str(page_num),
            "PageSize": self._page_size,
            "GUID": "",
            "Sortby": "",
            "AddOnes": "",
            "DelOnes": "",
            "RemoveOnes": "",
            "TrsField": "",
            "SmartSearch": ""
        }
    }
    data_bin = json.dumps(data)
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'http://www.innojoy.com/SearchResult/default.shtml',
    }
    request = Request(url=url, method='POST', headers=headers, body=data_bin)
    # noinspection PyUnresolvedReferences
    request.callback = self.query_callback
    return request
def parse(self, response):
    send_emails = True
    try:
        with open(self.links):
            pass
    except IOError:
        # If the file is being created for the first time, do not send emails
        open(self.links, 'a').close()
        send_emails = False
    hxs = HtmlXPathSelector(response)
    results = hxs.select('//*[@id="results-anchor"]/*/a')
    for result in results:
        title = result.select('text()').extract()[0].strip()
        link = 'http://www.supost.com' + result.select('@href').extract()[0].strip()
        # Check whether we have already looked at this page
        exists = False
        for line in open(self.links):
            if link in line:
                exists = True
                break
        # If we have not seen the page before, add it to the links list
        if not exists:
            request = Request(link, callback=self.get_description)
            request.meta['title'] = title
            request.meta['link'] = link
            request.meta['send_emails'] = send_emails
            yield request
def test_request_cacheability(self):
    res0 = Response(self.request.url, status=200, headers={'Expires': self.tomorrow})
    req0 = Request('http://example.com')
    req1 = req0.replace(headers={'Cache-Control': 'no-store'})
    req2 = req0.replace(headers={'Cache-Control': 'no-cache'})
    with self._middleware() as mw:
        # response for a request with no-store must not be cached
        res1 = self._process_requestresponse(mw, req1, res0)
        self.assertEqualResponse(res1, res0)
        assert mw.storage.retrieve_response(self.spider, req1) is None
        # re-do the request without no-store and expect it to be cached
        res2 = self._process_requestresponse(mw, req0, res0)
        assert 'cached' not in res2.flags
        res3 = mw.process_request(req0, self.spider)
        assert 'cached' in res3.flags
        self.assertEqualResponse(res2, res3)
        # a request with the no-cache directive must not return a cached response,
        # but it allows a new response to be stored
        res0b = res0.replace(body=b'foo')
        res4 = self._process_requestresponse(mw, req2, res0b)
        self.assertEqualResponse(res4, res0b)
        assert 'cached' not in res4.flags
        res5 = self._process_requestresponse(mw, req0, None)
        self.assertEqualResponse(res5, res0b)
        assert 'cached' in res5.flags
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    for div in hxs.select('//div[@id="contem_boxes"]'):
        titulo = div.select('.//div[@id="contem_titulo"]/text()').extract()[0]
        if not titulo.endswith(u'mara dos Deputados/BR'):
            continue
        reg = re.compile(
            '<a class="listapar" href="(?P<url>.*?)">(?P<name>[\w\s]*[\w]+)\s*'
            '\(<b>[\w\s]+</b>\)\s-\s(?P<party>.*?)\/(?P<state>.*?)</a><br>',
            flags=re.U)
        for r in reg.finditer(div.extract()):
            dict_deputy = r.groupdict()
            # if dict_deputy['state'] in settings['STATE_TO_FILTER']:
            db_deputy = self.api.get_deputado_por_nome(dict_deputy['name'])
            if not db_deputy:
                dep = Deputado(dict_deputy['name'], dict_deputy['state'], dict_deputy['party'])
                self.api.inserir_deputado(dep)
            else:
                dep = db_deputy[0]
            id = urlparse.parse_qs(urlparse.urlparse(dict_deputy['url']).query).get('id', [0])[0]
            if not id:
                continue
            request = Request(urljoin(self.base_url, '@presencas.php?id=%s' % id),
                              callback=self.parse_deputy_assiduity)
            request.meta['dep'] = dep
            yield request
            request = Request(urljoin(self.base_url, '@uso_verbas_als.php?uf=16&id=%s' % id),
                              callback=self.parse_deputy_costs)
            request.meta['dep'] = dep
            yield request
def process_wsj_sitemap(spider, body):
    print "Enter processing sitemap for wsj"
    data = bs(body)
    urls = data.find_all('url')
    for url in urls:
        link = url.loc.text
        news = url.find('news:news')
        item = None
        if news is not None:
            item = SitemapItem()
            title = news.find('news:title')
            item['title'] = title.text
            # format: 2014-04-27T05:49:00-05:00
            date = news.find('news:publication_date')
            dt = parse(date.text)
            dt_utc = dt.astimezone(dateutil.tz.tzutc()).replace(tzinfo=None)
            item['update'] = dt_utc
            # need to save/get the last crawled timestamp to decide whether to recrawl the link
            # pattern: http://online.wsj.com/google_sitemap_Q1_1996.xml
        req = Request(link, callback=spider.process_page)
        if item is not None:
            req.meta['item'] = item
        yield req
def parse(self, response):
    i = 0
    for div in response.xpath('//li[@class="conference vevent"]'):
        item = AfeventItem()
        item['location'] = div.xpath('.//p[@class="location"]/a[3]/text()').extract_first()
        item['title'] = div.xpath('//h4/a/text()').extract()[i]
        item['date'] = div.xpath('//p[@class="date"]/abbr[1]/@title').extract()[i]
        item['host'] = ''
        item['time'] = ''
        item['description'] = ''
        # Follow the event detail page and pass the partially filled item along
        url = 'http://lanyrd.com' + div.xpath('//h4/a/@href').extract()[i]
        request = Request(url, callback=self.parse_url)
        request.meta['item'] = item
        if i < len(response.xpath('//li[@class="conference vevent"]')):
            i = i + 1
        yield request
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    magic_sets_full = hxs.select('//div[@class="left_block"]//ul[@class="left_menu"]//li/a/text()').extract()
    links_to_magic_sets_full = hxs.select('//div[@class="left_block"]//ul[@class="left_menu"]//li/a/@href').extract()
    # Cut down to the first category for debugging purposes:
    magic_sets = magic_sets_full[0]
    links_to_magic_sets = links_to_magic_sets_full[0]
    # self.log("This is the first category and the link to it: %s, %s, %s" % (type(magic_sets), magic_sets, links_to_magic_sets))
    # Now all magic sets are together with the links to them.
    # Uncomment this after debugging:
    # magic_sets_zip = dict(zip(magic_sets, links_to_magic_sets))
    magic_sets_zip = dict([[magic_sets, links_to_magic_sets]])
    date_prefix = time.strftime("%Y%m%d", time.localtime())
    try:
        os.mkdir("./archive/HTML/" + date_prefix)
    except OSError:
        self.log("The folder exists!")
    filename = "./archive/HTML/" + date_prefix + "/" + response.url.split("/")[-1] + ".htm"
    self.log("This is the filename for the index: %s" % (filename,))
    try:
        open(filename, "wb").write(response.body)
    except OSError:
        os.remove(filename)
        open(filename, "wb").write(response.body)
    # Continue to extract data:
    for magic_set, url in magic_sets_zip.iteritems():
        abs_url = urljoin("http://www.blackborder.com", url)
        self.log("This is the magic set name and the url to it: %s ---> %s" % (magic_set, abs_url))
        request = Request(abs_url, callback=self.parse_set_page)
        request.meta["magic_set"] = magic_set
        request.meta["date_prefix"] = date_prefix
        yield request
def start_requests(self):
    page = 1
    search_url = self.get_search_url(page)
    request = Request(search_url)
    request.meta['page'] = page
    yield request
def test_referer_header(self):
    """Referer header is set by RefererMiddleware unless it is already set"""
    req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
    req1 = req0.replace()
    req2 = req0.replace(headers={'Referer': None})
    req3 = req0.replace(headers={'Referer': 'http://example.com'})
    req0.meta['next'] = req1
    req1.meta['next'] = req2
    req2.meta['next'] = req3
    spider = SingleRequestSpider(seed=req0)
    yield docrawl(spider)
    # basic asserts in case of weird communication errors
    self.assertIn('responses', spider.meta)
    self.assertNotIn('failures', spider.meta)
    # start requests don't set the Referer header
    echo0 = json.loads(spider.meta['responses'][0].body)
    self.assertNotIn('Referer', echo0['headers'])
    # the following request sets Referer to the start request url
    echo1 = json.loads(spider.meta['responses'][1].body)
    self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
    # the next request avoids the Referer header
    echo2 = json.loads(spider.meta['responses'][2].body)
    self.assertNotIn('Referer', echo2['headers'])
    # the last request explicitly sets a Referer header
    echo3 = json.loads(spider.meta['responses'][3].body)
    self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
def _test(method):
    url = 'http://www.example.com/301'
    url2 = 'http://www.example.com/redirected'
    req = Request(url, method=method)
    req.meta['origin_url'] = url
    resp = Response(url, headers={'Location': url2}, status=301)
    req2 = mw.process_response(req, resp, self.spider)
    assert isinstance(req2, Request)
    self.assertEqual(req2.url, url2)
    self.assertEqual(req2.method, method)
    del resp.headers['Location']
    assert mw.process_response(req, resp, self.spider) is resp
    bad_url1 = 'http://baidu.com/'
    bad_url2 = 'http://baidu.com/xx'
    resp2 = Response(url, headers={'Location': bad_url1}, status=301)
    resp3 = Response(url, headers={'Location': bad_url2}, status=301)
    req.meta['proxy'] = 'xx.xx.xx.xx:301'
    req2 = mw.process_response(req, resp2, self.spider)
    req3 = mw.process_response(req, resp3, self.spider)
    assert isinstance(req2, Request)
    assert isinstance(req3, Request)
    self.assertEqual(req2.url, url)
    self.assertEqual(req3.url, url)
def start_requests(self):
    with open(getattr(self, "file", "todo.csv"), "rU") as f:
        reader = csv.DictReader(f)
        for line in reader:
            request = Request(line.pop('url'))
            request.meta['fields'] = line
            yield request
def getComments(self, response):
    Item = response.meta['item']
    res_text = response.body_as_unicode().encode('ascii', 'ignore')
    res_text = smart_str(self.parser.unescape(self.parser.unescape(res_text))).replace('\xc2\xa0', '')
    res_text = res_text.replace('\n', ' ').replace('\t', ' ').replace('\r', '')
    res_text = re.subn('<script.*?</script>', '', res_text)[0]
    res_text = re.subn('<style.*?</style>', '', res_text)[0]
    hxs = HtmlXPathSelector(text=res_text)
    tmp = hxs.select('//div[@id="ds_div"]//text()').extract()
    comments = ''
    for val in tmp:
        val = val.strip()
        if val != '':
            comments += val + ' '
    Item['Comments'] = comments
    try:
        offers_url = 'http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=' + Item['eBay_Item_Number']
        if Item['eBay_Item_Number'] != 'NA' and Item['eBay_Item_Number'] != '':
            req = Request(offers_url, dont_filter=True, callback=self.getPostingDate)
            req.meta['item'] = Item
            return req
    except:
        pass
    return Item
def start_requests(self):
    page = 1
    search_url = SEARCH_URL.format(page=page)
    request = Request(search_url)
    request.meta['page'] = page
    yield request
def start_requests(self):
    """Default Scrapy method to send requests."""
    # If the spider is already active, close it to avoid overlapping runs
    if self.settings['active'] == 'T':
        log.msg('[OVERLAP] - at %s EST' % (datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")),
                level=log.INFO)
        raise exceptions.CloseSpider('Recon Spider already active')
    # Mark the spider as active
    ReconSpiderSettings(self.site).write_active('T')
    log.msg('[START_ID] - %s at %s EST' % (str(self.settings['recon_startid']),
                                           datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")),
            level=log.INFO)
    log.msg('[CYCLES] - %s at %s EST' % (str(self.settings['cycles']),
                                         datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")),
            level=log.INFO)
    # A new recon_startid is required; if it is missing, close the spider
    if self.settings['recon_startid'] == -1:
        raise exceptions.CloseSpider('Provide a start_id value via the start_id parameter for initializing')
    # Generate the list of ids to reconnoiter
    url_ids = generate_ids(self.site)
    # Send URL requests
    for id in url_ids:
        req = Request("".join((self.base_url, str(id))), dont_filter=True, callback=self.parse)
        # save url_id for the callback
        req.meta['url_id'] = id
        yield req
def parse(self, response):
    sel = Selector(response)
    if self.dont_crawl:
        request = Request(response.url, callback=self.parse_single_episode)
        item = PlayGrabberItem()
        item['show_url'] = response.url
        # Store the original show id (to be able to detect mixing of seasons)
        item['original_show_id'] = '00000'
        # Pass on the item for further populating
        request.meta['episode-item'] = item
        return request
    # If this is a show index page, we need to get the URL for a single episode
    # (anyone will do, let's take the latest)
    try:
        any_episode_base_url = sel.xpath('//a[@class="play_title-page-trailer__start-button"]/@href').extract()[0]
        any_episode_url = 'http://www.svtplay.se' + any_episode_base_url
    except:
        # Otherwise we assume this url is for a single episode and not an index
        # page, and use it directly
        any_episode_url = response.url
    # Call this page again and make sure we get all episodes
    all_season_tabs = sel.xpath("//a[@class='play_accordion__section-title']/@href").re('[^#]*')
    # Don't include the shorts
    check_season_tabs = [t for t in all_season_tabs if t != '?tab=klipp']
    requests = []
    for tab in check_season_tabs:
        all_episodes_url = any_episode_url.split('?')[0] + tab + '&sida=99'
        request = Request(all_episodes_url, callback=self.parse_all_episodes)
        requests.append(request)
    return requests
def parse_all_episodes(self, response):
    # Now extract all episodes and grab each of them
    sel = Selector(response)
    all_episode_urls = sel.xpath("//li/article//a/@href").extract()
    if not all_episode_urls:
        if response.url.endswith('sida=99'):
            # If the number of episodes fits on just one page, the "sida=99" barfs
            # and returns zero hits. Retry without it.
            self.log("Retrying for all episodes assuming a single page for %s" % response.url)
            all_episodes_url = response.url.split('?')[0] + '?tab=senast'
            return Request(all_episodes_url, callback=self.parse_all_episodes)
        else:
            self.log("No episodes available for show %s" % response.url)
    else:
        # Original show_id is not used anymore
        original_show_id = '00000'
        # Get the show url (only valid for top-level pages), but not really important
        show_url = sel.xpath("//meta[@property='og:url']/@content").extract()[0]
        content_type = sel.xpath("//meta[@property='og:type']/@content").extract()[0]
        if content_type != 'video.tv_show':
            self.log("WARNING: This is not a top-level page.")
        requests = []
        for url in all_episode_urls:
            request = Request('http://www.svtplay.se' + url, callback=self.parse_single_episode)
            item = PlayGrabberItem()
            item['show_url'] = show_url
            # Store the original show id (to be able to detect mixing of seasons)
            item['original_show_id'] = original_show_id
            # Pass on the item for further populating
            request.meta['episode-item'] = item
            requests.append(request)
        return requests
def parse_page(self, response):
    task = response.meta['task']
    county_id = response.meta['county_id']
    hxs = HtmlXPathSelector(response)
    # inspect_response(response)
    # Get the next page
    next = hxs.select("//a[contains(text(),'Next')]/@href")
    if len(next) > 0:
        request = Request(urljoin(response.url, next[0].extract()),
                          callback=self.parse_page,
                          errback=self.error_callback,
                          dont_filter=True)
        request.meta['task'] = task
        request.meta['county_id'] = county_id
        yield request
    else:
        yield self.form_request(task)
    rows = hxs.select('/html/body/table[4]/tr')
    if len(rows) == 0:
        self.send_alert('No permit data found in search response')
        self.log('No permit data table present in response', log.ERROR)
    elif len(rows) == 1:
        self.log('No incident reports found in response', log.WARNING)
    else:
        # Skip the first record because it is the header row
        rows.pop(0)
        self.log('Retrieved {0} permits'.format(len(rows)), log.INFO)
        for row in rows:
            r = dict(zip(self.field_names,
                         [f.strip() for f in row.select('td/text()').extract_unquoted()]))
            r['county'] = self.counties[county_id]
            for item in self.process_row(r, task):
                yield item
def parse_chapter_page(self, response):
    hxs = HtmlXPathSelector(response)
    chapter = response.meta['chapter']
    page_number = response.meta['page_number']
    image_url = hxs.select('id("image")/@src').extract()[0]
    chapter.pages[page_number] = image_url
    if len(chapter.pages) == chapter.pages_count:
        # Progress signal incremented
        # print 'All urls of chapter %s of volume %s retrieved. Starting download...' % (chapter.chapter_number, chapter.volume)
        # e.g. brave_10/brave_10_v01/brave_10_v01_c01/brave_10_v01_c001_p001.jpg
        chapter_dir_name = chapter.chapter_number
        chapter_dir = os.path.join(self.title, chapter_dir_name)
        chapter.storage_dir = chapter_dir
        chapter.filename_pattern = '%03d.jpg'
        if os.path.exists(chapter_dir):
            shutil.rmtree(chapter_dir)
        os.makedirs(chapter_dir)
        reqs = []
        for page, image_url in chapter.pages.iteritems():
            page_image_request = Request(image_url, callback=self.process_page_image)
            page_image_request.meta['chapter'] = chapter
            page_image_request.meta['page_number'] = page
            reqs.append(page_image_request)
        self.count += 1
        # self.emit(QtCore.SIGNAL("progress(int)"), (self.count*100)/self.totalChapt)
        self.prog.setValue((self.count * 100) / self.totalChapt)
        return reqs
def parse_hospital(self, response):
    hxs = Selector(response)
    department_urls = hxs.xpath("//table[@id='hosbra']/tr/td/a[@class='blue']/@href").extract()
    for department_url in department_urls:
        request = Request(department_url, callback=self.parse_doctors)
        request.meta['city'] = response.meta['city']
        yield request
def parse(self, response):
    sel = Selector(response)
    title = sel.xpath('//div/div[@id="gd2"]/h1[@id="gj"]/text()').extract()[0]
    p = re.compile('[!|?|\\|/]')
    title = re.sub(p, '', title)
    sites = sel.xpath('//div[@id="gdt"]/div[@class="gdtm"]')
    next_page = ''
    if sel.xpath('(//table/tr/td[@onclick="sp({0})"])/a/@href'.format(self.next_page_count)).extract() != []:
        next_page = sel.xpath('(//table/tr/td[@onclick="sp({0})"])/a/@href'.format(self.next_page_count)).extract()[0]
        self.next_page_count += 1
    for site in sites:
        item = EhendownItem()
        item['title'] = title
        item['page'] = '{0:0>3d}'.format(self.page_count)
        item['image_page'] = site.xpath('div/a/@href').extract()[0]
        request = Request(item['image_page'], callback=self.parse_image)
        request.meta['item'] = item
        self.page_count += 1
        yield request
    print('page: ' + next_page)
    print(self.page_count)
    if next_page:
        yield Request(next_page, callback=self.parse)
def process_bbc_sitemap(spider, body):
    print "Enter processing sitemap for bbc"
    data = bs(body)
    urls = data.find_all('url')
    for url in urls:
        link = url.loc.text
        news = url.find('news:news')
        item = None
        if news is not None:
            item = SitemapItem()
            title = news.find('news:title')
            item['title'] = title.text
            # format: 2014-04-25T09:43:49Z
            date = news.find('news:publication_date')
            item['update'] = datetime.datetime.strptime(date.text.strip(), '%Y-%m-%dT%H:%M:%SZ')
        else:
            lastmod = url.find('lastmod')
            if lastmod:
                item = SitemapItem()
                # format: 2014-04-25T10:27:05Z
                item['update'] = datetime.datetime.strptime(lastmod.text.strip(), '%Y-%m-%dT%H:%M:%SZ')
        req = Request(link, callback=spider.process_page)
        if item is not None:
            req.meta['item'] = item
        yield req
def parse(self, response):
    currentPage = response.xpath('//div[@class="page mb10"]/span/text()').extract()[0]
    print '---------------- Page: %s ----------------' % str(currentPage)
    jbs = response.xpath('//i[@class="iDes"]')
    for jb in jbs:
        jburl = jb.xpath('em[@class="eName"]/span/a/@href').extract()[0] if jb.xpath('em[@class="eName"]/span/a/@href') else ''
        keshi = jb.xpath('em[@class="eSym"]/a/text()').extract() if jb.xpath('em[@class="eName"]/span/a/@href') else ''
        if jburl != '':
            request = Request(jburl, callback=self.parse_nav)
            request.meta['ks'] = keshi
            yield request
    # Follow the "next page" link and crawl it with the same callback
    nextpages = response.xpath('//div[@class="page mb10"]/a[@class="next"]/@href').extract()
    if len(nextpages) > 0:
        nextpage = nextpages[0]
        print '----- next page ------'
        print nextpage
        print '----- next page ------'
        req = Request(url=nextpage, callback=self.parse)
        yield req
def parse_education_structure_page(self, response):
    """
    This method is specific for the VO-schools, as these can have multiple
    educational structures (vmbo, havo, vwo, ...)
    """
    hxs = HtmlXPathSelector(response)
    structures = hxs.select('//li[@class="match"]/noscript/a')
    # The VOSchool item to be populated
    organisation = VOSchool()
    organisation['education_structures_to_scrape'] = set()
    # If we end up at the school's page directly, immediately yield the request
    if not structures:
        request = Request(response.url, self.parse_organisation_detail_page)
        request.meta['item'] = organisation
        yield request
    organisation['name'] = hxs.select('//h1[@class="stitle"]/text()').extract()[0].strip()
    crawl_structures = {}
    for structure in structures:
        url = 'http://toezichtkaart.owinsp.nl/schoolwijzer/%s' % structure.select('@href').extract()[0]
        url = self.open_blocks(url)
        crawl_structures[url] = structure.select('text()').extract()[0]
        organisation['education_structures_to_scrape'].add(url)
    for url, structure in crawl_structures.iteritems():
        request = Request(url, self.parse_organisation_detail_page)
        request.meta['item'] = organisation
        request.meta['structure'] = structure
        yield request
def parse_categories(self, response):
    # item = response.meta['item']
    sel = Selector(response)
    # item["Category"] = ""
    url = sel.xpath("//table[@class='PageNormalTextSmall']/tr/td[@align='center']/a/@href").extract()
    brand = sel.xpath("//td[@colspan='3']/span/text()").extract()
    category = []
    size = len(brand)
    for i in range(size):
        category.append(sel.xpath("//span[@class='PageHeaderText']/text()").extract()[0])
    for x, name, cat in zip(url, brand, category):
        item = BigCItem()
        item["Category"] = cat
        for i in range(len(starkenncat)):
            if cat == starkenncat[i]:
                item["Category"] = LYScat[i]
                break
        else:
            item["Category"] = "NA-" + cat
        item["Brand_Name"] = name
        # For parsing item information if the search keyword is found
        request = Request("http://www.starkennbikes.com/" + x, callback=self.parse_items)
        request.meta["item"] = item
        yield request
def parse(self, response):
    for href in response.xpath('//span[@class="stats-comments"]/a/@href').extract():
        detail_url = response.urljoin(href)
        req = Request(detail_url, self.parse_detail_page)
        item = QiubaiItem()
        req.meta['item'] = item
        yield req
def parse_items(self, response):
    hxs = Selector(response)
    print "came here"
    data = imdbItem()
    data["seriesRating"] = hxs.xpath('//span[@itemprop="ratingValue"]/text()').extract()
    print data["seriesRating"]
    seasonLink = hxs.xpath('//*[@id="title-episode-widget"]/div/div[3]/a/@href').extract()
    print seasonLink
    # Directly go to the ratings page
    '''
    if not seasonLink == []:
        # print data["link"]
        url = data["link"][0] + 'epdate'
        request = Request(url, callback=self.parse_episode_ratings)
        request.meta['item'] = data
        yield request
    '''
    # Follow season links - can get more data as opposed to the above method
    if not seasonLink == []:
        for season in seasonLink:
            link = 'http://www.imdb.com/' + season
            request = Request(link, callback=self.parse_season_links)
            request.meta['item'] = data
            yield request
def get_next_page_request(self, response):
    sleep_time = self.crawler.settings.get('DOWNLOAD_DELAY', 1)
    time.sleep(sleep_time)
    request_data = response.request.body
    data = json.loads(request_data)
    page_number = data.get('PageNumber', None)
    if not page_number:
        return
    page_number = int(page_number) + 1
    data['PageNumber'] = page_number
    referer = response.request.headers['Referer']
    url = 'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.aspx/GetXmlResult'
    headers = {
        'Content-Type': 'application/json; charset=UTF-8',
        'Referer': referer,
        'Cookie': get_cookie(),
    }
    _response = requests.post(url, data=json.dumps(data), headers=headers)
    try:
        result = _response.json()
        if result['d'][0] is not None:
            immediate_response = response_requests2scrapy(_response)
            meta = {'immediate_response': immediate_response}
            request = Request(url, headers=headers, method='POST', meta=meta)
            # noinspection PyUnresolvedReferences
            request.callback = self.query_callback
            return request
    except Exception as e:
        log.msg('spider turn page error: %s' % str(e), level=log.INFO)
        return None
def parse_themes(self, response):
    item = response.meta['item']
    colors_groups = response.meta['colors_groups']
    themes = response.xpath("//div[@class='sf_items sf_colors'][2]/ul/li/@title").extract()
    item["themes"] = ",".join(themes)
    if colors_groups.get('varies') is not None:
        colors_groups["varies_res"] = list(colors_groups["varies"])
        colors = []
        colors_x = response.xpath("//div[@class='sf_items sf_colors'][1]/ul/li")
        for color_x in colors_x:
            colors.append([color_x.xpath("@title").extract()[0],
                           color_x.xpath("substring-after(a/@href, '=')").extract()[0]])
        color = colors.pop(0)
        request = Request(url="http://www.madeleine.de%s?cf=%s" % (item["path"], color[1]),
                          callback=self.find_colors)
        request.meta['item'] = item
        request.meta['colors_groups'] = colors_groups
        request.meta['cur_color'] = color[0]
        request.meta['colors'] = colors
        return request
    else:
        item["colors"] = self.make_colors(colors_groups)
        self.check_item(item)
        return item
def test_download_gzip_response(self):
    if twisted_version > (12, 3, 0):
        crawler = get_crawler(SingleRequestSpider)
        body = b"1" * 100  # PayloadResource requires body length to be 100
        request = Request("http://localhost:8998/payload", method="POST",
                          body=body, meta={"download_maxsize": 50})
        yield crawler.crawl(seed=request)
        failure = crawler.spider.meta["failure"]
        # download_maxsize < 100, hence the CancelledError
        self.assertIsInstance(failure.value, defer.CancelledError)
        if six.PY2:
            request.headers.setdefault(b"Accept-Encoding", b"gzip,deflate")
            request = request.replace(url="http://localhost:8998/xpayload")
            yield crawler.crawl(seed=request)
            # download_maxsize = 50 is enough for the gzipped response
            failure = crawler.spider.meta.get("failure")
            self.assertTrue(failure is None)
            reason = crawler.spider.meta["close_reason"]
            self.assertTrue(reason, "finished")
        else:
            # See issue https://twistedmatrix.com/trac/ticket/8175
            raise unittest.SkipTest("xpayload only enabled for PY2")
    else:
        raise unittest.SkipTest("xpayload and payload endpoint only enabled for twisted > 12.3.0")
def parse_content(self, response):
    item = UyuItem()
    selector_content = Selector(response)
    req = []
    Article_Content = selector_content.xpath('//div[@class="content"]/article[@class="excerpt"]')
    for article in Article_Content:
        article_names = article.xpath('//div[@class="content"]/article[@class="excerpt"]/header/h2/a/text()').extract()
        article_urls = article.xpath('//div[@class="content"]/article[@class="excerpt"]/header/h2/a/@href').extract()
        # Article summary titles
        for article_name in article_names:
            item["article_name"] = article_name
        for url in article_urls:
            r = Request(url, callback=self.parse_article)
            r.meta["item"] = item
            req.append(r)
        nextLink = article.xpath('//li[@class="next-page"]/a/@href').extract()
        if nextLink:
            nr = Request(nextLink[0], callback=self.parse_content)
            req.append(nr)
    return req
def parse(self, response):
    links = response.xpath("//div[@class='entry-content']//a/@href").extract()
    links = links[:-1]
    for i in links:
        yield Request(i, callback=self.parse_book, dont_filter=True)
def parse(self, response):
    base_url = get_base_url(response)
    categories = response.xpath('//ul[@class="main-nav"]/li/a/@href').extract()[1:]
    for url in categories:
        yield Request(urljoin_rfc(base_url, url), cookies=self.additional_cookies)
    sub_categories = response.xpath('//div[@class="sidenav-title" and span/text()="Browse Categories"]'
                                    '/following-sibling::div[@class="inner"]//a/@href').extract()
    for url in sub_categories:
        yield Request(urljoin_rfc(base_url, url), cookies=self.additional_cookies)
    per_page = set(response.xpath('//div[contains(@class, "showing-per-page")]//option/@value').extract())
    if per_page:
        per_page_param = url_query_parameter(response.url, 'productsPerPage')
        if per_page_param != '48':
            url = add_or_replace_parameter(response.url, 'productsPerPage', '48')
            url = add_or_replace_parameter(url, 'page', '0')
            yield Request(url, cookies=self.additional_cookies)
            return
    # Check for a valid location
    is_valid, country_detected = self._is_valid_location(response)
    if not is_valid:
        reason = 'Wrong country detected: %s' % country_detected
        new_request = self._retry_request(response, self.parse, reason)
        if new_request:
            yield new_request
        return
    # Parse products
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if data:
        product_ids = response.xpath('//div[@itemtype="http://schema.org/Product"]/@data-id').extract()
        product_urls = map(lambda u: urljoin_rfc(base_url, u),
                           response.xpath('//div[@itemtype="http://schema.org/Product"]'
                                          '/div[@class="product-info"]/div[@class="title"]/a/@href').extract())
        product_imgs = map(lambda u: urljoin_rfc(base_url, u),
                           response.xpath('//div[@itemtype="http://schema.org/Product"]//a[@class="product-image"]'
                                          '//img[@class="product-image-file"]/@src').extract())
        rrp_prices = {}
        for product_id in product_ids:
            rrp_price = response.xpath('//div[@data-id="%s"]//div/@data-tc-original-price' % product_id).extract()
            if rrp_price:
                rrp_prices[product_id] = rrp_price[0]
        products_extra_data = {}
        for product_id, product_url, product_img in zip(product_ids, product_urls, product_imgs):
            products_extra_data[product_id] = {
                'url': product_url,
                'image_url': product_img,
            }
        category = ''
        categories = filter(lambda item: item['type'] == 'http://data-vocabulary.org/Breadcrumb', data['items'])
        if categories:
            category = categories[0]['properties']['title'][1]
        brands = set(response.xpath('//div[@class="filter-brand-wrapper"]'
                                    '//label[contains(@for, "product-listings__filter-top-brands-")]/a[@disabled]/text()')
                     .re(r'(.*) \('))
        products = filter(lambda item: item.get('type', '') == 'http://schema.org/Product', data['items'])
        for product in products:
            product_id = product['properties']['productId']
            ajax_url = self.AJAX_URL % product_id
            headers = {'X-Requested-With': 'XMLHttpRequest'}
            req = Request(ajax_url, headers=headers, callback=self.parse_options,
                          meta={'main_product': product['properties'],
                                'category': category,
                                'products_extra': products_extra_data,
                                'brands': brands,
                                'rrp_prices': rrp_prices,
                                'proxy': response.meta.get('proxy'),
                                'proxy_service_disabled': True},
                          cookies=self.additional_cookies)
            yield req
    # Check for a next page and follow it if it exists
    next_page = response.xpath('//li[@class="next"]/a/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]), cookies=self.additional_cookies)
def test_request_response(self):
    req = Request('http://example.com/index.html')
    resp = Response(req.url, status=200)
    ret = self._download(req, resp)
    self.assertTrue(isinstance(ret, Response), "Non-response returned")
def parse(self, response): try: chf = " — " cny = " — " eur = " — " gbp = " — " ron = " — " rub = " — " usd = " — " print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!! NachBank!") sel = Selector(response) result1 = sel.xpath('//table[@class="datatable"]/tbody/tr') for some1 in result1: name = some1.xpath('.//td//text()').extract()[0] #print (name) value = some1.xpath('.//td//text()').extract()[-1] if (name == "EUR"): eur = value elif (name == "USD"): usd = value elif (name == "CNY"): cny = value elif (name == "RUB"): rub = value elif (name == "RON"): ron = value elif (name == "GBP"): gbp = value elif (name == "CHF"): chf = value yield ArticleItem( EUR=eur, USD=usd, CNY=cny, RUB=rub, RON=ron, GBP=gbp, CHF=chf, #description = des, url='https://www.mnb.hu/arfolyamok', title="NachBank") except: contact_message = """ Bank NachBank:\n ERROR """ send_mail("Bank fail", contact_message, self.from_email, [self.to_email]) print( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!! NachBank! ERROR--ERROR--ERROR--ERROR" ) #KREDIT yield Request( "https://www.bankracio.hu/hitelkalkulator/lakashitel/2-lakasvasarlasi-hitel-uj-lakasra", callback=self.kredit) #BUDAPEST yield Request( "https://www.budapestbank.hu/info/arfolyamok/db_arfolyamok.php?sent=1&frm_arfolyam=CCR", callback=self.budapest) #CIB yield Request("http://www.cib.hu/maganszemelyek/arfolyamok/arfolyamok", callback=self.cib) # ERSTE! try: resp4 = yield Request( "http://www.erstebank.hu/ekwa-web-web/includes/content/currency/exchangeRates.xhtml" ) yield FormRequest.from_response( resp4, formxpath='//input[@id="exchangeRateForm:j_idt31"]', callback=self.erste) except: contact_message = """ Bank ERSTE:\n ERROR """ send_mail("Bank fail", contact_message, self.from_email, [self.to_email]) print( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ERSTE! ERROR--ERROR--ERROR--ERROR" ) # Valuta History resp12 = yield Request("https://www.mnb.hu/en/arfolyam-lekerdezes") yield FormRequest.from_response(resp12, formxpath='//input[@id="geterates"]', callback=self.nach_history) #GRANIT yield Request("https://granitbank.hu/arfolyamok", callback=self.granit) #OTP yield Request( "https://www.otpbank.hu/apps/exchangerate/api/exchangerate/otp/{}". format(str(datetime.now()).split(" ")[0]), callback=self.otp) #RAIFFEISEN yield Request( "https://www.raiffeisen.hu/hasznos/arfolyamok/lakossagi/valutaarfolyamok", callback=self.raiffeisen) #K&H k_and_h yield Request("https://www.kh.hu/valuta-deviza-arfolyam", callback=self.k_and_h) #MKB yield Request("https://www.mkb.hu/apps/rates/rates?type=CAD", callback=self.mkb, method="GET") #UNICREDIT try: print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
UNICREDIT!") d = datetime.today() date_now = str(d.year) if len(str(d.month)) == 1: date_now = date_now + "0" + str(d.month) else: date_now = date_now + str(d.month) if len(str(d.day)) == 1: date_now = date_now + "0" + str(d.day) else: date_now = date_now + str(d.day) date_now = date_now + "T" if len(str(d.hour)) == 1: date_now = date_now + "0" + str(d.hour) else: date_now = date_now + str(d.hour) date_now = date_now + "23:00:00.000+0300" payload = { 'Currency': '*ALL', 'DateFrom': date_now, 'DateTo': date_now } headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept': '*/*', 'Content-Type': 'application/json', 'EntityCode': 'HU', 'Language': 'HU', 'SourceSystem': 'PWS', 'Product': 'PWS' } respons0 = requests.post( 'https://www.unicreditbank.hu/cwa/GetExchangeRates', headers=headers, data=json.dumps(payload)) sel_11 = Selector(respons0) result = json.loads(sel_11.xpath('//p').extract()[0][3:-4]) chf = " — " cny = " — " eur = " — " gbp = " — " ron = " — " rub = " — " usd = " — " for some in range(0, len(result)): if result[some]["CurrencyCode"] in [ 'CHF', 'CNY', 'EUR', 'GBP', 'RON', 'RUB', 'USD' ]: if result[some]["CurrencyCode"] == 'CHF': chf = str( "%.2f" % result[some]["PurchaseRate"]) + "/" + str( "%.2f" % result[some]["SaleRate"]) elif result[some]["CurrencyCode"] == 'CNY': cny = str( "%.2f" % result[some]["CashPurchaseRate"]) + "/" + str( "%.2f" % result[some]["CashSaleRate"]) elif result[some]["CurrencyCode"] == 'EUR': eur = str( "%.2f" % result[some]["PurchaseRate"]) + "/" + str( "%.2f" % result[some]["SaleRate"]) elif result[some]["CurrencyCode"] == 'GBP': gbp = str( "%.2f" % result[some]["PurchaseRate"]) + "/" + str( "%.2f" % result[some]["SaleRate"]) elif result[some]["CurrencyCode"] == 'RON': ron = str( "%.2f" % result[some]["CashPurchaseRate"]) + "/" + str( "%.2f" % result[some]["CashSaleRate"]) elif result[some]["CurrencyCode"] == 'RUB': rub = str( "%.2f" % result[some]["CashPurchaseRate"]) + "/" + str( "%.2f" % result[some]["CashSaleRate"]) elif result[some]["CurrencyCode"] == 'USD': usd = str( "%.2f" % result[some]["PurchaseRate"]) + "/" + str( "%.2f" % result[some]["SaleRate"]) yield ArticleItem( EUR=eur, USD=usd, CNY=cny, RUB=rub, RON=ron, GBP=gbp, CHF=chf, #description = des, url= "https://www.unicreditbank.hu/hu/maganszemelyek/exchange_rate.html", title="UNICREDIT") except: contact_message = """ Bank UNICREDIT:\n ERROR or Weekend """ send_mail("Bank fail", contact_message, self.from_email, [self.to_email]) print( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!! UNICREDIT! ________ ERROR or Weekend" ) try: print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!! SBERBANK!") d = datetime.today() date_now = str(d.year) if len(str(d.month)) == 1: date_now = date_now + ".0" + str(d.month) else: date_now = date_now + "." + str(d.month) if len(str(d.day)) == 1: date_now = date_now + ".0" + str(d.day) else: date_now = date_now + "." 
+ str(d.day) payload = { 'maxDays': "60", 'language': "hu", 'rateType': "valuta", 'dateFrom': date_now, 'allCurrency': "true" } headers = { 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8' } respons5 = requests.post( 'https://www.sberbank.hu/servlet/currencyRateServlet', headers=headers, data=payload) sel_10 = Selector(respons5) result = json.loads(sel_10.xpath('//p').extract()[0][3:-4]) print(result["notFound"]) if result["notFound"] == False: cny = " — " ron = " — " for some in range( 0, len(result["currencyRatesByDay"][0]["currencyRates"])): if result["currencyRatesByDay"][0]["currencyRates"][some][ "currency"] == 'CHF': chf = str( result["currencyRatesByDay"][0]["currencyRates"] [some]["buyRate"]) + "/" + str( result["currencyRatesByDay"][0] ["currencyRates"][some]["sellRate"]) elif result["currencyRatesByDay"][0]["currencyRates"][ some]["currency"] == 'EUR': eur = str( result["currencyRatesByDay"][0]["currencyRates"] [some]["buyRate"]) + "/" + str( result["currencyRatesByDay"][0] ["currencyRates"][some]["sellRate"]) elif result["currencyRatesByDay"][0]["currencyRates"][ some]["currency"] == 'GBP': gbp = str( result["currencyRatesByDay"][0]["currencyRates"] [some]["buyRate"]) + "/" + str( result["currencyRatesByDay"][0] ["currencyRates"][some]["sellRate"]) elif result["currencyRatesByDay"][0]["currencyRates"][ some]["currency"] == 'RUB': rub = str( result["currencyRatesByDay"][0]["currencyRates"] [some]["buyRate"]) + "/" + str( result["currencyRatesByDay"][0] ["currencyRates"][some]["sellRate"]) elif result["currencyRatesByDay"][0]["currencyRates"][ some]["currency"] == 'USD': usd = str( result["currencyRatesByDay"][0]["currencyRates"] [some]["buyRate"]) + "/" + str( result["currencyRatesByDay"][0] ["currencyRates"][some]["sellRate"]) yield ArticleItem( EUR=eur, USD=usd, CNY=cny, RUB=rub, RON=ron, GBP=gbp, CHF=chf, description=date_now, url= "http://www.sberbank.hu/hu/alkalmazasok/arfolyamok.html", title="SBERBANK") else: contact_message = """ Bank SBERBANK:\n ERROR or Weekend """ send_mail("Bank fail", contact_message, self.from_email, [self.to_email]) print( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!! SBERBANK! ________ ERROR or Weekend" ) except: contact_message = """ Bank SBERBANK:\n ERROR or Weekend """ send_mail("Bank fail", contact_message, self.from_email, [self.to_email]) print( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!! UNICREDIT! ________ ERROR or Weekend" )
def start_requests(self):
    for url in self.start_urls:
        yield Request(url=url, callback=self.parse)
    for url in self.ipod_urls:
        yield Request(url=url, callback=self.parse_ipods)
def get_media_requests(self, item, info):
    for image_url in item['image_urls']:
        yield Request(image_url)
def start_requests(self):
    for url in self.products:
        yield Request(url)
def load_listpage(self, response):
    yield Request(url=self.list_url, callback=self.process_list, dont_filter=True)
def parse(self, response): self.logger.info("===========parse=============") responseUrl = response._url category = response.meta['category'] newsTitle = response.meta['newsTitle'] newsCover = response.meta['newsCover'] # id =re.compile('\d+') # idResult = id.match(response._url) # commentId =re.findall("\d+",responseUrl)[0] # commentUrl = self.commentBaseUrl+commentId # self.allCommentUrlList.append({newsTitle:commentUrl}) # commentDict = self.allCommentUrlList.pop(0) # print(commentDict) # for title,url in commentDict.items(): # yield Request(url, callback=self.comment_url_callback, meta={'title': title}, dont_filter=True) try: # commentId = re.findall("\d+", responseUrl)[0] # commentUrl = self.commentBaseUrl + commentId # self.allCommentUrlList.append({newsTitle: commentUrl}) keywords = response.xpath( "//head/meta[@name='Keywords']/@content").extract_first() description = response.xpath( "//head/meta[@name='Description']/@content").extract_first() print('Keywords:' + keywords + ',' + 'Description:' + description) newsContents = response.xpath("//div[@class='newscontent']") # newsTitle = newsContents.xpath("./h1[@class='news_title']/text()").extract_first() newsAuthor = newsContents.xpath( "./div[@class='news_about']/p[1]/text()").extract_first() newsDate = newsContents.xpath( "./div[@class='news_about']/p[2]/text()").extract_first() # newsCover = newsContents.xpath(".//img/@src").extract_first() newsContentList = newsContents.xpath( "./div[@class='news_txt']/text()").extract() newsContent = "".join(newsContentList) print(newsContent) news_love = newsContents.xpath( "./div[@class='news_love']//a[@class='zan']/text()" ).extract_first() item = ThepaperspiderItem() item["title"] = newsTitle item["author"] = newsAuthor item["datetime"] = newsDate item["newsCover"] = newsCover item["newsContent"] = newsContent item["keywords"] = keywords item["description"] = description item["collectedCount"] = news_love item["category"] = category item["story_id"] = '1' item["comefrom"] = 'news' print(item) yield item imgItem = ImgItem() imgItem['image_urls'] = {newsTitle: newsCover} imgItem['comefrom'] = 'imgs' print(imgItem) yield (imgItem) commentId = re.findall("\d+", responseUrl)[0] commentUrl = self.commentBaseUrl + commentId self.allCommentUrlList.append({newsTitle: commentUrl}) if len(self.allCommentUrlList) != 0: commentDict = self.allCommentUrlList.pop(0) print(commentDict) for title, url in commentDict.items(): yield Request(url, callback=self.comment_url_callback, meta={'title': title}, dont_filter=True) except: print("糟糕,出现exception") pass if len(self.allNewsUrlList) != 0: urlDict = self.allNewsUrlList.pop(0) newsInfo = [] for key in urlDict: newsInfo.append(key) category = newsInfo[0] newsTitle = newsInfo[1] newsUrl = urlDict[category] newsCover = urlDict[newsTitle] yield Request(newsUrl, callback=self.parse, meta={ 'category': category, 'newsTitle': newsTitle, 'newsCover': newsCover }, dont_filter=True)
def parse(self, response):
    # Check the url at the start of parse to catch links that were potentially redirected.
    orig_domain = response.url
    if "orig_domain" in response.meta:
        orig_domain = response.meta["orig_domain"]
    else:
        response.meta["orig_domain"] = orig_domain
    if not self.validate_link(response.url, orig_domain):
        return
    self._logger.debug("starting parse on url {}".format(response.request.url))
    cur_depth = 0
    if 'curdepth' in response.meta:
        cur_depth = response.meta['curdepth']
    else:
        response.meta['curdepth'] = cur_depth
    self._logger.debug("Forming response object")
    # capture raw response
    item = RawResponseItem()
    # populated from response.meta
    item['appid'] = response.meta['appid']
    item['crawlid'] = response.meta['crawlid']
    item['attrs'] = response.meta['attrs']
    # populated from raw HTTP response
    item["url"] = response.request.url
    item["response_url"] = response.url
    item["status_code"] = response.status
    item["status_msg"] = "OK"
    item["response_headers"] = self.reconstruct_headers(response)
    item["request_headers"] = response.request.headers
    item["links"] = []
    item["curdepth"] = str(cur_depth)
    is_pdf = False
    url = response.url.lower()
    if (url[len(url) - 4:] == '.pdf') or ('.pdf?' in url):
        is_pdf = True
    item["is_pdf"] = str(is_pdf)
    if is_pdf:
        self._logger.debug("Handling pdf file")
        self.download_file(response.url)
        item["body"] = self.pdfparser("temp_document.pdf")
    else:
        item["body"] = self.gather_text(response.body)
    self._logger.debug("Current depth: " + str(cur_depth))
    # determine whether to continue spidering
    if cur_depth >= response.meta['maxdepth']:
        self._logger.debug("Not spidering links in '{}' because"
                           " cur_depth={} >= maxdepth={}".format(
                               response.url, cur_depth, response.meta['maxdepth']))
    else:
        # we are spidering -- yield a Request for each discovered link
        link_extractor = LinkExtractor(
            allow_domains=response.meta['allowed_domains'],
            allow=response.meta['allow_regex'],
            deny=response.meta['deny_regex'],
            deny_extensions=response.meta['deny_extensions'])
        for link in link_extractor.extract_links(response):
            # link that was discovered
            the_url = link.url
            the_url = the_url.replace('\n', '')
            if not self.validate_link(the_url, orig_domain):
                continue
            item["links"].append(str({
                "url": the_url,
                "text": link.text,
            }))
            req = Request(the_url, callback=self.parse)
            req.meta['priority'] = response.meta['priority'] - 10
            req.meta['curdepth'] = response.meta['curdepth'] + 1
            if 'useragent' in response.meta and \
                    response.meta['useragent'] is not None:
                req.headers['User-Agent'] = response.meta['useragent']
            self._logger.debug("Trying to follow link '{}'".format(req.url))
            yield req
    # raw response has been processed, yield to the item pipeline
    yield item
def parse_cat_0(self, response): metadata = response.meta['userdata'] sel = Selector(response) # MINI-BAG temp = sel.xpath( '//article[contains(@class,"sliding-backgrounds")]//a[@href and contains(@class,"background")]' ) if temp: return Request(url=self.process_href(temp[0]._root.attrib['href'], response.url), callback=self.parse_list, meta={'userdata': metadata}, errback=self.onerr) node = None temp = sel.xpath( '//div[@class="menu"]/ul[@class="collections"]/li[contains(@class,"collection")]/' 'div[contains(@class,"name")]/a[@href]') if temp: for temp1 in temp: if self.process_href(temp1._root.attrib['href'], response.url) == response.url: node = temp1 break if not node: return None ret = [] for node1 in node.xpath( '../../ul[contains(@class,"departments")]/li[contains(@class,"department")]/div/a[@href]' ): m1 = copy.deepcopy(metadata) href = node1._root.attrib['href'] mt = re.search('/([^/]+)$', href) if mt: tag_name = unicodify(mt.group(1)).lower() tag_text = unicodify( node1._root.text).lower() if node1._root.text else tag_name m1['tags_mapping']['category-1'] = [{ 'name': tag_name, 'title': tag_text }] # 是否有子分类级别 for node2 in node1.xpath( '../../ul[contains(@class,"categories")]/li[contains(@class,"category")]//a[@href]' ): m2 = copy.deepcopy(m1) href = node2._root.attrib['href'] mt = re.search('/([^/]+)$', href) if mt: tag_name = unicodify(mt.group(1)) tag_text = unicodify( node2._root.text) if node2._root.text else tag_name m2['tags_mapping']['category-2'] = [{ 'name': tag_name, 'title': tag_text }] ret.append( Request(url=self.process_href(href, response.url), meta={'userdata': m2}, callback=self.parse_list, errback=self.onerr)) return ret
def parse(self, response): zk.start() zode_path = zk.create("/pid/taobao/node-", ephemeral=True, sequence=True) myid = zode_path[-10:] mytask_dir = task_dir + "node-" + myid try: zk.create('/task/taobao') Master = True except: Master = False if Master == True: zk.create(mytask_dir) sleep(3) themes = response.xpath( '//ul[@class="service-bd"]/li/span/a/@href').extract() nodes = len(zk.get_children("/pid/taobao")) real_nodes = zk.get_children("/task/taobao") print "realnodes" + str(real_nodes) while nodes != len(real_nodes): real_nodes = zk.get_children("/task/taobao") nodes = len(zk.get_children("/pid/taobao")) sleep(0.01) peer_tasks = len(themes) / nodes print "master is " + str(os.getpid()) i = 0 while i < nodes: j = 0 while j < peer_tasks: msg = '[{ "url":"' + str( themes[i * peer_tasks + j]) + '", "level":"2", "content":"0"}]' zk.create(task_dir + real_nodes[i] + "/task-", value=msg, sequence=True) j += 1 i += 1 else: zk.create(mytask_dir) print "sleep" while True: global work_co try: tasks = zk.get_children(mytask_dir) except Exception, e: print "get_children %s" % e while len(tasks) == 0: sleep(1) tasks = zk.get_children(mytask_dir) obj_tasks = mytask_dir + '/' + tasks[random.randint( 0, len(tasks) - 1)] working_set.add(obj_tasks) mytask_data, mytask_stat = zk.get(obj_tasks) task = json.loads(mytask_data) if task[0]['level'] == '2': temp = task[0]['url'].split(':') work_co += 1 yield Request(url='http:' + temp[len(temp) - 1], meta={ "task": obj_tasks, "task_dir": mytask_dir }, callback=self.classification) if task[0]['level'] == '3': temp = task[0]['url'] work_co += 2 yield Request(url=temp, meta={ "task": obj_tasks, "task_dir": mytask_dir }, callback=self.pageturning) if task[0]['level'] == '4': temp = task[0]['url'] work_co += 4 yield Request(url=temp, meta={ "task": obj_tasks, "task_dir": mytask_dir }, callback=self.goods)
def parse(self, response):
    models = response.xpath('.//a[@class="view-all"]/@href').extract()
    print(len(models))
    for model in models:
        yield Request(model, callback=self.parse_model, dont_filter=True)
def start_requests(self):
    yield Request(urls.start, method='POST')
def _extract_requests(self, response):
    r = []
    if isinstance(response, HtmlResponse):
        links = self.link_extractor.extract_links(response)
        r.extend(Request(x.url, callback=self.parse) for x in links)
    return r
def parse_cars(self, response):
    cars = response.xpath('//a[@class="readmore"]/@href').extract()
    for car in cars:
        yield Request(car, callback=self.parse_data, dont_filter=True)
def check_login(self, response):
    print(response.text)
    yield Request(url="http://dig.chouti.com/", callback=self.good)
def start_requests(self):
    return [Request(self.url, callback=self.parse, dont_filter=True)]
def parse_detail_item(self, response, **kwargs): info_type = kwargs.get('type') if info_type == 'ershoufang': item = ErShouFangSourceItem() item['city'] = kwargs.get('city') item['area'] = kwargs.get('area') item['business_circle'] = kwargs.get('bankuai_name') item['village_name'] = response.css( '.communityName a.info::text').get(default='') lis = response.css('.base .content ul li') for li in lis: span_text = li.css('span::text').get(default='') if span_text == '房屋户型': item['residence_room'] = li.css('::text').getall()[1] elif span_text == '所在楼层': item['floor'] = li.css('::text').getall()[1] elif span_text == '建筑面积': item['area1'] = li.css('::text').getall()[1].replace( '㎡', '') elif span_text == '套内面积': item['area2'] = li.css('::text').getall()[1].replace( '㎡', '') elif span_text == '房屋朝向': item['orientation'] = li.css('::text').getall()[1] elif span_text == '挂牌时间': item['listing_time'] = li.css( '::text').getall()[1].replace('\n', '') check_field_list = [ 'residence_room', 'area1', 'area2', 'floor', 'orientation', 'listing_time' ] for check_field in check_field_list: if check_field not in item.keys(): item[check_field] = '' all_price = float( response.css('.price .total::text').get(default=0)) single_price = float( response.css('.price .unitPriceValue::text').get(default=0)) item['all_price'] = all_price item['single_price'] = single_price item['build_time'] = response.css('.area .subInfo::text').get( default='') item['link'] = response.url yield Request(url=kwargs.get('village_url'), callback=self.parse_village_info, cb_kwargs={ 'item': item, 'village_url': kwargs.get('village_url') }, dont_filter=True) elif info_type == 'zufang': item = RentingHouseSourceItem() item['city'] = kwargs.get('city') item['area'] = kwargs.get('area') item['business_circle'] = kwargs.get('bankuai_name') item['village_name'], item['residence_room'], item[ 'orientation'] = response.css('.content__title::text').get( default='').split(' ') # item['orientation'] = response.css('div.content__article__info:nth-child(2) > ul:nth-child(2) > li:nth-child(3)::text').get(default='').replace('朝向:', '') item['floor'] = response.css( 'div.content__article__info:nth-child(2) > ul:nth-child(2) > li:nth-child(8)::text' ).get(default='').replace('楼层:', '') item['house_area'] = response.css( 'div.content__article__info:nth-child(2) > ul:nth-child(2) > li:nth-child(2)::text' ).get(default='').replace('面积:', '').replace('㎡', '') item['price'] = response.css( 'div.content__aside--title span::text').get(default='') item['renting_time'] = response.css( 'div.content__article__info:nth-child(2) > ul:nth-child(3) > li:nth-child(2)::text' ).get(default='').replace('租期:', '') item['link'] = response.url yield item
def get_media_requests(self, item, info):
    urls = ItemAdapter(item).get(self.files_urls_field, [])
    return [Request(u) for u in urls]
def _prepare_request_object(item_url):
    return Request(
        item_url,
        meta={'response': Response(item_url, status=200, body=b'data')})
def parse(self, response): ''' zk.start() zode_path = zk.create("/pid/huanqiunews/node-" , ephemeral = True, sequence = True) myid = zode_path[-10 : ] mytask_dir = task_dir + "node-" + myid print "hello" if zk.exists("/task/huanqiunews") == None: zk.create('/task/huanqiunews') zk.create(mytask_dir) sleep(3) nodes = len(zk.get_children("/pid/huanqiunews")) themes = response.xpath('//a[@class="cate_menu_lk"]/@href').extract() real_nodes = zk.get_children("/task/huanqiunews") while nodes != len(real_nodes): real_nodes = zk.get_children("/task/huanqiunews") sleep(0.01) peer_tasks = len(themes) / nodes #tot do: chu bu jun yun ru he cao zuo ?? i = 0 while i < nodes: j = 0 while j < peer_tasks: try: url = "http:" + theme[i*peer_tasks + j] msg = '[{"motian":"0", "url":"' + url+ '", "level":"2", "content":"0"}]' zk.create("/task/huanqiunews/" + real_nodes[i] + "/task-", value = msg, sequence = True) except Exception,e: print "%s" % e j += 1 i += 1 else: zk.create(mytask_dir) work_co = 0 while True: if work_co > 10: sleep(10) try: tasks = zk.get_children(mytask_dir) except Exception,e: print "get_children %s" % e while len(tasks) == 0: sleep(1) tasks = zk.get_children(mytask_dir) obj_tasks = mytask_dir + '/' + tasks[random.randint(0, len(tasks) - 1)] mytask_data, mytask_stat = zk.get(obj_tasks) task = json.loads(mytask_data) if task[0]['level'] == '2': url = task[0]['url'] print "url-->" + url yield Request(url=url,callback=self.classification) work_co += 1 ''' themes = response.xpath('//a[@class="cate_menu_lk"]/@href').extract() #for theme in themes: #url = "http:" + theme #yield Request(url=url, callback=self.classification) yield Request(url='http:'+themes[0], callback=self.classification)
def parse(self, response):
    # Build the page url for each section
    url = self.start_urls[0]
    yield Request(url, self.parse_news, meta={'url': self.url})
def parse_village_trend(self, response: Response, **kwargs):
    url = re.search(r"analysis.init\('(.*?)'\)", response.text).group(1)
    yield Request(url='https://bj.ke.com' + url,
                  callback=self.yield_city_item,
                  cb_kwargs=kwargs)
def parse_application(self, response):
    app_url = response.xpath(
        '//*[@class="glyphicon glyphicon-inbox btn-lg"]/following-sibling::a/@href').extract_first()
    yield Request(response.urljoin(app_url), callback=self.parse_form)
def parse_news(self, response):
    # Collect the news urls from every page
    news_urls = response.xpath('//*[@id="leftList"]/div[2]/dl/dd/ul/li/a/@href').extract()
    for i in range(len(news_urls)):
        news_url = response.meta['url'] + news_urls[i]
        yield Request(news_url, self.parse_content, meta={"url": news_url})
def parse(self, response): index = response.meta['index'] url_ = response.meta['url_'] self.Maxpage_List[index] += 1 #爬取页数加1 soup = BeautifulSoup(response.body_as_unicode(), "lxml") #以浏览器的方式解析文档 founds = soup.find('div', class_='searchResultArea').find_all('li') item_list = [] print '------------url---------------' for found in founds: item = items.PostItem() title = found.find('h3').get_text().strip() url = found.find('h3').find('a').get('href') # print title print url print index m = md5.new() m.update(url) md_str = m.hexdigest() #MD5算法编码获得ID号 post_time = found.find_all('p')[1].find_all( 'span')[3].get_text().strip() post_time = re.findall(self.tt_pa, post_time)[0] #正则匹配得到时间 post_time = post_time[6:] + '-' + post_time[3:5] + '-' + post_time[ 0:2] + ' ' + '00:00:00' print post_time item['url'] = url item['id'] = md_str item['post_time'] = post_time item['data_type'] = settings.DATA_TYPE #政府类都是1 item['site_id'] = settings.SITE_ID[self.name] item['topic_id'] = index item['scratch_time'] = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime()) #time.strftime()可以用来获得当前时间,可以将时间格式化为字符串等等 item['title'] = title item['poster_name'] = '' item['poster_id'] = '' item['poster_url'] = '' item_list.append(item) print '-----------------------------------' res_items = self.sqldb.get_newest_time( item_list) #判断这个链接是不是比上次爬取的要新,如果是就爬取 #调用parse_content解析每篇文章内容 for item in res_items: if '.htm' in item['url']: self.sum += 1 print self.sum print '-----------------add new urls to Request-------------------' yield Request(item['url'], callback=self.parse_content, meta={'item': item}) # else: # url = item['url'] # if '.xls' in url or '.xlsx' in url or '.doc' in url or '.docx' in url or '.pdf' in url or '.txt' in url: # name = url.split('/') # filename = name[len(name)-1] # dir = 'D:\\Workspace\\Python\\Scrapy\\file\\' + self.name # if os.path.exists(dir): # print 'dir is existing...' # else: # os.makedirs(dir) # dir = dir + '\\' + item['id'] + '_' + str(item['topic_id']) # if os.path.exists(dir): # print 'filepath is existing...' # else: # os.makedirs(dir) # filepath = os.path.join(dir, filename) # if os.path.exists(filepath): # print 'already down...' # else: # print "-----------------downloading with requests-------------------" # r = requests.get(url) # with open(filepath, "wb") as code: # code.write(r.content) # item['content'] = filepath # yield Request(item['url'], callback = self.parse_fileurl, meta = {'item':item}) next_pages = soup.find('div', class_='advancedIndex').find_all('span') try: next_page = next_pages[len(next_pages) - 1].a['href'] next_page = 'http://lp.search.gov.hk/search.html' + next_page yield scrapy.Request(next_page, meta={ 'index': index, 'url_': next_page }) except: print 'last page-----'
def parse(self, response):
    for i in range(2, 57):
        url = 'https://veromoda.tmall.com/category.htm?&pageNo={0}'.format(i)
        # iteration / generation
        # Note: the callback is passed here without parentheses
        yield Request(url=url, callback=self.parse_detail)
def start_requests(self):
    price_urls = mongoservice.get_dealerprice_url()
    for url in price_urls:
        yield Request(self.api_url % url, callback=self.get_url)
def test_proxy_already_seted(self):
    os.environ['http_proxy'] = http_proxy = 'https://proxy.for.http:3128'
    mw = HttpProxyMiddleware()
    req = Request('http://noproxy.com', meta={'proxy': None})
    assert mw.process_request(req, spider) is None
    assert 'proxy' in req.meta and req.meta['proxy'] is None