def form_part_request(url, callback, part):
    request = Request(url=url, callback=callback)
    request.meta['part'] = part
    return request

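# A minimal usage sketch for form_part_request above, assuming a Scrapy spider
# context; the spider name, URL, and part numbering here are hypothetical.
from scrapy import Spider

class PartSpiderSketch(Spider):
    name = 'part_sketch'

    def start_requests(self):
        for part in range(3):
            # Each request carries its part number in request.meta.
            yield form_part_request('http://example.com/data?part=%d' % part,
                                    self.parse_part, part)

    def parse_part(self, response):
        # The part number set by form_part_request travels with the response.
        part = response.meta['part']
        self.logger.info('got part %s from %s', part, response.url)
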
def parse(self, response):
    # Throttle the paging requests: sleep 25 seconds every 2 pages.
    self.pageNo += 1
    if self.pageNo % 2 == 0:
        time.sleep(25)
    select = Selector(response)
    if "shopDetail" not in response.meta:  # shop list page
        allNo = self.questionIdPatten.findall(response.url)
        cityId = allNo[0]
        pageNumber = allNo[-1]  # current page number
        self.fw.write("%s cityId:%s, pageNumber:%s\n" % (response.url, cityId, pageNumber))
        self.fw.flush()
        item = DianpingItem()
        item["city_id"] = cityId
        try:
            cityName = select.css(".city").xpath("./text()").extract()[0]
        except Exception as e:
            cityName = ""
            print(e)
        yieldPageFlag = False
        shop_list = select.xpath(".//div[@class='info']")
        for li in shop_list:
            yieldPageFlag = True
            item["shop_name"] = li.xpath(".//p[@class='title']/a/text()").extract()[0]
            item["shop_cityname"] = cityName
            # 'domain' is used as a tag here, not as a district.
            item["shop_domain"] = ",".join(li.xpath(
                ".//p[@class='area-key']/span[@class='area-list']/a/text()").extract())
            key_list = ",".join(li.xpath(
                ".//p[@class='area-key']/span[@class='key-list']/a/text()").extract())
            item["shop_tag"] = ",".join([key_list, item["shop_domain"]])  # tags include the district
            # href looks like '/shop/123456'
            href = li.xpath(".//p[@class='title']/a[@class='shopname']/@href").extract()[0]
            item["shop_id"] = href.split("/")[-1]
            shopUrl = "http://www.dianping.com" + href
            request = Request(shopUrl, callback=self.parse, priority=1234567)  # shop detail request
            request.meta["shopDetail"] = copy.deepcopy(item)
            yield request
        if yieldPageFlag:
            # The current page had data, so request the next page.
            nextPageNumber = int(pageNumber) + 1
            url = self.pageUrl % (cityId, nextPageNumber)
            request = Request(url, callback=self.parse, priority=1234)
            yield request

def parse_hiker_info(self, response):
    # TODO: Somehow obtain the Hiker's direction 'dir'.
    # TODO: Somehow obtain the Hiker's trail start date 'start_date'
    # TODO: Somehow obtain the Hiker's trail estimated end date 'end_date'
    print("Response received: %s" % response)
    print("Parsing Hiker Info from response: %s" % response)
    hiker = HikerItem()
    hiker['id'] = self.extract_hiker_id(response=response)
    hiker_name_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[2]/td//font[2]"
    hiker_name = Selector(response=response).xpath(hiker_name_xpath).extract()[0]
    hiker_name_start = hiker_name.find("-")
    hiker_name_end = hiker_name.find("<", hiker_name_start)
    hiker_name = hiker_name[hiker_name_start + 1:hiker_name_end].strip(" ")
    hiker['name'] = hiker_name
    hiker_trail_name_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[2]/td//font[1]/b"
    hiker_trail_name = Selector(response=response).xpath(hiker_trail_name_xpath).extract()[0]
    hiker_trail_name_start = hiker_trail_name.find(">")
    hiker_trail_name_end = hiker_trail_name.find("<", hiker_trail_name_start)
    hiker['trail_name'] = hiker_trail_name[hiker_trail_name_start + 1:hiker_trail_name_end]
    hiker['about_url'] = response.url
    # TODO: Verify that the 'journal_url' is the FIRST journal entry.
    hiker['journal_url'] = response.url.replace("about", "entry")
    journal_parse_request = Request(hiker['journal_url'], callback=self.parse_hiker_journal)
    journal_parse_request.meta['hiker'] = hiker
    yield journal_parse_request

def test_hs_middlewares_retry(hs_downloader_middleware, hs_spider_middleware):
    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)
    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)
    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    request_1 = request_0.copy()
    response_1 = Response(url)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 0
    assert request_1.meta[HS_PARENT_ID_KEY] is None

    hs_downloader_middleware.process_request(request_1, spider)
    assert HS_REQUEST_ID_KEY not in request_1.meta
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    hs_downloader_middleware.process_response(request_1, response_1, spider)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

def parse_data(self, response):
    rows = response.selector.xpath(
        '//*[@id="container-outer"]/div[1]/div[3]/div/div/div[2]/table/tbody/tr')
    for row in rows:
        if row.xpath('td/p'):
            paragraphs = row.xpath('td/p').extract()
            count = 0
            for _ in paragraphs:
                item = SpiderItem()
                url_tem = row.xpath('td/p/a/@href').extract()
                item['url'] = urljoin(response.url, url_tem[count])
                item['publishdate'] = row.xpath('td/div/a/@title').extract()
                time_temp = row.xpath('td/p[' + str(count + 1) + ']/text()[2]').extract()
                item['publishtime'] = process_string(time_temp[0].strip().split('[')[0])
                item['Source'] = "[House Committee on Appropriations - Subcommittee on Interior and Environment]"
                item['_type'] = "[Hearings and Markups]"
                item['ekwhere'] = "[Fed]"
                link = 'http://docs.house.gov/Committee/Calendar/' + url_tem[count]
                request = Request(link, callback=self.grab_title)
                request.meta['item'] = item
                yield request
                count += 1

def parse(self, response):
    """Fetch the shop detail pages for a plaza."""
    req = []
    plazaId = response.url.split('/')[-1]
    sel = Selector(response)
    tmplist = ['10']
    plazaShop_list = sel.xpath('//*[@class="mod-title"]/a')
    for plazaShops in plazaShop_list:
        link_text = plazaShops.xpath('text()').extract()[0]
        if link_text == u'更多店铺':  # the "more shops" link
            for category in tmplist:
                if category == '10':
                    shopsurl = 'http://www.dianping.com' + plazaShops.xpath(
                        '@href').extract()[0].replace('20_', '10_').strip()
                    shopCatetory1 = u'餐饮'  # food & drink
                if category == '20':
                    shopsurl = 'http://www.dianping.com' + plazaShops.xpath(
                        '@href').extract()[0].strip()
                    shopCatetory1 = u'购物'  # shopping
                shopStreet = plazaShops.xpath('@href').extract()[0].split('/')[-2].replace('20_', '').strip()
                item = PlazaShop()
                item['plazaId'] = plazaId
                item['shopStreet'] = shopStreet
                item['shopCatetory1'] = shopCatetory1
                item['shopUrl'] = shopsurl
                r = Request(shopsurl, callback=self.shop_next_page)
                r.meta['item'] = item
                req.append(r)
    return req

def parse(self, response): """Crawl article index pages. From the index page, for each article extract it's topic first because in this old version, there is no text information about the topic in the article page. On index pages, it's contained in the alt attribute of article/topic image, but alt is empty on the article page. After that, follow the "Read more" link and get the other article fields. """ for i, a in enumerate(response.xpath( "//div[@class='articletrailer']/descendant::a[@class='trailer'][1]/@href")): article = Article() # If the image is not the default topic image, it will not have # an appropriate selector, so we use it's div. article["category"] = response.xpath( "//div[@class='articleheading']/descendant::img/@alt").extract()[i] article_url = response.urljoin(a.extract()) request = Request(article_url, callback=self.parse_article) request.meta["article"] = article yield request
def parse_job_list_page(self, response):
    self.get_connector().log(self.name, self.ACTION_CRAWL_LIST, response.url)
    feed_parser = feedparser.parse(response.body)
    for job_entry in feed_parser.entries:
        job_url = job_entry.link
        job_publication_date = datetime.fromtimestamp(mktime(job_entry.published_parsed))
        job_publication_time = mktime(job_publication_date.timetuple())
        last_job_publication_time = mktime(self._last_job_date.timetuple())
        if job_publication_time <= last_job_publication_time:
            self.get_connector().log(
                self.name, self.ACTION_MARKER_FOUND,
                "%s <= %s" % (job_publication_time, last_job_publication_time))
            return
        prepared_job = JobItem()
        request = Request(job_url, self.parse_job_page)
        request.meta['item'] = prepared_job
        prepared_job['title'] = job_entry.title
        prepared_job['description'] = job_entry.description
        prepared_job['publication_datetime'] = job_publication_date
        yield request

def parse_index(self, response):
    """Parse an index page and return a list of Requests for the pages to crawl."""
    conf = response.meta['conf']
    requests = []
    page_list = self._get_result(response, conf)
    # If the index has no content, return an empty list.
    if not page_list:
        return requests
    next_page = True  # whether the index needs paging
    # Check each entry extracted from the index.
    for item in page_list:
        if isinstance(item, Request):  # a new Request was returned
            requests.append(item)
            next_page = False
            break
        if item['publish_time']:
            if item['publish_time'] <= self.from_time:
                # The page was published before self.from_time.
                next_page = False
                break
        req = Request(item['crawl_url'], self.parse_page)
        # Pass along the information extracted so far.
        req.meta["item"] = item
        requests.append(req)
    # If paging is needed, add a Request for the next page; otherwise stop.
    if next_page:
        requests.append(Request(self._next_result_page(response),
                                callback=self.parse_index,
                                meta={'conf': conf}))
    return requests

def parse_depute(self, response):
    depute = json.loads(response.body_as_unicode())
    if 'depute' in depute:
        depute = depute['depute']
    depute['photo_url'] = self.photo_url % depute['slug']
    req = None
    for ad in depute['adresses']:
        adresse = ad['adresse']
        pattern = r'Télé(phone|copie)\s*:\s*(\d[0-9 ]+\d)'
        for telm in re.finditer(pattern, adresse):
            if telm.group(1) == 'phone':
                ad['tel'] = telm.group(2)
            else:
                ad['fax'] = telm.group(2)
        lad = adresse.lower()
        # Geocode the first address that is not the Assemblée nationale itself.
        if not req and not lad.startswith(u'assemblée nationale'):
            req = Request(url=self.get_geocode_url(adresse),
                          callback=self.parse_geocode)
            req.meta['depute'] = depute
            req.meta['adresse'] = ad
    if req is not None:
        yield req
    else:
        yield depute

def parse(self, response):
    """Fetch the shop detail pages for a plaza."""
    req = []
    plazaId = response.url.split('/')[-1]
    sel = Selector(response)
    gouwu = sel.xpath('//*[@class="hot-top fn-clear"]/div')
    for i, gouwushop in enumerate(gouwu, start=1):
        shopsurl = 'http://www.dianping.com' + gouwushop.xpath('a[1]/@href').extract()[0].strip()
        shopImg = gouwushop.xpath('a[1]/img/@src').extract()
        item = PlazaShop()
        item['plazaId'] = plazaId
        # The first four entries are shopping; the rest are food & drink.
        if i <= 4:
            item['shopCatetory1'] = u'购物'  # shopping
        else:
            item['shopCatetory1'] = u'餐饮'  # food & drink
        item['shopUrl'] = shopsurl
        item['image_urls'] = shopImg
        r = Request(shopsurl, callback=self.shop_detail)
        r.meta['item'] = item
        req.append(r)
    return req

def start_requests(self):
    for book in self.mongo.books_collection().find():
        readers_url = u"https://www.livelib.ru/book/{}/readers/read".format(book["lib_id"])
        self.logger.info(u"Queuing page: {}".format(readers_url))
        readers_request = Request(readers_url, callback=self.parse)
        readers_request.meta["book_lib_id"] = book["lib_id"]
        yield readers_request

def start_requests(self):
    kwargs = {
        'debug': self.settings.getbool('GIANT_DEBUG'),
        'limit': self.settings.getint('GIANT_LIMIT'),
        'opt': 'otc'
    }
    requests = []
    for stockid in OtcIdDBHandler().stock.get_ids(**kwargs):
        for mon in range(2, -1, -1):
            timestamp = datetime.utcnow() - relativedelta(months=mon)
            if mon == 0:
                if timestamp.day == 1 and timestamp.hour <= 14:
                    continue
            URL = (
                'http://www.gretai.org.tw/ch/stock/aftertrading/' +
                'daily_trading_info/st43_download.php?d=%(year)d/%(mon)02d&' +
                'stkno=%(stock)s') % {
                    'year': timestamp.year - 1911,  # ROC calendar year
                    'mon': timestamp.month,
                    'stock': stockid
                }
            request = Request(URL, callback=self.parse, dont_filter=True)
            item = OtcHisStockItem()
            item['stockid'] = stockid
            request.meta['item'] = item
            requests.append(request)
    return requests

def parse(self, response):
    """Fetch the category pages."""
    req = []
    body = response.body.decode('gbk').replace('getCategoryCallback(', '')[:-1]
    s = json.loads(body)
    datas = s["data"]
    # Categories to skip: lottery, books, finance, car brands/models, telecom plans.
    skip_first = (u'彩票', u'图书', u'理财')
    skip_second = (u'汽车品牌', u'汽车车型', u'京东通信')
    skip_third = (u'选号码', u'装宽带', u'中国移动', u'中国联通', u'中国电信')
    for data in datas:
        for first_list in data["s"]:
            first = first_list["n"]
            for second_list in first_list["s"]:
                second = second_list["n"]
                for third_list in second_list["s"]:
                    third = third_list["n"]
                    if (first.split('|')[1] not in skip_first
                            and second.split('|')[1] not in skip_second
                            and third.split('|')[1] not in skip_third):
                        item = JDItem()
                        item['first'] = first.split('|')[1]
                        item['second'] = second.split('|')[1]
                        item['third'] = third.split('|')[1]
                        cat = third.split('|')[0]
                        url = 'http://list.jd.com/list.html?cat=' + cat.replace('-', ',')
                        if cat[:4] == 'list':
                            url = 'http://' + cat
                        r = Request(url, callback=self.parse_brand)
                        r.meta['item'] = item
                        req.append(r)
    return req

def parse(self, response): """Parse a APS record into a HEP record. Attempts to parse an XML JATS full text first, if available, and falls back to parsing JSON if such is not available. """ aps_response = json.loads(response.body_as_unicode()) for article in aps_response['data']: doi = get_value(article, 'identifiers.doi', default='') if doi: request = Request(url='{}/{}'.format(self.aps_base_url, doi), headers={'Accept': 'text/xml'}, callback=self._parse_jats, errback=self._parse_json_on_failure) request.meta['json_article'] = article request.meta['original_response'] = response yield request # Pagination support. Will yield until no more "next" pages are found if 'Link' in response.headers: links = link_header.parse(response.headers['Link']) next = links.links_by_attr_pairs([('rel', 'next')]) if next: next_url = next[0].href yield Request(next_url)
def parse_job_list_page(self, response): """ Pasring of job list """ self.get_connector().log(self.name, self.ACTION_CRAWL_LIST, response.url) try: for jobs in self._get_from_list__jobs_lists(response): for job in self._get_from_list__jobs(jobs): # first we check url. If the job exists, then skip crawling # (it means that the page has already been crawled try: url = self._get_from_list__url(job) except NotCrawlable: break if self.get_connector().job_exist(url): self.get_connector().log(self.name, self.ACTION_MARKER_FOUND, url) raise StopCrawlJobList() request = Request(url, self.parse_job_page) prefilled_job_item = self._get_prefilled_job_item(job, url) request.meta['item'] = prefilled_job_item if self.is_from_page_enabled(): yield request else: yield prefilled_job_item next_page_url = self._get_from_list__next_page(response) if next_page_url: yield Request(url=next_page_url) except NotFound, exc: self.get_connector().log(self.name, self.ACTION_CRAWL_ERROR, str(exc))
def start_requests(self):
    kwargs = {
        'debug': self.settings.getbool('GIANT_DEBUG'),
        'limit': self.settings.getint('GIANT_LIMIT'),
        'opt': 'twse'
    }
    requests = []
    for stockid in TwseIdDBHandler().stock.get_ids(**kwargs):
        for mon in range(4, -1, -1):
            timestamp = datetime.utcnow() - relativedelta(months=mon)
            if mon == 0:
                if timestamp.day == 1 and timestamp.hour <= 14:
                    continue
            URL = (
                'http://www.twse.com.tw/ch/trading/exchange/' +
                'STOCK_DAY/STOCK_DAY_print.php?genpage=genpage/' +
                'Report%(year)d%(mon)02d/%(year)d%(mon)02d_F3_1_8_%(stock)s.php' +
                '&type=csv') % {
                    'year': timestamp.year,
                    'mon': timestamp.month,
                    'stock': stockid
                }
            request = Request(URL, callback=self.parse, dont_filter=True)
            item = TwseHisStockItem()
            item['stockid'] = stockid
            request.meta['item'] = item
            requests.append(request)
    return requests

def parse(self, response): """ response.body is a result of render.html call; it contains HTML processed by a browser. here we parse the html :param response: :return: request to detail page & request to next page if exists """ page = Selector(response) divs = page.xpath('//div[@class="list-game-app dotline-btn nofloat"]') current_url = response.url # parse details count = 0 for div in divs: if count >= 2: break item = AppstoreItem() info = div.xpath('.//div[@class="game-info whole"]') detail_url = info.xpath('./h4[@class="title"]/a/@href').extract_first() item["url"] = detail_url req = Request(detail_url, callback=self.parse_detail_page) req.meta["item"] = item count += 1 yield req # next page '''
def parse_zs_home(self, response):
    sel = Selector(response)
    item = self.init()
    con = sel.xpath("//meta[@name='keywords']/@content").extract()
    if con:
        item['company_shortname'] = con[0].split(',')[0]
    else:
        item['company_shortname'] = ''
    koubei = sel.xpath("//div[@class='zd_name']/p/a/text()").extract()
    if koubei:
        item['koubei'] = koubei[0]
    else:
        item['koubei'] = ''
    company_id = self.rule_getcompanyid.findall(response.url)[-1]
    url = self.url_company_des % company_id
    item["company_id"] = company_id
    item["service_content"] = response.url
    if len(sel.css(".zgshb_menu")) == 0:
        with open("a.txt", "a") as f:
            f.write(company_id + "\n")
        yield item
    else:
        request = Request(url, callback=self.parse_des)
        request.meta["item"] = copy.deepcopy(item)
        yield request

def parse(self, response):
    for href in response.xpath('//div[contains(@id, "dnn_ctr430_ExbList_pnlList")]//ul//li//a/@href'):
        url = response.urljoin(href.extract())
        request = Request(url, callback=self.parse_exhibition)
        request.meta['dont_redirect'] = True
        yield request

def parse_nextpage(self, response):
    req = []
    sel = Selector(response)
    # Total number of reviews ("人评价" means "people rated").
    next_list = sel.xpath('//*[@class="pull-right "]/text()').extract()
    next_list2 = sel.xpath('//*[@class="pull-right"]/text()').extract()
    total = '0'
    if next_list2:
        total = next_list2[0].replace(u'人评价', '')
    if next_list:
        total = next_list[0].replace(u'人评价', '')
    shopid = sel.xpath('/html/head/link[4]/@href').extract()[0].split('/')[-1].split('.')[0]
    item = response.meta['item']
    tag_list = sel.xpath('//*[@class="tag-category"]/span[1]/text()').extract()
    tag = '|'.join(tag_list)
    # 15 reviews per page.
    for page in range(int(total) // 15 + 1):
        url = 'http://i.meituan.com/deal/' + str(shopid) + '/feedback/page_' + str(page + 1)
        ua = random.choice(self.user_agent_list)
        if ua:
            r = Request(url, callback=self.parse_comments)
            r.headers.setdefault('User-Agent', ua)
            item['url'] = url.strip()
            item['tag'] = tag.strip()
            r.meta['item'] = item
            req.append(r)
    return req

def parse(self, response):
    select = Selector(response)
    if "data" in response.meta:
        isNextPage = response.meta["data"]
        pageNo = self.digitalPattern.findall(response.url)[1]
    else:
        isNextPage = "firstPage"
        pageNo = "0"
    question_id = self.questionIdPatten.findall(response.url)[0]
    question_id = question_id[1:].replace("-", "")
    item = TobatoItem()
    item["question_id"] = question_id
    # Request the answer pages, starting from page 2.
    try:
        logging.info("requesting answer pages starting from page 2")
        total_answers = select.css(".pages").xpath(".//em/b/text()").extract()[0]
        pages = int(total_answers) // 20 + 1
        # Pages are counted from 2.
        for page in range(2, pages + 1):
            requestUrl = self.pageUrl % (question_id, page)
            logging.info(requestUrl + "---------------------------------------")
            request = Request(url=requestUrl, callback=self.parse, priority=123456)
            request.meta["data"] = "true"
            yield request
    except Exception as e:
        print(e)

def parse_booklink(self, response):
    sel = Selector(response)
    # XPath chooses the content of the first <div></div> with class="p-name".
    sites = sel.xpath('(//div[@class="p-name"])[1]')
    req = []
    for site in sites:
        # This is the hyperlink to the details of the book info.
        # XPath chooses the @href content (hyperlink) in <a></a>.
        books = site.xpath('a/@href').extract()
        for b in books:
            # Request the page that shows details of the book, including
            # category info; the scheme-relative href needs a scheme prefix.
            url = "http:" + b
            # The callback is parse_category.
            r = Request(url, callback=self.parse_category, dont_filter=True)
            # The book id is stored as additional data on the request.
            r.meta['bkid'] = response.meta['id']
            req.append(r)
    return req

def parse(self, response): """ """ sel = Selector(response) sites = sel.xpath("//div[@class='tabs-container']//*//article//div[@class='description']") domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(response.url)) rub = u'\u0440\u0443\u0431.' items = [] for site in sites: item = RealtyItem() price = site.xpath(".//section[@class='d-1']//p[@class='price']//span/text()").extract()[0] price = price.replace(rub, '').replace(u' ', '') item['price'] = price item['floor'] = site.xpath(".//section[@class='d-2 params']//p[@class='row floor']//span[@class='value corporate_red']/text()").extract()[0] item['space'] = site.xpath(".//section[@class='d-2 params']//p[@class='row space']//span[@class='value corporate_red']/text()").extract()[0] item['url'] = urljoin(domain, site.xpath(".//p[@class='title-obj']/a/@href").extract()[0]) kitchen = site.xpath(".//section[@class='d-2 params']//p[@class='row kitchen']//span[@class='value corporate_red']/text()").extract() if kitchen: item['kitchen'] = kitchen[0] # item['district'] = request.meta['item'] request = Request(item['url'], callback=self.parse_page) request.meta['item'] = item yield request items.append(item)
def start_requests(self):
    for rating in self.mongo.ratings_collection().find():
        if self.mongo.users_collection().find({"user_lib_id": rating["user_lib_id"]}).count() == 0:
            user_url = u"https://www.livelib.ru/reader/{0}".format(rating["user_lib_id"])
            self.logger.info(u"Queuing page: {}".format(user_url))
            user_request = Request(user_url, callback=self.parse)
            user_request.meta["user_lib_id"] = rating["user_lib_id"]
            yield user_request

def parse_article(self, response):
    """Parses the page, finds the comment page link, goes there."""
    comment_page = response.xpath("//a[@class='k_makeComment']/@href")
    if len(comment_page) > 0:
        article_link = response.url
        comment_page = "http://www.blic.rs" + comment_page.extract()[0]
        request = Request(comment_page, callback=self.parse_comment_page)
        request.meta['article_link'] = article_link
        yield request

def parse_node(self, response, node):
    item = MynewsItem()
    item['host'] = 'www.themalaysianinsider.com'
    item['url'] = node.xpath('link/text()').extract()[0].strip()
    item['title'] = node.xpath('title/text()').extract()[0].strip()
    item['category'] = 'news'
    request = Request(item['url'], callback=self.parse_content)
    request.meta['item'] = item
    yield request

def parse_node(self, response, node):
    item = MynewsItem()
    item['host'] = 'www.thestar.com.my'
    item['url'] = node.xpath('link/text()').extract()[0].strip()
    item['title'] = node.xpath('title/text()').extract()[0].strip()
    item['date'] = node.xpath('pubDate/text()').extract()[0].strip()
    request = Request(item['url'], callback=self.parse_content)
    request.meta['item'] = item
    return request

def parse(self, response):
    list_of_cities = response.xpath(
        '//h5[text()="us cities"]/parent::li/ul//a[@href][text()!="more ..."]/@href'
    ).extract()
    for l in list_of_cities:
        r = Request(l + "search/msa", callback=self.parse_city)
        r.meta["base_url"] = l[:-1]  # city URL without the trailing slash
        yield r

def parseProduct(self, response):
    item = response.meta["data"]
    sel = Selector(response)
    # Product id
    item['product_id'] = self.rule_getpid.findall(response.url)[0]
    # Product name, with whitespace stripped
    product_name = sel.xpath(".//div[@class='tb-detail-hd']/h1/text()").extract()[0]
    item['product_name'] = self.rule_removeblank.sub('', product_name)
    # Brand id
    item['brand_id'] = response.headers['at_brid']
    # User id, which equals the shop id
    content = sel.xpath(".//head/meta[@name='microscope-data']/@content").extract()
    if content:
        word = self.rule_getuserid.findall(content[0])
        if word:
            item['shop_id'] = word[0].split('=')[-1]
        else:
            item['shop_id'] = ''
    else:
        item['shop_id'] = ''
    # Product selling point
    product_point = sel.xpath(".//div[@class='tb-detail-hd']/p/text()").extract()[0]
    item['product_point'] = self.rule_removeblank.sub('', product_point)
    # Product attribute data
    product_data_temp = sel.xpath(".//ul[@id = 'J_AttrUL']/li/text()").extract()
    product_data = []
    item['product_type'] = ''
    item['brand_name'] = ''
    for pd in product_data_temp:
        pd_temp = pd.replace(u'\xa0', '')
        if self.rule_brandname.findall(pd_temp):
            item['brand_name'] = pd_temp.split(':')[-1]
        elif self.rule_producttype.findall(pd_temp):
            item['product_type'] = pd_temp.split(':')[-1]
        product_data.append(pd_temp)
    item['product_data'] = product_data
    item['category_id'] = ''
    item['category_name'] = ''
    item['product_judgementnum'] = ''
    item['product_searchword'] = ''
    item['product_specialjudge'] = ''
    item['url'] = response.url
    # Gallery images
    group_imgs = sel.xpath("//ul[@id='J_UlThumb']/li/a/img/@src").extract()
    for img in group_imgs:
        item['image_urls'].append('https:' + img.replace('60x60', '300x300'))
    request = Request(self.url_specialjudge % (item["product_id"]),
                      callback=self.parse_specialjudge, priority=123456)
    request.meta["data"] = copy.deepcopy(item)
    yield request

def start_requests(self):
    yield Request(self.basic_url.format(offset=0), self.hn_parse)

def get_media_requests(self, item, info):
    yield Request(item['url'])

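# A hedged sketch of the counterpart to get_media_requests above: in Scrapy's
# FilesPipeline, item_completed receives (success, file_info) tuples for the
# requests yielded by get_media_requests. The 'url' field matches the snippet
# above; the 'file_path' field is a hypothetical place to store the result.
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline

class SketchFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        yield Request(item['url'])

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples.
        paths = [file_info['path'] for ok, file_info in results if ok]
        if not paths:
            raise DropItem('file download failed for %s' % item['url'])
        item['file_path'] = paths[0]
        return item
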
def start_requests(self): yield Request("http://www.tudogostoso.com.br/", callback=self.initial_url, headers=self.headers)
def categories_urls(self, response):
    all_recipes_link = response.xpath(
        "//div[contains(@class, 'submenu')]/ul/li/a/@href")[0].extract()
    yield Request(response.urljoin(all_recipes_link),
                  callback=self.all_recipes_urls,
                  headers=self.headers)

def start_requests(self):
    # Campus network login URL.
    url = "http://202.207.247.49"
    yield Request(url, self.login_parse)

def test_callback_serialization(self):
    r = Request("http://www.example.com",
                callback=self.spider.parse_item,
                errback=self.spider.handle_error)
    self._assert_serializes_ok(r, spider=self.spider)

def parse_list(self, response):
    data = response.body
    if data == '' or data == '[]':
        log.msg(format='%(request)s post fail. Response is [].',
                level=log.ERROR, request=response.url)
        return
    age = response.meta['age']
    cat = response.meta['cat']
    try:
        js = json.loads(data)
    except ValueError:
        log.msg(u'Book category [%s]-[%s]: list response is not valid JSON. url=%s'
                % (age['name'], cat['name'], response.url), level=log.INFO)
        return
    for item in js['products']:
        pc = Category()
        pc['product_id'] = item['id']
        pc['category_path'] = '01.41.%s.%s.00.00' % (age['id'], cat['id'])
        pc['path_name'] = cat['name']
        yield pc
        # Detail request
        yield Request(url=self.info_url.replace('<?pid?>', item['id']),
                      callback=self.parse_info,
                      headers=self.headers,
                      meta={'age': age, 'cat': cat})
        # Review request (currently disabled):
        # yield Request(url=self.review_url.replace('<?pid?>', item['id'])
        #                       .replace('<?page?>', '1'),
        #               callback=self.parse_review,
        #               headers=self.headers,
        #               meta={'page': 1, 'pid': item['id']})
    # Next page: a full page holds 200 products.
    if len(js['products']) >= 200:
        page = response.meta['page'] + 1
        log.msg(u'Requesting page %d of category [%s]-[%s]' % (page, age['name'], cat['name']))
        yield Request(url=self.list_url.replace('<?page?>', str(page))
                      .replace('<?age?>', age['id'])
                      .replace('<?cat?>', cat['id']),
                      callback=self.parse_list,
                      headers=self.headers,
                      meta={'page': page, 'age': age, 'cat': cat})

def parse(self, response):
    self.logger.debug('response.url: {}'.format(response.url))
    try:
        total = response.xpath(
            '//div[@class="category-items clearfix"]//div[@class="category-item m"]')
        for t in total:
            first_category_name = t.xpath('./div[@class="mt"]/h2/span/text()').extract_first()
            second = t.xpath('./div[@class="mc"]/div[@class="items"]/dl')
            for s in second:
                second_links = s.xpath('./dt/a').extract_first()
                second_item = re.findall(
                    r'<a href="(.*?)" target="_blank">(.*?)</a>', second_links)
                self.logger.debug('second_item: {}'.format(second_item))
                second_category_name = second_item[0][1]
                second_category_url = 'https:' + second_item[0][0]
                third_links = s.xpath('./dd/a').extract()
                for third_link in third_links:
                    third_items = re.findall(
                        r'<a href="(.*?)" target="_blank">(.*?)</a>', third_link)
                    self.logger.debug('third_items: {}'.format(third_items))
                    # re.findall returns tuples, so unpack instead of mutating.
                    for url_part, name in third_items:
                        if url_part.startswith('https:'):
                            url_part = url_part[len('https:'):]
                        if url_part.split('.')[0].split('//')[1] != 'list':
                            self.logger.debug('not list url: {}'.format(url_part))
                            yield Request('https:' + url_part, callback=self.parse_not_list)
                        else:
                            category_item = CategoryItem()
                            category_item['first_category_name'] = first_category_name
                            category_item['second_category_name'] = second_category_name
                            category_item['second_category_url'] = second_category_url
                            category_item['third_category_name'] = name
                            category_item['third_category_url'] = 'https:' + url_part
                            category_item['id'] = url_part.split('=')[1].split('&')[0]
                            category_item['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                            yield category_item
                            category_info = {
                                'first_category_name': first_category_name,
                                'second_category_name': second_category_name,
                                'third_category_name': name,
                                'third_category_id': url_part.split('=')[1].split('&')[0],
                            }
                            yield Request('https:' + url_part,
                                          callback=self.parse_list,
                                          meta={'category_info': category_info})
    except Exception as e:
        self.logger.debug('parse error: {}'.format(e))

def parse_product(self, response):
    """On the product page, get the title, price, and product id."""
    category_info = response.meta.get('category_info')
    ids = re.findall(r"venderId:(.*?),\s.*?shopId:'(.*?)'", response.text)
    if not ids:
        ids = re.findall(r"venderId:(.*?),\s.*?shopId:(.*?),", response.text)
    vender_id = ids[0][0]
    shop_id = ids[0][1]

    # Shop item
    shopItem = ShopItem()
    shopItem['id'] = shop_id
    shopItem['venderId'] = vender_id
    shopItem['url1'] = 'http://mall.jd.com/index-%s.html' % shop_id
    try:
        shopItem['url2'] = 'https:' + response.xpath(
            '//ul[@class="parameter2 p-parameter-list"]/li/a/@href').extract_first()
    except Exception:
        shopItem['url2'] = shopItem['url1']
    # Shop name: try several page layouts in turn.
    if shop_id == '0':
        shopItem['name'] = u'京东自营'  # sold by JD itself
    elif response.xpath('//ul[@class="parameter2 p-parameter-list"]/li/a//text()').extract_first():
        shopItem['name'] = response.xpath(
            '//ul[@class="parameter2 p-parameter-list"]/li/a//text()').extract_first()
    elif response.xpath('//span[@class="shop-name"]//text()').extract_first():
        shopItem['name'] = response.xpath(
            '//span[@class="shop-name"]//text()').extract_first().strip()
    elif response.xpath('//div[@class="name"]/a//text()').extract_first():
        shopItem['name'] = response.xpath(
            '//div[@class="name"]/a//text()').extract_first().strip()
    elif response.xpath('//div[@class="shopName"]/strong/span/a//text()').extract_first():
        shopItem['name'] = response.xpath(
            '//div[@class="shopName"]/strong/span/a//text()').extract_first().strip()
    elif response.xpath('//div[@class="seller-infor"]/a//text()').extract_first():
        shopItem['name'] = response.xpath(
            '//div[@class="seller-infor"]/a//text()').extract_first().strip()
    else:
        shopItem['name'] = u'京东自营'
    self.logger.debug('shop name: {}'.format(shopItem['name']))
    shopItem['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    yield shopItem

    # Product item
    productsItem = ProductsItem()
    productsItem['shopId'] = shop_id
    productsItem['first_category_name'] = category_info.get('first_category_name')
    productsItem['second_category_name'] = category_info.get('second_category_name')
    productsItem['third_category_name'] = category_info.get('third_category_name')
    productsItem['third_category_id'] = category_info.get('third_category_id')
    # Title: try several page layouts in turn.
    if response.xpath('//div[@class="sku-name"]/text()').extract():
        title = ''.join(i.strip() for i in response.xpath(
            '//div[@class="sku-name"]/text()').extract())
    elif response.xpath('//div[@id="name"]/h1/text()').extract_first():
        title = response.xpath('//div[@id="name"]/h1/text()').extract_first()
    else:
        title = response.xpath(
            '//ul[@class="parameter2 p-parameter-list"]/li[1]/@title').extract_first()
    productsItem['name'] = title.strip()
    product_id = response.url.split('/')[-1][:-5]
    productsItem['id'] = product_id
    productsItem['url'] = response.url
    # Description
    desc = response.xpath('//ul[@class="parameter2 p-parameter-list"]//text()').extract()
    productsItem['description'] = '/'.join(i.strip() for i in desc)
    # Price, fetched from the price API.
    total_price_url = self.price_url.format(product_id=product_id) \
        + '&pduid=' + str(random.randint(100000, 999999))
    self.logger.debug('total_price_url: {}'.format(total_price_url))
    price_response = requests.get(total_price_url)
    price_json = price_response.json()
    self.logger.debug('price_json: {}'.format(price_json))
    productsItem['reallyPrice'] = price_json[0]['p']
    productsItem['originalPrice'] = price_json[0]['m']
    # Promotions and coupons
    res_url = self.favourable_url.format(
        skuId=product_id, shopId=shop_id, venderId=vender_id,
        cat=category_info.get('third_category_id').replace(',', '%2c'))
    fav_response = requests.get(res_url)
    fav_data = fav_response.json()
    self.logger.debug('fav_data: {}'.format(fav_data))
    if fav_data['skuCoupon']:
        desc1 = []
        for item in fav_data['skuCoupon']:
            start_time = item['beginTime']
            end_time = item['endTime']
            fav_price = item['quota']
            fav_count = item['discount']
            # "valid from %s to %s, %s off orders over %s"
            desc1.append(u'有效期%s至%s,满%s减%s' % (start_time, end_time, fav_price, fav_count))
        productsItem['favourableDesc1'] = ';'.join(desc1)
    if fav_data['prom'] and fav_data['prom']['pickOneTag']:
        desc2 = [item['content'] for item in fav_data['prom']['pickOneTag']]
        productsItem['favourableDesc2'] = ';'.join(desc2)
    productsItem['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    yield productsItem

    # Comments
    data = {'product_id': product_id, 'page': 0}
    yield Request(url=self.comment_url.format(productId=product_id, page=0),
                  callback=self.parse_comments, meta=data)

def start_requests(self):
    for base_url in self.base_urls:
        yield Request(url=base_url, callback=self.parse)

def parseStockInfo(self, response):
    stock_list = []
    # Main content block
    main_content = response.css(".main > div.content")
    # Index code of the market the stocks belong to
    index_code = response.meta['index_code']
    stockInfoList = main_content.css(".block02 > div.tab01").xpath(
        "//table/tr[position() > 1 and position() < 52]")
    for stockInfo in stockInfoList:
        if len(stockInfo.xpath("td").extract()) < 3:
            continue
        # Stock code
        stock_code = stockInfo.xpath("td[1]/a/text()").extract()
        # Stock name
        stock_name = stockInfo.xpath("td[2]/a/text()").extract()
        # Latest price
        last_trade = stockInfo.xpath("td[3]/span/text()").extract()
        # Creation time
        create_time = CommonUtil().getCreateTime()
        print(stock_code, stock_name, index_code, last_trade, create_time)
        # Basic stock info item (collected in a list, since items are dict-like
        # and not hashable).
        stock_info_item = StockInfoItem()
        stock_info_item['stock_code'] = stock_code
        stock_info_item['stock_name'] = stock_name
        stock_info_item['index_code'] = index_code
        stock_info_item['last_trade'] = last_trade
        stock_info_item['create_time'] = create_time
        stock_list.append(stock_info_item)
        yield stock_info_item

    # For each stock, request its shareholder information.
    for stock_info in stock_list:
        stock_code = stock_info['stock_code'][0]
        stock_name = stock_info['stock_name'][0]
        holder_url = self.holder_url % stock_code
        print("shareholder info holder_url: " + holder_url)
        yield Request(url=holder_url,
                      meta={'stock_code': stock_code, 'stock_name': stock_name},
                      callback=self.parsestockShareHolder)

    # For each stock, request its dividend records.
    for stock_info in stock_list:
        stock_code = stock_info['stock_code'][0]
        stock_name = stock_info['stock_name'][0]
        dividend_url = self.dividend_url % stock_code
        print("dividend records dividend_url: " + dividend_url)
        yield Request(url=dividend_url,
                      meta={'stock_code': stock_code, 'stock_name': stock_name},
                      callback=self.parseStockDividendRecord)

    # For each stock, request its sector (tag) information.
    for stock_info in stock_list:
        stock_code = stock_info['stock_code'][0]
        stock_name = stock_info['stock_name'][0]
        stock_type_url = self.stock_type_url % stock_code
        print("sector info stock_type_url: " + stock_type_url)
        yield Request(url=stock_type_url,
                      meta={'stock_code': stock_code, 'stock_name': stock_name},
                      callback=self.parseStockTypeData)

    # For each stock, request its historical quotes:
    # from 1990 through 2017, four quarters per year.
    for stock_info in stock_list:
        stock_code = stock_info['stock_code'][0]
        stock_name = stock_info['stock_name'][0]
        for year in range(1990, 2018):
            for quarter in range(1, 5):
                data_url = self.detail_url % (stock_code, year, quarter)
                print("historical quotes data_url: " + data_url)
                yield Request(url=data_url,
                              meta={'stock_code': stock_code,
                                    'stock_name': stock_name,
                                    'year': year},
                              callback=self.parseHistoryStockData)

def start_requests(self):
    yield Request(url=self.start_urls[0], callback=self.parse, headers=self.headers)

def start_requests(self):
    keys = [u'大数据', 'hadoop', 'spark']  # search keywords: "big data", hadoop, spark
    for key in keys:
        url = 'http://www.neitui.me/?name=job&handle=lists&city=城市&keyword=' + key
        yield Request(url=url)

def start_requests(self):
    yield Request('http://ssdut.dlut.edu.cn/index/bkstz.htm', self.parse)

def start_requests(self):
    url = 'https://movie.douban.com/top250'
    yield Request(url, headers=self.headers)

def test_utf8_body(self):
    r = Request("http://www.example.com", body=b"\xc2\xa3")  # UTF-8 encoded pound sign
    self._assert_serializes_ok(r)

def start_requests(self): yield Request(url=u"http://airquality.deq.louisiana.gov", callback=self.get_global_date)
def test_unserializable_callback2(self):
    r = Request("http://www.example.com", callback=self.spider.parse_item)
    self.assertRaises(ValueError, r.to_dict, spider=None)

def start_requests(self): url = "https://hoanghamobile.com/dien-thoai-di-dong-c14.html?sort=0&p=" page = ["1","2","3","4","5","6","7","8","9","10"] for x in page: yield Request(url+x, self.parse)
def start_requests(self):
    yield Request(ELECT_URL + 'speltyCommonCourse.aspx',
                  dont_filter=True,
                  callback=self.tongshi_1)

def test_basic(self):
    r = Request("http://www.example.com")
    self._assert_serializes_ok(r)

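# The serialization tests in this file rely on an _assert_serializes_ok helper
# that is not shown. A plausible sketch, assuming the round-trip goes through
# Request.to_dict (used in the other tests) and
# scrapy.utils.request.request_from_dict; the actual helper may check more.
from scrapy.utils.request import request_from_dict

def _assert_serializes_ok(self, request, spider=None):
    d = request.to_dict(spider=spider)
    roundtripped = request_from_dict(d, spider=spider)
    assert roundtripped.url == request.url
    assert roundtripped.method == request.method
    assert roundtripped.body == request.body
    assert roundtripped.callback == request.callback
    assert roundtripped.errback == request.errback
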
def start_requests(self):
    for book in book_list:
        yield Request(url=BASE_URL.format(book),
                      callback=self.parse_book_info,
                      cb_kwargs=dict(short_name=book))

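# Sketch of the callback paired with the cb_kwargs usage above: Scrapy passes
# each cb_kwargs entry to the callback as a keyword argument, so the callback
# just declares a matching parameter. The yielded fields are hypothetical.
def parse_book_info(self, response, short_name):
    yield {
        'short_name': short_name,  # injected via cb_kwargs
        'title': response.css('h1::text').get(),
        'url': response.url,
    }
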
def test_unserializable_callback1(self):
    r = Request("http://www.example.com", callback=lambda x: x)
    self.assertRaises(ValueError, r.to_dict, spider=self.spider)

def initial_url(self, response):
    for href in response.xpath("//div[contains(@class, 'menu')]/nav/ul/li/a/@href"):
        href_text = href.extract()
        if href_text not in ["/videos.php", "/especiais/15-chefs-tudo-gostoso", "/categorias/sopas.php"]:
            yield Request(response.urljoin(href_text),
                          callback=self.categories_urls,
                          headers=self.headers)

def test_mixin_private_callback_serialization(self):
    r = Request("http://www.example.com",
                callback=self.spider._TestSpiderMixin__mixin_callback,
                errback=self.spider.handle_error)
    self._assert_serializes_ok(r, spider=self.spider)

def parse(self, response): url = "https://hoanghamobile.com" url_phones = response.css('a.mosaic-overlay::attr(href)').getall() for url_phone in url_phones: yield Request(url+url_phone,self.save_info)
def start_requests(self):
    for uid in self.start_users:
        yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

def parse(self, response: Response):
    for url in response.css('.sub-catalog a::attr("href")').extract():
        yield Request('https://www.dushu.com' + url, callback=self.parse_item)

def start_requests(self):
    # First request
    yield Request(
        url=u'FILL_WITH_URL',
        callback=self.parse,
    )

def start_requests(self):
    yield Request(url=self.start_url, callback=self.parse, dont_filter=True)