def parse(self, response):
    i = 0
    for item in response.xpath("/html/body/div[2]/div[2]/ul/li/a"):
        i += 1
        movClass = item.xpath("text()").extract()
        movUrl = item.xpath("@href").extract_first()
        oneItem = ExampleItem()
        oneItem['movClass'] = movClass
        oneItem['movUrl'] = movUrl
        for j in range(1, 2):
            if j == 1:
                mvUrl2 = movUrl + 'index.html'
            else:
                mvUrl2 = movUrl + 'index_%s.html' % j
            try:
                # yield oneItem
                # To carry movClass into the callback, bind it as a default:
                # yield scrapy.Request(url=mvUrl2, callback=lambda response, mvclass=movClass: self.parse_url(response, mvclass))
                # Pass the method itself; self.parse_url() would call it immediately.
                yield scrapy.Request(url=mvUrl2, callback=self.parse_url)
            except RuntimeError as error:
                # Catch the specific error first; a bare Exception clause
                # before it would make this branch unreachable.
                print("******************")
                print(error)
            except Exception as error:
                print("-------------")
                print(error)
        if i > 2:
            break

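# A minimal sketch (not from the original spider) of the idiomatic way to hand
# per-request data such as movClass to a callback: Request.cb_kwargs, available
# in Scrapy 1.7+, replaces the lambda-with-default-argument workaround shown in
# the commented-out line above. Names here mirror the function above.
def parse(self, response):
    for link in response.xpath("/html/body/div[2]/div[2]/ul/li/a"):
        movClass = link.xpath("text()").extract()
        movUrl = link.xpath("@href").extract_first()
        yield scrapy.Request(
            url=movUrl + 'index.html',
            callback=self.parse_url,
            cb_kwargs={'mvclass': movClass},  # arrives as a keyword argument
        )

def parse_url(self, response, mvclass):
    # mvclass is delivered alongside the response; no closure tricks needed
    pass
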
def parse_item(self, response):
    item = ExampleItem()
    item['name'] = response.css(
        'tr#places_country__row td.w2p_fw::text').extract()
    item['population'] = response.css(
        'tr#places_population__row td.w2p_fw::text').extract()
    return item

def parse_directory(self, response):
    item = ExampleItem()
    url = response.url
    jid = self.md5(url)
    title = response.xpath(r'//h1/text()').extract()[0]
    location = response.xpath(
        r'//div/span[@class="lname"]/text()').extract()[0]
    exp = response.xpath(
        r'//div[@class="t1"]/span[1]/text()').extract()[0].strip('年经验')
    if '无工作' in exp:  # "no experience required"
        exp = 0
    else:
        # Take the lower bound of a range such as "3-5"
        exp = exp.split('-')[0]
    degree = response.xpath(
        r'//div[@class="t1"]/span[2]/text()').extract()[0]
    crawled = datetime.datetime.now().strftime('%Y-%m-%d')
    money = response.xpath(r'//div[@class="cn"]/strong/text()').extract()
    maxmoney = minmoney = 0
    money = money[0] if money else ''
    if '/月' in money:  # monthly salary
        money = money.strip('/月').split('-')
        if 'k' in money[1]:
            maxmoney = float(money[1].strip('k')) * 1000
            minmoney = float(money[0]) * 1000
        elif '千' in money[1]:
            maxmoney = float(money[1].strip('千')) * 1000
            minmoney = float(money[0]) * 1000
        elif '万' in money[1]:
            maxmoney = float(money[1].strip('万')) * 10000
            minmoney = float(money[0]) * 10000
    elif '万/年' in money:  # yearly salary in units of 10k
        money = money.strip('/年').split('-')
        if '万' in money[1]:
            maxmoney = float(money[1].strip('万')) * 1000
            minmoney = float(money[0]) * 1000
    elif '以上' in money:  # "X or more"
        minmoney = maxmoney = money.strip('元/月以上')
    elif '面议' in money:  # "negotiable"
        minmoney = maxmoney = 0
    elif '以下' in money:  # "X or less"
        minmoney = maxmoney = int(money.strip('元/月以下'))
    item['title'] = title
    item['maxmoney'] = int(maxmoney)
    item['minmoney'] = int(minmoney)
    item['crawled'] = crawled
    item['location'] = location
    item['exp'] = exp
    item['degree'] = degree
    item['url'] = url
    item['jid'] = jid
    print(title, location, exp, degree, url)
    yield item

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    titles = hxs.select("//p")
    # Use a distinct loop variable; "for titles in titles" shadowed the list.
    for title in titles:
        item = ExampleItem()
        item['year'] = title.select("span/text()").extract()[0]
        item['title'] = title.select("a/b/text()").extract()[0]
        yield item

def parse(self, response): print "s%d" % self.count, self.count += 1 news_item = ExampleItem() items = response.xpath("//div[@id='list']/a") for item in items: news_item["news_link"] = item.xpath("@href").extract()[0] news_item["news_title"] = item.xpath("span/text()").extract()[0] news_item["news_time"] = item.xpath("font/text()").extract()[0] yield news_item
def parse_item(self, response):
    item = ExampleItem()
    name_css = 'tr#places_country__row td.w2p_fw::text'
    item['name'] = response.css(name_css).extract()
    pop_css = 'tr#places_population__row td.w2p_fw::text'
    item['population'] = response.css(pop_css).extract()
    return item

def parse_book(self, response):
    book = ExampleItem()
    table = response.xpath('//article/table')
    book['name'] = response.xpath('//article//h1/text()').extract_first()
    # Use relative paths (.//) so the queries stay anchored to the table;
    # a bare // would restart the search from the document root.
    book['price'] = table.xpath('.//tr[3]/td/text()').extract_first()
    book['availability'] = table.xpath('.//tr[6]/td/text()').re_first(
        r'(\d+)')
    book['review_num'] = table.xpath('.//tr[7]/td/text()').extract_first()
    yield book

def parse(self, response):
    for box in response.xpath(
            '//div[dl[@class="f-list-item-wrap f-clear"]]'):
        item = ExampleItem()
        # Listing link text in each div (extract() so the item holds strings,
        # not bare selectors)
        item['url'] = box.xpath('./a[@title]/text()').extract()
        # Listing title in the div
        # item['title'] = box.xpath('.//a[@class="js-title value title-font"]/text()').extract()[0].strip()
        # Price in the div
        # item['price'] = box.xpath('//div[1]/span[1]/text()')
        yield item

def parseJobDetail(self, response):
    print(response.status)
    item = ExampleItem()
    # Target fields: job title, work location, job category, duties, requirements
    item['jobName'] = response.xpath(
        '//td[@id="sharetitle"]/text()').extract()[0]
    item['workLocation'] = response.xpath(
        '//tr[@class="c bottomline"]/td[1]/text()').extract()[0]
    item['jobType'] = response.xpath(
        '//tr[@class="c bottomline"]/td[2]/text()').extract()[0]
    item['jobDesc'] = response.xpath(
        '//table[@class="tablelist textl"]/tr[3]//li/text()').extract()
    item['jobInfo'] = response.xpath(
        '//table[@class="tablelist textl"]/tr[4]//li/text()').extract()
    yield item

def parse(self, response):
    print('c%d' % self.count, end=' ')
    self.count += 1
    items = response.xpath("//ul[@class='news-list']/li")
    for item in items:
        # Build a fresh item per entry instead of mutating one shared instance.
        news_item = ExampleItem()
        news_item["news_link"] = item.xpath("a/@href").extract()[0]
        news_item["news_title"] = item.xpath("a/text()").extract()[0]
        news_item["news_time"] = item.xpath(
            "span[@class='time']/text()").extract()[0]
        yield news_item

def parse_directory(self, response):
    print(response.status)
    posts = response.xpath('//div[@class="post floated-thumb"]')
    for node in posts:
        item = ExampleItem()
        # Relative paths (.//) scope the query to the current node; an
        # absolute //div would match the first post on every iteration.
        item['name'] = node.xpath(
            './/div[@class="post-thumb"]/a/@title').extract_first()
        item['url'] = node.xpath(
            './/div[@class="post-thumb"]/a/@href').extract_first()
        yield item

def parse_blog(self, response):
    hxs = HtmlXPathSelector(response)
    item = ExampleItem()
    item['title'] = hxs.select(
        "//div[@class='span9']/h1/a/text()").extract()
    item['description'] = hxs.select(
        "//div[@class='span9']/p[@class='lead']/text()").extract()
    item['date'] = hxs.select(
        "//div[@class='span9']/h1/small/text()").extract()
    item['post'] = hxs.select("//div[@class='span9']/p/text()").extract()
    return item

def parse_directory(self, response):
    item = ExampleItem()
    url = response.url
    jid = self.md5(url)
    title = response.xpath('//h1/text()').extract()[0]
    money = response.xpath(
        '//ul[@class="clearfix pos-relat"]/li[2]/em/text()').extract()[0]
    minmoney = maxmoney = 0
    if '-' in money:
        money = money.split('-')
        minmoney = int(money[0])
        maxmoney = int(money[1].strip('元'))
    elif '面议' in money:  # "negotiable"
        pass
    elif '元以上' in money:  # "X yuan or more"
        minmoney = maxmoney = int(money.strip('元以上'))
    elif '元以下' in money:  # "X yuan or less"
        minmoney = maxmoney = int(money.strip('元以下'))
    degree = response.xpath(
        '//ul[@class="clearfix pos-relat"]/li[3]/em/text()').extract()[0]
    # Location sits in li[8] or li[7] depending on the page layout.
    location = response.xpath(
        '//ul[@class="clearfix pos-relat"]/li[8]/em/text()'
        '|//ul[@class="clearfix pos-relat"]/li[7]/em/text()'
    ).extract()[0].strip()
    crawled = response.xpath(
        '//p[@class="data-sty mb-5"]/span[1]/text()'
    ).extract()[0].strip('更新时间:')  # strip the "updated on:" label
    crawled = self.Strfdate(crawled)
    exp = response.xpath(
        '//ul[@class="clearfix pos-relat"]/li[4]/em/text()').extract()[0]
    p = re.compile(r'(\d+)')
    exp = p.search(exp)
    if exp:
        exp = exp.group(1)
    else:
        exp = 0
    item["title"] = title
    item["maxmoney"] = maxmoney
    item["minmoney"] = minmoney
    item["location"] = location
    item["crawled"] = crawled
    item["exp"] = exp
    item["degree"] = degree
    item["url"] = url
    item["jid"] = jid
    yield item

def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath('/html/body/div[4]/div/div[2]/div[1]/ul/li')
    urls = []
    for site in sites:
        # Build a fresh item per link instead of mutating a shared one.
        item = ExampleItem()
        item['title'] = '一级链接'  # "first-level link"
        item['url'] = site.xpath('a/@href').extract()
        urls.append(item['url'])
        yield item
    # Follow the third link found on the page.
    yield scrapy.Request(urls[2][0], callback=self.next)

def parse_directory(self, response):
    item = ExampleItem()
    url = response.url
    jid = self.md5(url)
    location = response.xpath(
        '//div[@class="pos-area"]/span/span[1]/text()').extract()[0]
    exp = response.xpath(
        '//div[@class="pos_base_condition"]/span[3]/text()').extract()[0]
    p = re.compile(r'(\d+)')
    exp = p.search(exp)
    if exp:
        exp = exp.group(1)
    else:
        exp = 0
    date_pub = response.xpath(
        '//span[@class="pos_base_num pos_base_update"]/span/text()'
    ).extract()[0]
    crawled = self.Strfdate(date_pub)
    degree = response.xpath(
        '//span[@class="item_condition"]/text()').extract()[0]
    title = response.xpath(
        '//div[@class="pos_base_info"]/span[1]/text()').extract()[0]
    money = response.xpath('//div[@class="pos_base_info"]/span[2]/text()'
                           ).extract()[0].strip('元/月\xa0')
    minmoney = maxmoney = 0
    if '-' in money:
        money = money.split('-')
        minmoney = int(money[0])
        maxmoney = int(money[1])
    elif '面议' in money:  # "negotiable"
        pass
    elif '以上' in money:  # "X or more"
        minmoney = maxmoney = int(money.strip('元/月以上'))
    elif '以下' in money:  # "X or less"
        minmoney = maxmoney = int(money.strip('元/月以下'))
    item['url'] = url
    item['jid'] = jid
    item['title'] = title
    item['location'] = location
    item['exp'] = exp
    item['degree'] = degree
    item['maxmoney'] = maxmoney
    item['minmoney'] = minmoney
    item['crawled'] = crawled
    yield item

def parse(self, response):
    item = ExampleItem()
    # Decide whether the response is a non-HTML file
    if self.isNoHtmlFile(response) is True:
        # Not HTML: save it to the non-HTML cache directory
        self.saveItToFile(self.debug_nohtml_cache_dir, response)
    else:
        # HTML: if the parse rules find the keywords, save it to the debug directory
        if self.htmlParseRulesSelector(response) is True:
            self.saveItToFile(self.debug_dir, response)
    item['name'] = response.css('title::text').get()
    self.get_urls_store_redis(response)
    item['url'] = response.url
    yield item

def parse(self, response):
    item = ExampleItem()
    item['name'] = response.xpath(
        '//article[@class="product_pod"]/h3/a/text()').extract()
    print(item['name'])
    item['price'] = response.xpath(
        '//div[@class="product_price"]/p[@class="price_color"]/text()'
    ).extract()
    print(item['price'])
    # Return the item once the fields are extracted
    yield item
    for i in range(1, 50):
        url = 'http://books.toscrape.com/catalogue/page-%d.html' % i
        yield Request(url, callback=self.parse)

def parse_directory(self, response):
    item = ExampleItem()
    url = response.url
    jid = self.md5(url)
    location = response.xpath('//dd[@class="job_request"]/p/span[2]/text()'
                              ).extract()[0].strip('/').strip(' /')
    exp = response.xpath('//dd[@class="job_request"]/p/span[3]/text()'
                         ).extract()[0].strip(' /').strip('经验')
    if '年' in exp:
        # Take the lower bound of a range such as "3-5年"
        exp = exp.strip('年').split('-')[0]
    else:
        exp = 0
    degree = response.xpath('//dd[@class="job_request"]/p/span[4]/text()'
                            ).extract()[0].strip(' /')
    crawled = datetime.datetime.now().strftime('%Y-%m-%d')
    title = response.xpath('//span[@class="name"]/text()').extract()[0]
    money = response.xpath('//dd[@class="job_request"]/p/span[1]/text()'
                           ).extract()[0].split('-')
    maxmoney = minmoney = 0
    if 'k' in money[0]:
        minmoney = int(money[0].strip('k')) * 1000
        maxmoney = int(money[1].strip('k ')) * 1000
    # Load the extracted fields into the item
    item['url'] = url
    item['jid'] = jid
    item['title'] = title
    item['location'] = location
    item['exp'] = exp
    item['degree'] = degree
    item['maxmoney'] = maxmoney
    item['minmoney'] = minmoney
    item['crawled'] = crawled
    yield item

def parse_directory(self, response):
    node_all = response.xpath('//ul[@class="boxbdnopd"]/li')
    for node in node_all:
        item = ExampleItem()
        name = node.xpath('.//h4/a/text()').extract()[0]
        link = node.xpath('.//h4/a/@href').extract()[0]
        description = node.xpath(
            './/p[@class="description"]/text()').extract()[0]
        item["name"] = name
        item["link"] = link
        item["description"] = description
        print("link==", link)
        yield item

def parse(self, response):
    posts = response.xpath('//div[@class="post floated-thumb"]')
    for node in posts:
        item = ExampleItem()
        item['name'] = node.xpath(
            './/div[@class="post-thumb"]/a/@title').extract_first()
        item['detailurl'] = node.xpath(
            './/div[@class="post-thumb"]/a/@href').extract_first()
        yield item
    # Collect the URLs of the other listing pages linked from this page
    next_urls = response.xpath(
        '//a[@class="page-numbers"]/@href').extract()
    print(next_urls)
    for url in next_urls:
        yield Request(url=url, callback=self.parse)

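# A minimal sketch (assumes Scrapy 1.4+): response.follow() accepts relative
# hrefs, or the @href selectors themselves, and builds the Request in one step,
# so the pagination tail of the function above can shrink to a single loop.
def parse(self, response):
    for node in response.xpath('//div[@class="post floated-thumb"]'):
        item = ExampleItem()
        item['name'] = node.xpath(
            './/div[@class="post-thumb"]/a/@title').extract_first()
        item['detailurl'] = node.xpath(
            './/div[@class="post-thumb"]/a/@href').extract_first()
        yield item
    for href in response.xpath('//a[@class="page-numbers"]/@href'):
        yield response.follow(href, callback=self.parse)
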
def parse(self, response):
    # Dump the raw response body for offline inspection
    with open('data.json', 'wb') as f:
        f.write(response.body)
    for box in response.xpath(
            '//div[@class="f-list-item ershoufang-list"]'):
        item = ExampleItem()
        # Listing title in the div
        item['title'] = box.xpath(
            './/dd[@class="dd-item title"]/a[@class="js-title value title-font"]/text()'
        ).extract()[0]
        # Price in the div
        item['price'] = box.xpath(
            './/dd[@class="dd-item info"]/div[@class="price"]/span[@class="num"]/text()'
        ).extract()[0]
        # Floor area
        item['area'] = box.xpath(
            './/dl[@class="f-list-item-wrap f-clear"]/dd[@class="dd-item size"]/span[5]/text()'
        ).extract()[0]
        # Rental type
        item['rent'] = box.xpath(
            './/dl[@class="f-list-item-wrap f-clear"]/dd[@class="dd-item size"]/span[@class="first js-huxing"]/text()'
        ).extract()[0]
        yield item
    # Check for None before joining, then let urljoin build the absolute URL;
    # concatenating a host prefix onto a possibly-None href raised TypeError.
    next_page = response.css('div.pageBox')[1].xpath(
        './/a[@class="next"]/@href').extract_first()
    if next_page is not None:
        next_page = response.urljoin(next_page)
        yield scrapy.Request(next_page, callback=self.parse,
                             headers={'referer': next_page})

def parse_directory(self, response):
    item = ExampleItem()
    url = response.url.strip()
    jid = self.md5(url)
    location = response.xpath(r'//ul/li[2]/strong/a/text()').extract()[0]
    exp = response.xpath(r'//ul/li[5]/strong/text()').extract()[0].strip('年')
    if '不限' in exp:  # "no requirement"
        exp = 0
    else:
        # Take the lower bound of a range such as "3-5"
        exp = exp.split('-')[0]
    degree = response.xpath(r'//ul/li[6]/strong/text()').extract()[0]
    crawled = datetime.datetime.now().strftime('%Y-%m-%d')
    title = response.xpath(
        r'//div[@class="inner-left fl"]/h1/text()').extract()[0].strip()
    money = response.xpath(r'//ul/li[1]/strong/text()').extract()[0].strip(
        '元/月\xa0')
    if '以上' in money:  # "X or more"
        minmoney = maxmoney = int(money.strip('元/月以上'))
    elif '面议' in money:  # "negotiable"
        minmoney = maxmoney = 0
    elif '以下' in money:  # "X or less"
        minmoney = maxmoney = int(money.strip('元/月以下'))
    else:
        money = money.split('-')
        maxmoney = int(money[1].strip('元/月'))
        minmoney = int(money[0])
    item['jid'] = jid
    item['title'] = title
    item['maxmoney'] = maxmoney
    item['minmoney'] = minmoney
    item['crawled'] = crawled
    item['location'] = location
    item['exp'] = exp
    item['degree'] = degree
    item['url'] = url
    yield item

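# A minimal sketch of a hypothetical helper (parse_salary is not in the
# original spiders) that centralizes the salary-string handling repeated in
# the parse_directory functions above: ranges such as "8千-1.2万" or "15k-25k"
# become integer (min, max) pairs in yuan per month.
import re

UNITS = {'k': 1000, 'K': 1000, '千': 1000, '万': 10000}

def parse_salary(text):
    if '面议' in text:  # "negotiable": no figures to extract
        return 0, 0
    nums = re.findall(r'(\d+(?:\.\d+)?)([kK千万]?)', text)
    values = [int(float(n) * UNITS.get(u, 1)) for n, u in nums]
    if not values:
        return 0, 0
    return min(values), max(values)

# parse_salary('8千-1.2万')      -> (8000, 12000)
# parse_salary('15k-25k')        -> (15000, 25000)
# parse_salary('4500元/月以上')  -> (4500, 4500)
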
def parse_page(self, response):
    item = ExampleItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '/html/head/title/text()').extract_first()
    m = re.match(r'(\d+)年(\d+)月份(\S+)\S车\(分制造商\)销量', item['title'])
    if m is not None:
        year = m.group(1)
        month = m.group(2)
        country = m.group(3)
        conn = pymysql.Connect(host='127.0.0.1', port=3306, user='******',
                               passwd='123456', db='car', charset='utf8')
        trs2 = response.xpath(
            '//div[@class="newstext"]/table/tbody/tr/td[2]/font')
        trs = response.xpath(
            '//div[@class="newstext"]/table/tbody/tr/td[1]/font')
        for i, tr in enumerate(trs):
            logo = tr.xpath('string(.)').extract()[0]
            num = trs2[i].xpath('string(.)').extract()[0]
            # Parameterize the insert; formatting values into the SQL string
            # invites injection and breaks on values containing quotes.
            sql = "insert into sale values(%s, %s, %s, %s, %s)"
            cursor = conn.cursor()
            try:
                cursor.execute(sql, (year, month, country, logo, num))
            except Exception:
                yield {'insert error': item['url']}
            cursor.close()
            yield {'logo': logo, 'num': num}
            yield {'year': year, 'month': month, 'country': country}
        conn.commit()
        conn.close()
    # All fields are yielded above; a value returned from a generator is discarded.
    yield {'url': item['url'], 'name': item['title']}

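# A minimal sketch (hypothetical ExamplePipeline, reusing the credentials from
# parse_page above) of the more idiomatic Scrapy shape for that database work:
# the spider only yields items, and an item pipeline owns one long-lived
# connection. It assumes a single item carries year/month/country/logo/num.
import pymysql

class ExamplePipeline(object):
    def open_spider(self, spider):
        # One connection for the whole crawl instead of one per page
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='******',
                                    passwd='123456', db='car', charset='utf8')

    def close_spider(self, spider):
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        with self.conn.cursor() as cursor:
            cursor.execute(
                "insert into sale values(%s, %s, %s, %s, %s)",
                (item['year'], item['month'], item['country'],
                 item['logo'], item['num']),
            )
        return item
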
def parse_mor(self, response, mvsclass, img, name, mvUrl):
    for select in response.xpath('//div[@class="contentinfo"]'):
        # Download links; there may be several
        mvdownloadUrl = select.xpath(
            "div/table/tbody/.//tr/td/a/@href").extract()
        # Text of the download anchors
        mvdtilte = select.xpath(
            "div/table/tbody/.//tr/td/a/text()").extract()
        mvdesc = select.xpath("div[@id='text']/.//p/text()")
        desc = ''
        for p in mvdesc:
            desc = desc + p.extract().strip()
        mvdownloadUrl = ";".join(mvdownloadUrl)
        item = ExampleItem()
        item['movClass'] = mvsclass
        item['downLoadName'] = name
        # An empty list means no anchor text was found; str([]) is never ''.
        if not mvdtilte:
            mvdtilte = "点击下载"  # fallback label: "click to download"
        item['downdtitle'] = str(mvdtilte)
        item['downimgurl'] = img
        item['downLoadUrl'] = mvdownloadUrl
        item['mvdesc'] = desc
        item['mvUrl'] = mvUrl
        yield item

def parse(self, response):
    try:
        data_dict = json.loads(response.body)
    except Exception as e:
        print(response.body)
        print(response.status)
        print(e)
        return
    self.is_ok = True
    fares = data_dict.get('fares')
    journeys = jsonpath(data_dict, '$..journeys')[0]
    for journey in journeys:
        segments = journey.get('segments')
        if len(segments) > 1:
            continue
        journey_key = journey.get('journeySellKey')
        important = filter(lambda x: x, re.split(r'~[~|\s]*', journey_key))
        carrier, fn_no, dep_port, dep_date, arr_port, arr_date = important
        flight_number = carrier + fn_no
        dep_time = time.mktime(time.strptime(dep_date, '%m/%d/%Y %H:%M'))
        arr_time = time.mktime(time.strptime(arr_date, '%m/%d/%Y %H:%M'))
        available_fares = jsonpath(segments, '$..availableFares')[0]
        dep_city = self.port_city.get(dep_port, dep_port)
        arr_city = self.port_city.get(arr_port, arr_port)
        # Find the lowest fare that still has seats, plus the bundle prices
        fare_index_low = None
        dif_pack = []
        seats = 0
        for i, available_fare in enumerate(available_fares):
            fare_index_temp = available_fare.get('fareIndex')
            if fare_index_low is None:
                fare_index_low = fare_index_temp
                seats_str = available_fare.get('availableCount')
                seats = 9 if seats_str == 32767 else seats_str
            if i:
                fare_temp = fares[fare_index_temp]
                price_temp = jsonpath(fare_temp, '$..amount')[0]
                seats_temp_str = available_fare.get('availableCount')
                seats_temp = 9 if seats_temp_str == 32767 else seats_temp_str
                dif_pack.append([price_temp, seats_temp])
        fare = fares[fare_index_low]
        cabin = fare.get('classOfService')
        price = jsonpath(fare, '$..amount')[0]
        net_fare = price
        adult_tax = 0
        currency = jsonpath(fare, '$..currencyCode')[0]
        item = ExampleItem()
        item.update(
            dict(
                flight_number=flight_number,
                dep_time=dep_time,
                arr_time=arr_time,
                dep_port=dep_port,
                arr_port=arr_port,
                currency=currency,
                adult_price=price,
                adult_tax=adult_tax,
                net_fare=net_fare,
                max_seats=seats,
                cabin=cabin,
                carrier=carrier,
                is_change=1,
                segments=json.dumps(dif_pack),
                get_time=time.time(),
                from_city=dep_city,
                to_city=arr_city,
                info='',
            ))
        yield item