def parse_detail(self, response):
    """Scrape one article page into an AppleItem (title, content, time)."""
    soup = BeautifulSoup(response.body)
    item = AppleItem()
    # NOTE(review): '#h1' matches an element with id="h1", not the <h1> tag —
    # confirm against the page markup.
    item["title"] = soup.select("#h1")[0].text
    item["content"] = soup.select(".trans")[0].text
    item["time"] = soup.select(".gggs time")[0].text
    return item
def parse_detail(self, response):
    """Return an AppleItem holding the article headline, first body
    paragraph and creation timestamp."""
    soup = BeautifulSoup(response.body, 'lxml')
    article = AppleItem()
    article['title'] = soup.select('h1')[0].text
    article['content'] = soup.select('.ndArticle_margin p')[0].text
    article['time'] = soup.select('.ndArticle_creat')[0].text
    return article
def parse_detail(self, response):
    """Build and return an AppleItem for one article detail page."""
    parsed = BeautifulSoup(response.body)
    result = AppleItem()
    # NOTE(review): '#h1' is an id selector — verify the page uses id="h1".
    result['title'] = parsed.select('#h1')[0].text
    result['content'] = parsed.select('.trans')[0].text
    result['time'] = parsed.select('.gggs time')[0].text
    return result
def parse_detail(self, response):
    """Yield an AppleItem with the article title and the first text node of
    the article body."""
    item = AppleItem()
    # Field name -> XPath expression; each takes the first matching text node.
    selectors = {
        'title': '//article//h1/text()',
        'content': '//*[@id="article"]//div//p//text()',
    }
    for field, xp in selectors.items():
        item[field] = response.xpath(xp).extract_first()
    yield item
def parse_detail(self, response):
    """Scrape title, body text and timestamp from a detail page into an
    AppleItem."""
    soup = BeautifulSoup(response.text, 'lxml')
    piece = AppleItem()
    piece['title'] = soup.select('h1')[0].text
    piece['content'] = soup.select('.case')[0].text
    piece['time'] = soup.select('.stime')[0].text
    return piece
def parse_detail(self, response):
    """Fill every AppleItem field (title, content, time) from one page."""
    soup = BeautifulSoup(response.body)
    record = AppleItem()
    # id="h1" element, id="summary" element, and the <time> under .gggs.
    record['title'] = soup.select('#h1')[0].text
    record['content'] = soup.select('#summary')[0].text
    record['time'] = soup.select('.gggs time')[0].text
    return record
def parse(self, response):
    """Yield one AppleItem per <img> element, numbered sequentially from 0.

    'name' is the running index, 'addr' the image's src attribute.
    """
    for index, node in enumerate(response.xpath('//img')):
        item = AppleItem()
        item['name'] = index
        item['addr'] = node.xpath('@src').extract()[0]
        yield item
def parse_detail(self, response):
    """Yield an AppleItem holding the headline, lead paragraph and the page
    URL of one article."""
    soup = BeautifulSoup(response.body, 'lxml')
    headline = soup.select('.ndArticle_leftColumn')[0].select('h1')[0]
    lead = soup.select('.ndArticle_margin')[0].select('p')[0]
    article = AppleItem()
    article['title'] = headline.text
    article['content'] = lead.text
    article['url'] = response.url
    yield article
def parse_detail(self, response):
    """Extract title, price and category from a product detail page.

    Returns a populated AppleItem.
    """
    # Removed the reload(sys)/sys.setdefaultencoding('utf-8') hack: it is a
    # Python-2-only workaround that changes codec behaviour process-wide and
    # does not exist on Python 3. BeautifulSoup already decodes the body.
    res = BeautifulSoup(response.body)
    appleitem = AppleItem()
    appleitem['title'] = res.select('h1')[0].text
    appleitem['price'] = res.select('.priceinfo .price')[0].text
    appleitem['category'] = res.select('#cl-breadcrumbs h3')[0].text
    # Debug output of up to the first three breadcrumb levels; the slice
    # avoids the IndexError the fixed [0]/[1]/[2] accesses caused on pages
    # with fewer breadcrumbs. print() replaces the Python 2 print statements.
    for crumb in res.select('#cl-breadcrumbs h3')[:3]:
        print(crumb.text)
    return appleitem
def parse_list(self, response):
    """Walk the realtime-news listing: for each entry yield a summary
    AppleItem, then a request for its detail page."""
    soup = BeautifulSoup(response.body, 'lxml')
    for entry in soup.select('.rtddt'):
        summary = AppleItem()
        summary['name'] = entry.select('h1')[0].text
        summary['url'] = entry.select('a')[0]['href']
        summary['time'] = entry.select('time')[0].text
        print('----------', summary['time'])
        summary['kind'] = entry.select('h2')[0].text
        print('----------', summary['kind'])
        yield summary
        # Follow the entry link to scrape the full article.
        yield scrapy.Request(entry.select('a')[0]['href'], self.parse_detail)
def parse(self, response):
    """Collect app metadata from every //section/div/ul/li node.

    Returns a list of AppleItem with app_name, appstore_link, category and
    img_src fields (each an extracted list, as before).
    """
    # response.xpath replaces the long-deprecated HtmlXPathSelector/.select
    # API (removed in modern Scrapy); the XPath semantics are identical and
    # this matches the style the other parse methods in this file use.
    items = []
    for app in response.xpath('//section/div/ul/li'):
        item = AppleItem()
        item['app_name'] = app.xpath('.//h3/a/text()').extract()
        item['appstore_link'] = app.xpath('.//h3/a/@href').extract()
        item['category'] = app.xpath('.//h4/a/text()').extract()
        item['img_src'] = app.xpath('.//a/img/@src').extract()
        items.append(item)
    return items
def next_price(self, response):
    """Yield an AppleItem pairing the listed price with a location tag.

    The location is normally the "(CC)" country code scraped from the page
    <title>; two storefronts are special-cased because their titles lack it.
    """
    item = AppleItem()
    # Raw strings: '\s' and '\(' in ordinary string literals are invalid
    # escape sequences (SyntaxWarning, future SyntaxError, on modern Python).
    price = response.css('.as-price-currentprice span::text')[0].re(
        r'[^\s-].*[^\s-]')
    location = response.css('title::text').re(r'\(.+\)')
    item['price'] = price
    # Two websites have exceptions: no parsable location in the title.
    if price == ['$1,299.00']:
        item['location'] = ['(US)']
    elif price == ['¥142,800 (税別)']:
        item['location'] = ['(JP)']
    else:
        item['location'] = location
    yield item
def parse(self, response):
    """Scrape the realtime-news listing: yield a summary item plus a
    detail-page request per entry, then queue pages 2-9 of the listing.

    Bug fix: str.index() raises ValueError when the substring is absent, so
    the original `.index(...) >= 0` guard could never evaluate False — it
    crashed on non-matching URLs instead. A substring membership test
    expresses the intent and simply skips foreign URLs.
    """
    if 'https://tw.appledaily.com/new/realtime/' in response.url:
        res = BeautifulSoup(response.body, 'lxml')
        for news in res.select('.rtddt'):
            item = AppleItem()
            item['name'] = news.select('h1')[0].text
            item['url'] = news.select('a')[0]['href']
            item['time'] = news.select('time')[0].text
            print('----------', item['time'])
            item['kind'] = news.select('h2')[0].text
            print('----------', item['kind'])
            yield item
            # Follow the entry link for the full article.
            yield scrapy.Request(
                news.select('a')[0]['href'], self.parse_detail)
        # Paginate through listing pages 2..9.
        for page in range(2, 10):
            yield scrapy.Request(
                'https://tw.appledaily.com/new/realtime/' + str(page),
                self.parse_list)
def parse(self, response):
    """Yield one AppleItem per .RTitem row: date, time, views, title, url.

    Bug fix: the original created a single AppleItem before the loop and
    yielded that same mutable object every iteration — since Scrapy may
    buffer items, all emitted items could end up carrying the last row's
    values. A fresh item is now built per row. `title` also uses
    extract_first() for consistency with the other fields (the original's
    .extract()[0] raised IndexError on an empty match instead of yielding
    None).
    """
    for row in response.xpath("//div[@class='RTitem']"):
        item = AppleItem()
        item['date'] = row.xpath(
            "div[@class='RTitemRHS']/div[@class='date']/text()"
        ).extract_first()
        item['time'] = row.xpath(
            "div[@class='RTitemRHS']/div[@class='time']/text()"
        ).extract_first()
        item['views'] = row.xpath(
            "div[@class='RTitemRHS']/div[@class='view02']/text()"
        ).extract_first()
        item['title'] = row.xpath(
            "div[@class='RTitemRHS']/div[@class='text']/a/text()"
        ).extract_first()
        item['url'] = row.xpath(
            "div[@class='RTitemRHS']/div[@class='text']/a/@href"
        ).extract_first()
        yield item
def parse(self, response):
    """Scrape a launch-schedule page into six parallel lists (month, year,
    day, time, mission, where) and return them packed in one AppleItem.

    The code below is left logically unchanged (only reformatted and
    commented): the manual character scanning is order-dependent and risky
    to restyle.
    """
    counter = 0
    list_month = []
    list_year = []
    list_day = []
    list_time = []
    list_mission = []
    list_where = []
    list_TBD = []
    res = BeautifulSoup(response.body, features="lxml")
    appleitem = AppleItem()
    # Month and year columns: one entry per scheduled launch.
    for news in res.select('.sc-launch__month'):
        print(news.text)
        list_month.append(news.text)
        #appleitem['month'] = news.text
    for news in res.select('.sc-launch__year'):
        print(news.text)
        list_year.append(news.text)
        #appleitem['year'] = news.text
    # Days reading 'TBD' are collected separately, then appended after the
    # concrete days so known dates come first in list_day.
    for news in res.select('.sc-launch__day'):
        print(news.text)
        if (news.text == 'TBD'):
            list_TBD.append(news.text)
        else:
            list_day.append(news.text)
            #appleitem['day'] = news.text
    for item in list_TBD:
        list_day.append(item)
    # Time column: scan backwards from the end of the text for the last
    # non-whitespace character (end), then keep a 13-char window around it.
    # NOTE(review): have_content is first assigned inside the preceding
    # branch — if the very first character examined is a space, the second
    # `if` reads it before assignment and raises NameError; presumably the
    # page text always ends with non-space content before padding — confirm.
    for news in res.select('.sc-launch__time'):
        length = len(str(news.text)) - 1
        while (length):
            if ((str(news.text)[length] != ' ') &
                    (str(news.text)[length] != '\n')):
                end = length
                have_content = True
            if (have_content & (str(news.text)[length] == ' ')):
                #start = length
                have_content = False
                #print("start = ", start, "end = ", end)
                break
            length = length - 1
        if (length != 0):
            print(str(news.text)[end - 6:end + 7])
            list_time.append(str(news.text)[end - 6:end + 7])
            #appleitem['time'] = "none"
        elif (length == 0):
            print("none")
            list_time.append("none")
    # Content nodes alternate: odd-numbered = mission name, even = location.
    # start2 = first alphabetic character, end2 = last non-whitespace one.
    for news in res.select('.sc-launch__content'):
        length = len(str(news.text)) - 1
        for i in range(0, length):
            if (str(news.text)[i].isalpha()):
                start2 = i
                break
        while (length):
            if ((str(news.text)[length] != ' ') &
                    (str(news.text)[length] != '\n')):
                end2 = length
                break
            else:
                length = length - 1
        counter = counter + 1
        if (counter % 2 == 1):
            #mission
            print(news.text[start2:end2])
            list_mission.append(news.text[start2:end2])
            #appleitem['mission'] = news.text[start2:end2]
        else:
            #where
            print(news.text[start2:end2])
            list_where.append(news.text[start2:end2])
            #appleitem['where'] = news.text[start2:end2]
    # TBD launches were moved to the tail of list_day, so shift the same
    # number of "none" placeholders from the head of list_time to its tail
    # to keep the parallel lists aligned row-by-row.
    size_of_continuous_none = len(list_TBD)
    del list_time[0:size_of_continuous_none]
    for i in range(0, size_of_continuous_none):
        list_time.append("none")
    appleitem['month'] = list_month
    appleitem['year'] = list_year
    appleitem['day'] = list_day
    appleitem['time'] = list_time
    appleitem['mission'] = list_mission
    appleitem['where'] = list_where
    return appleitem
def parse_detail(self, response):
    """Return an AppleItem with one entry's headline and publication date."""
    soup = BeautifulSoup(response.body, 'html.parser')
    header = soup.select('.entry-header h1')[0]
    stamp = soup.select('.entry-header time')[0]
    entry = AppleItem()
    entry['title'] = header.text
    entry['date'] = stamp.text
    return entry