예제 #1
0
 def parse_detail(self, response):
     """Parse an article detail page into an AppleItem.

     Fills title/content/time from fixed CSS selectors and returns the
     populated item to Scrapy. Raises IndexError if a selector matches
     nothing (same as the original behavior).
     """
     # Name the parser explicitly: a bare BeautifulSoup(...) call picks
     # whichever parser happens to be installed (GuessedAtParserWarning)
     # and can parse differently across machines. 'lxml' matches the
     # other examples in this file.
     res = BeautifulSoup(response.body, 'lxml')
     appleitem = AppleItem()
     appleitem["title"] = res.select("#h1")[0].text
     appleitem["content"] = res.select(".trans")[0].text
     appleitem["time"] = res.select(".gggs time")[0].text
     return appleitem
예제 #2
0
	def parse_detail(self, response):
		"""Extract title, first paragraph and timestamp into an AppleItem."""
		soup = BeautifulSoup(response.body, 'lxml')
		item = AppleItem()
		# Field name -> CSS selector; the first match's text is stored.
		selectors = (
			('title', 'h1'),
			('content', '.ndArticle_margin p'),
			('time', '.ndArticle_creat'),
		)
		for field, css in selectors:
			item[field] = soup.select(css)[0].text
		return item
예제 #3
0
 def parse_detail(self, response):
     """Parse an article detail page into an AppleItem.

     Populates title/content/time from fixed CSS selectors; raises
     IndexError when a selector finds no match (unchanged behavior).
     """
     # Name the parser explicitly: omitting it triggers
     # GuessedAtParserWarning and makes parsing depend on which parser
     # libraries are installed. 'lxml' matches the file's other examples.
     appleItem = AppleItem()
     res = BeautifulSoup(response.body, 'lxml')
     appleItem['title'] = res.select('#h1')[0].text
     appleItem['content'] = res.select('.trans')[0].text
     appleItem['time'] = res.select('.gggs time')[0].text
     return appleItem
예제 #4
0
 def parse_detail(self, response):
     """Yield one AppleItem holding the article's title and first body text."""
     item = AppleItem()
     # extract_first() returns None when the xpath matches nothing.
     title_xpath = '//article//h1/text()'
     content_xpath = '//*[@id="article"]//div//p//text()'
     item['title'] = response.xpath(title_xpath).extract_first()
     item['content'] = response.xpath(content_xpath).extract_first()
     yield item
예제 #5
0
 def parse_detail(self, response):
     """Return an AppleItem built from the page's h1/.case/.stime elements."""
     soup = BeautifulSoup(response.text, 'lxml')
     item = AppleItem()
     item['title'] = soup.select('h1')[0].text
     item['content'] = soup.select('.case')[0].text
     item['time'] = soup.select('.stime')[0].text
     return item
예제 #6
0
 def parse_detail(self, response):
     """Parse an article detail page into an AppleItem.

     Fills title/content/time from fixed CSS selectors and returns the
     item; raises IndexError if any selector matches nothing.
     """
     # Name the parser explicitly: a bare BeautifulSoup(...) emits
     # GuessedAtParserWarning and its result depends on which parsers
     # are installed. 'lxml' matches the file's other examples.
     res = BeautifulSoup(response.body, 'lxml')
     appleitem = AppleItem()
     # Define all fields.
     appleitem['title'] = res.select('#h1')[0].text
     appleitem['content'] = res.select('#summary')[0].text
     appleitem['time'] = res.select('.gggs time')[0].text
     return appleitem
예제 #7
0
    def parse(self, response):
        """Yield one AppleItem per <img> tag, numbered in document order.

        'name' is the 0-based position of the image on the page and
        'addr' is its src attribute (IndexError if an <img> has no src,
        matching the original behavior).
        """
        # enumerate() replaces the original hand-rolled `a = 0; a += 1`
        # counter — same values, clearer intent.
        for index, img in enumerate(response.xpath('//img')):
            item = AppleItem()
            item['name'] = index
            item['addr'] = img.xpath('@src').extract()[0]
            yield item
예제 #8
0
 def parse_detail(self, response):
     """Yield an AppleItem with the article's title, first paragraph and URL."""
     soup = BeautifulSoup(response.body, 'lxml')
     item = AppleItem()
     # Title lives in the left column; body paragraphs in the margin block.
     left_column = soup.select('.ndArticle_leftColumn')[0]
     item['title'] = left_column.select('h1')[0].text
     body = soup.select('.ndArticle_margin')[0]
     item['content'] = body.select('p')[0].text
     item['url'] = response.url
     yield item
예제 #9
0
 def parse_detail(self, response):
     """Parse a product page into an AppleItem (title, price, category).

     Also prints the first three breadcrumb headings for debugging,
     as the original did.
     """
     # The original's Python-2-only `reload(sys)` /
     # `sys.setdefaultencoding('utf-8')` hack was removed: it does not
     # exist on Python 3 and BeautifulSoup already returns unicode text.
     # Parser named explicitly to avoid GuessedAtParserWarning.
     res = BeautifulSoup(response.body, 'lxml')
     appleitem = AppleItem()
     appleitem['title'] = res.select('h1')[0].text
     appleitem['price'] = res.select('.priceinfo .price')[0].text
     crumbs = res.select('#cl-breadcrumbs h3')
     appleitem['category'] = crumbs[0].text
     # Debug output: the breadcrumb trail (py2 print statements -> py3
     # print() calls, consistent with the other examples in this file).
     for crumb in crumbs[:3]:
         print(crumb.text)
     return appleitem
예제 #10
0
 def parse_list(self, response):
     """Yield an AppleItem per '.rtddt' entry plus a request for its detail page."""
     soup = BeautifulSoup(response.body, 'lxml')
     for entry in soup.select('.rtddt'):
         item = AppleItem()
         item['name'] = entry.select('h1')[0].text
         link = entry.select('a')[0]['href']
         item['url'] = link
         item['time'] = entry.select('time')[0].text
         print('----------', item['time'])
         item['kind'] = entry.select('h2')[0].text
         print('----------', item['kind'])
         yield item
         # Follow the article link so parse_detail can scrape the body.
         yield scrapy.Request(link, self.parse_detail)
예제 #11
0
    def parse(self, response):
        """Collect app metadata from every listing row and return the items."""
        hxs = HtmlXPathSelector(response)
        # Item field -> relative xpath within one listing row.
        field_paths = {
            'app_name': './/h3/a/text()',
            'appstore_link': './/h3/a/@href',
            'category': './/h4/a/text()',
            'img_src': './/a/img/@src',
        }
        results = []
        for row in hxs.select('//section/div/ul/li'):
            entry = AppleItem()
            for field, xpath in field_paths.items():
                entry[field] = row.select(xpath).extract()
            results.append(entry)
        return results
예제 #12
0
파일: mbp.py 프로젝트: Felix-Suen/Apple
    def next_price(self, response):
        """Scrape the current price and store location from an Apple page.

        Two storefronts (US, JP) don't expose a usable location in the
        page <title>, so those are hard-coded by matching the price
        string, as the original did.
        """
        item = AppleItem()
        # Raw strings for the regexes: '\s' and '\(' in plain literals
        # are invalid escape sequences (SyntaxWarning on modern Python).
        price = response.css('.as-price-currentprice span::text')[0].re(
            r'[^\s-].*[^\s-]')
        location = response.css('title::text').re(r'\(.+\)')

        item['price'] = price

        # Two websites have exceptions.
        if price == ['$1,299.00']:
            item['location'] = ['(US)']
        elif price == ['¥142,800 (税別)']:
            item['location'] = ['(JP)']
        else:
            item['location'] = location

        yield item
예제 #13
0
    def parse(self, response):
        """Handle the realtime-news landing page.

        For each '.rtddt' entry, yields an AppleItem and a request for
        its detail page; then queues list pages 2-9.
        """
        # BUG FIX: str.index() raises ValueError when the substring is
        # absent and never returns a negative value (unlike JS indexOf),
        # so the original `index(...) >= 0` guard could crash the spider.
        # A plain containment test is the intended check.
        if 'https://tw.appledaily.com/new/realtime/' in response.url:
            res = BeautifulSoup(response.body, 'lxml')
            for news in res.select('.rtddt'):
                item = AppleItem()
                item['name'] = news.select('h1')[0].text
                item['url'] = news.select('a')[0]['href']
                item['time'] = news.select('time')[0].text
                print('----------', item['time'])
                item['kind'] = news.select('h2')[0].text
                print('----------', item['kind'])
                yield item
                # Follow the article link to scrape the full story.
                yield scrapy.Request(
                    news.select('a')[0]['href'], self.parse_detail)

            # Queue the remaining paginated list pages.
            for page in range(2, 10):
                yield scrapy.Request(
                    'https://tw.appledaily.com/new/realtime/' + str(page),
                    self.parse_list)
예제 #14
0
    def parse(self, response):
        """Yield one AppleItem per '.RTitem' row on the realtime list page."""
        for row in response.xpath("//div[@class='RTitem']"):
            # BUG FIX: the original created a single AppleItem before the
            # loop and yielded that same mutable instance for every row;
            # downstream consumers holding references would see only the
            # last row's values. A fresh item is created per row instead.
            item = AppleItem()
            item['date'] = row.xpath(
                "div[@class='RTitemRHS']/div[@class='date']/text()"
            ).extract_first()
            item['time'] = row.xpath(
                "div[@class='RTitemRHS']/div[@class='time']/text()"
            ).extract_first()
            item['views'] = row.xpath(
                "div[@class='RTitemRHS']/div[@class='view02']/text()"
            ).extract_first()
            # extract()[0] kept (not extract_first): a row without a
            # title link raises IndexError, matching the original.
            item['title'] = row.xpath(
                "div[@class='RTitemRHS']/div[@class='text']/a/text()"
            ).extract()[0]
            item['url'] = row.xpath(
                "div[@class='RTitemRHS']/div[@class='text']/a/@href"
            ).extract_first()

            yield item
    def parse(self, response):
        """Scrape a launch-schedule page into a single AppleItem of lists.

        Collects parallel lists (month, year, day, time, mission, where)
        from the page's '.sc-launch__*' elements and stores them all in
        one item. NOTE(review): one item holding parallel lists (rather
        than one item per launch) looks intentional — confirm downstream
        usage before restructuring.
        """
        counter = 0
        list_month = []
        list_year = []
        list_day = []
        list_time = []
        list_mission = []
        list_where = []
        list_TBD = []
        res = BeautifulSoup(response.body, features="lxml")
        appleitem = AppleItem()

        # Launch month and year, one entry per scheduled launch.
        for news in res.select('.sc-launch__month'):
            print(news.text)
            list_month.append(news.text)
            #appleitem['month'] = news.text
        for news in res.select('.sc-launch__year'):
            print(news.text)
            list_year.append(news.text)
            #appleitem['year'] = news.text
        # Days: 'TBD' entries are collected separately and re-appended to
        # the end of list_day below, so unknown dates end up last.
        for news in res.select('.sc-launch__day'):
            print(news.text)
            if (news.text == 'TBD'):
                list_TBD.append(news.text)
            else:
                list_day.append(news.text)
                #appleitem['day'] = news.text

        for item in list_TBD:
            list_day.append(item)

        # Scan each time string backwards to locate the last non-blank
        # run, then slice a fixed 13-character window around its end.
        for news in res.select('.sc-launch__time'):
            length = len(str(news.text)) - 1

            # NOTE(review): `have_content` is only assigned inside this
            # loop; if the last character is a space, the second `if`
            # reads it before any assignment (UnboundLocalError risk).
            # Also `&` is the bitwise operator — it works on bools but
            # does not short-circuit like `and`.
            while (length):
                if ((str(news.text)[length] != ' ') &
                    (str(news.text)[length] != '\n')):
                    end = length
                    have_content = True
                if (have_content & (str(news.text)[length] == ' ')):
                    #start = length
                    have_content = False
                    #print("start = ", start, "end = ", end)
                    break
                length = length - 1

            # length == 0 means the scan ran off the front: no usable
            # time text, so record the placeholder "none".
            if (length != 0):
                print(str(news.text)[end - 6:end + 7])
                list_time.append(str(news.text)[end - 6:end + 7])
                #appleitem['time'] = "none"
            elif (length == 0):
                print("none")
                list_time.append("none")

        # Content blocks alternate: odd entries are the mission name,
        # even entries the launch site ("where").
        for news in res.select('.sc-launch__content'):
            length = len(str(news.text)) - 1

            # First alphabetic character marks the start of the text.
            for i in range(0, length):
                if (str(news.text)[i].isalpha()):
                    start2 = i
                    break

            # Walk backwards past trailing blanks to find the text's end.
            while (length):
                if ((str(news.text)[length] != ' ') &
                    (str(news.text)[length] != '\n')):
                    end2 = length
                    break
                else:
                    length = length - 1

            counter = counter + 1
            if (counter % 2 == 1):  #mission
                print(news.text[start2:end2])
                list_mission.append(news.text[start2:end2])
                #appleitem['mission'] = news.text[start2:end2]
            else:  #where
                print(news.text[start2:end2])
                list_where.append(news.text[start2:end2])
                #appleitem['where'] = news.text[start2:end2]

        # Drop the leading time entries and pad with "none" so list_time
        # lines up with the reordered list_day (TBD days were moved to
        # the end above). Assumes the TBD launches are the first entries
        # on the page — TODO confirm against the live markup.
        size_of_continuous_none = len(list_TBD)

        del list_time[0:size_of_continuous_none]
        for i in range(0, size_of_continuous_none):
            list_time.append("none")

        appleitem['month'] = list_month
        appleitem['year'] = list_year
        appleitem['day'] = list_day
        appleitem['time'] = list_time
        appleitem['mission'] = list_mission
        appleitem['where'] = list_where

        return appleitem
예제 #16
0
 def parse_detail(self, response):
     """Return an AppleItem carrying the entry header's title and date."""
     soup = BeautifulSoup(response.body, 'html.parser')
     item = AppleItem()
     header = '.entry-header'
     item['title'] = soup.select(header + ' h1')[0].text
     item['date'] = soup.select(header + ' time')[0].text
     return item