Esempi in Python per TutorialItem, esempi in Python per tutorial.items.TutorialItem

Esempio n. 1

0

Mostra file

File: page_spider.py Progetto: copyNdpaste/crawler

 def parse(self, response): #페이지의 기업 홈페이지 주소를 스크래핑함
     self.log('I just visited: ' + response.url)
     infos = response.xpath('//*[@id="devStarterForm"]/div[2]/ul//li')
     #여기서 페이지 이동해서 값 가져오는 함수 호출(???)
     for info in infos:
         item = TutorialItem()
         item['company_name'] = info.xpath('div[1]/div[1]/a/text()')[0].extract()
         link = info.xpath('div[1]/div[1]/a/@href')[0].extract()
         if link[0] == '/':
             self.log('first char is /')
             item['company_info'] = 'www.jobkorea.co.kr'+info.xpath('div[1]/div[1]/a/@href')[0].extract()
         elif link[0] == 'h':
             item['company_info'] = info.xpath('div[1]/div[1]/a/@href')[0].extract()
         item['title'] = info.xpath('div[2]/div[1]/a/span/text()')[0].extract()
         deadline = info.xpath('div[4]/span[@class="day"]/text() | div[4]/span[@class="day schedule"]/text() | div[4]/span[@class="day tomorrow"]/text() | div[4]/span[@class="day today"]/text()')[:].extract()
         item['deadline'] = deadline
         item['achievement'] = info.xpath('div[3]/span[1]/text()')[0].extract()
         item['career'] = info.xpath('div[3]/strong/text()')[0].extract()
         item['area'] = info.xpath('div[3]/span[2]/text()')[0].extract()
         item['job'] = info.xpath('div[2]/div[2]/span/text()')[:].extract()
         link = response.urljoin(info.xpath('div[2]/div[1]/a/@href')[0].extract())
         #print(link)
         #yield scrapy.Request(link, callback=self.parse_homepage)
         #lenth = len(item)
         #for i in range(lenth):
         #    self.collection.insert({list(item.keys())[i]:list(item.values())[i]})
         for key in item:
             self.dictionary[key] = item.get(key)
         self.collection.insert(self.dictionary,manipulate=False)
         #self.collection.insert({'hi': 'www.jobkorea.co.kr/Recruit/Co_Read/C/fany77sy?Oem_Code=C1&PageGbn=ST',
         #                        "sdfsdf":12})
         yield item

Esempio n. 2

0

Mostra file

    def parse(self, response):  #parse方法用来处理request返回的结果。关于这一部分的一些内容，我在后面详细介绍。
        title = response.selector.xpath(
            ".//*[@id='content_bg']/div[2]/div[1]/div[1]/div[2]/p/text()"
        ).extract()
        version = response.selector.xpath(
            ".//*[@id='content_bg']/div[2]/div[1]/div[1]/div[2]/ul[1]/li[1]/a/text()"
        ).extract()
        grade = response.selector.xpath(
            ".//*[@id='content_bg']/div[2]/div[1]/div[1]/div[2]/ul[1]/li[2]/a/text()"
        ).extract()
        subject = response.selector.xpath(
            ".//*[@id='content_bg']/div[2]/div[1]/div[1]/div[2]/ul[1]/li[3]/a/text()"
        ).extract()
        publishing = response.selector.xpath(
            ".//*[@id='content_bg']/div[2]/div[1]/div[1]/div[2]/ul[1]/li[4]/text()"
        ).extract()

        Tutorial = TutorialItem()
        # 存入items
        Tutorial["title"] = title
        Tutorial["version"] = version
        Tutorial["grade"] = grade
        Tutorial["subject"] = subject
        Tutorial["publishing"] = publishing
        yield Tutorial

Esempio n. 3

0

Mostra file

File: books.py Progetto: allanwong/SpiderHouse

    def parse(self, response):

        try:

            for book in response.css('article.product_pod'):
                items = TutorialItem()

                #抓取商品名称
                items['name'] = book.xpath('./h3/a/@title').extract_first()

                #抓取商品价格
                #price = book.xpath('./div[2]/p[1]/text()').extract_first()
                items['price'] = book.css(
                    'p.price_color::text').extract_first()

                #抓取商品链接
                href = book.xpath('./div[1]/a/@href').extract_first()
                href = response.urljoin(href)
                items['href'] = href

                yield scrapy.Request(url=href,
                                     meta={'items': items},
                                     callback=self.pare_detail,
                                     dont_filter=True)

        except BaseException, err:
            print(err)

Esempio n. 4

0

Mostra file

    def parse(self, response):
        filename = response.url.split('/')[-2] + ".txt"

        item = TutorialItem()
        item['title'] = response.xpath(
            "/html/body/div[8]/div[2]/div").extract()
        yield item

Esempio n. 5

0

Mostra file

File: dmoz_spider.py Progetto: ryanzicky/1

    def parse(self, response):
        for line in response.xpath('//li[@class="j_thread_list clearfix"]'):
            # 初始化item对象保存爬取的信息
            item = TutorialItem()
            # 解析爬取的内容
            item['title'] = line.xpath(
                './/div[contains(@class, "threadlist_title pull_left j_th_tit")]/a/text()'
            ).extract()
            item['author'] = line.xpath(
                './/div[contains(@class, "threadlist_author pull_right")]//span[contains(@class, "frs-author-name-wrap")]/a/text()'
            ).extract()
            item['reply'] = line.xpath(
                './/div[contains(@class, "col2_left j_threadlist_li_left")]/span/text()'
            ).extract()

            self.logger.info(
                'ssssssssssssssssssssssssssssssssssssssssssssssssssssssssss')
            self.logger.info({
                'title': item['title'],
                'author': item['author'],
                'reply': item['reply']
            })
            self.logger.info(
                'qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq')
            yield item

Esempio n. 6

0

Mostra file

File: book2.py Progetto: lp2016/scrapy

 def getList(self, response):
     item = TutorialItem()
     res = response.xpath('//div[@class="volume"]')
     url_list = []
     if response.xpath('//div[@class="volume"][1]/h3/text()').extract(
     )[1].strip() == '作品相关':
         for i in range(2, len(res) + 1):
             url_list.extend(
                 response.xpath(
                     '//div[@class="volume"][{}]//ul//li//a/@href'.format(
                         i)).extract())
     else:
         for i in range(1, len(res) + 1):
             url_list.extend(
                 response.xpath(
                     '//div[@class="volume"][{}]//ul//li//a/@href'.format(
                         i)).extract())
     chapterID = 1
     for url in url_list:
         item['chapterID'] = chapterID
         yield scrapy.Request(url='https:' + url,
                              callback=self.getContent,
                              meta={'key': chapterID},
                              errback=self.errback_httpbin)
         chapterID += 1

Esempio n. 7

0

Mostra file

File: dmoz_spider.py Progetto: tim1004457/python_script

 def parse(self, response):
     print("receive data")
     host = 'http://www.t66y.com/'
     hxs = Selector(response)
     sites = hxs.xpath('//tr[@class="tr3 t_one"]')
     for site in sites:
         item = TutorialItem()
         linkSite = site.xpath(
             'td[@style="text-align:left;padding-left:8px"]/h3/a[@target="_blank"]/@href'
         )
         titleSite = site.xpath(
             'td[@style="text-align:left;padding-left:8px"]/h3/a[@target="_blank"]/text()'
         )
         timeSite = site.xpath('td/a[@class="f10"]/text()')
         countTime = site.xpath('td[@class="tal f10 y-style"]/text()')
         if titleSite.__len__() == 0:
             continue
         for title in titleSite:
             item['title'] = title.extract()
         for link in linkSite:
             item['link'] = host + (link.extract())
         for time in timeSite:
             item['publicTime'] = (time.extract())
         for count in countTime:
             item['clickTimes'] = (count.extract())
         self.saveInDb(item)

Esempio n. 8

0

Mostra file

 def detail_parse(self, response):
     items = TutorialItem()
     baseurl = 'https://www.tjnhm.com/'
     items['midx'] = '16'
     url = ''
     if response.xpath('/html/body/div[2]/div[1]/div[2]/p[1]/span/img/@src'
                       ).get() != None:
         url = response.xpath(
             '/html/body/div[2]/div[1]/div[2]/p[1]/span/img/@src').get()
     else:
         url = response.xpath(
             '/html/body/div[2]/div[1]/div[2]/p[1]/img/@src').get()
     if url == None:
         return
     items['pic'] = baseurl + url
     items['name'] = response.xpath(
         '/html/body/div[2]/div[1]/div[2]/h1/text()').get().strip()
     t = ''
     for i in response.xpath('//*[@id="aboutus_text"]//span'):
         if i.xpath('text()').get() == None:
             return
         t = t + i.xpath('text()').get().strip()
     items['text'] = t
     print(items['name'])
     print(items['text'])
     print(items['pic'])
     yield items

Esempio n. 9

0

Mostra file

File: leboncoin.py Progetto: AlexisAubineau/Scrapy-Crawling

    def parse(self, response):
        jsonresponse = json.loads(response.text)
        for ads in jsonresponse["ads"]:
            item = TutorialItem()
            if "subject" in ads:
                item["subject"] = ads["subject"]
            if "price" in ads:
                item["price"] = ads["price"][0]
            if "category_name" in ads:
                item["category"] = ads["category_name"]
            if "location" in ads:
                if "city" in ads["location"]:
                    item["location"] = ads["location"]["city"]
            item["url"] = ads["url"]
            yield {'object': item}
        self.i = self.i + 1
        print(self.i)

        if len(jsonresponse["ads"]) == 35:
            self.frmdata["offset"] = self.frmdata["offset"] + 35
            yield scrapy.Request(self.url,
                                 callback=self.parse,
                                 method='POST',
                                 body=json.dumps(self.frmdata),
                                 headers=self.headr,
                                 dont_filter=True)

Esempio n. 10

0

Mostra file

 def parse(self, response):
     self.logger.info('A response from %s just arrived!', response.url)
     # filename = response.url.split("/")[-1] + '.html'
     # with open(filename, 'wb') as f:
     #     f.write(response.body)
     
     
     
     # items = []
     # for node in response.xpath("//div[@class='col-md-8']/div[@class='quote']"):
     #     item=TutorialItem()
     #     content = node.xpath("./span[1]/text()").extract()
     #     author=node.xpath("./span[2]/small/text()").extract()
     #     tags = node.xpath('./div[@class="tags"]/a/text()').extract()
     #     item["author"]=author
     #     item["content"]=content
     #     item["tags"]=tags
     #     items.append(item)
     #     yield item
     # return items
     for node in response.xpath("//div[@class='col-md-8']/div[@class='quote']"):
         item = TutorialItem()
         item["author"]=node.xpath("./span[2]/small/text()").extract()
         item["content"]=node.xpath("./span[1]/text()").extract()
         item["tags"]=node.xpath('./div[@class="tags"]/a/text()').extract()
         yield item
     for href in response.xpath('//li[@class="next"]/a/@href'):
         if href:
             url=response.urljoin(href.extract())
             yield scrapy.Request(url,callback=self.parse)

Esempio n. 11

0

Mostra file

File: quotes.py Progetto: nkmk/scrapy-example

 def parse(self, response):
     for quote in response.css('div.quote'):
         item = TutorialItem()
         item['author'] = quote.css('small.author::text').extract_first()
         item['text'] = quote.css('span.text::text').extract_first()
         item['tags'] = quote.css('div.tags a.tag::text').extract()
         yield item

Esempio n. 12

0

Mostra file

File: Crawler_Flipkart.py Progetto: manik005/Crawler

 def parse_list(self, response):
     hxs = Selector(response)
     titles = hxs.select(
         "//div[contains(@class,'product-unit unit-4 browse-product new-design')]"
     )
     items = []
     count1 = 0
     for title in titles:
         count1 = count1 + 1
         item = TutorialItem()
         item['model'] = str(
             title.select(".//div[contains(@class,'pu-title')]/a/text()").
             extract()).encode('utf-8').strip()
         item['offer'] = title.select(
             ".//div[contains(@class,'pu-final')]/span/text()").extract()
         item['image'] = title.select(
             ".//div[contains(@class,'pu-visual-section')]/a/img/@data-src")
         item['standard_url'] = "http://www.flipkart.com" + \
             title.select(
                 ".//div[contains(@class,'pu-title')]/a/@href")[0].extract()
         # return items
         request = Request(item['standard_url'], callback=self.new_features)
         request.meta['item'] = item
         items.append(item)
         yield request

Esempio n. 13

0

Mostra file

 def parse(self, response):
     item = TutorialItem()
     selector = Selector(response)
     Movies = selector.xpath('//div[@class="info"]')
     for eachMovie in Movies:
         title = eachMovie.xpath('div[@class="hd"]/a/span/text()').extract()  # 多个span标签
         fullTitle = "".join(title)  # 将多个字符串无缝连接起来
         movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
         star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()[0]
         quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()
         # quote可能为空，因此需要先进行判断
         if quote:
             quote = quote[0]
         else:
             quote = ''
         item['title'] = fullTitle
         item['movieInfo'] = ';'.join(movieInfo)
         item['star'] = star
         item['quote'] = quote
         yield item
     nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
     # 第10页是最后一页，没有下一页的链接
     if nextLink:
         nextLink = nextLink[0]
         yield Request(urljoin(response.url, nextLink), callback=self.parse)

Esempio n. 14

0

Mostra file

File: QuotesSpider.py Progetto: snjl/python.spider.scrapy_test

    def parse(self, response):
        bs_obj = BeautifulSoup(response.text)
        items = bs_obj.find_all('div', {'class': 'quote'})  # 获取列表
        for item in items:
            item_field = TutorialItem()  # 每一个item信息存储到item_field里
            text = item.find('span', {
                'itemprop': 'text',
                'class': 'text'
            }).text
            author = item.find('small', {'class': 'author'}).text
            tags = item.find_all('a', {'class': 'tag'})
            tags = [tag.text for tag in tags]  # 获取tags列表里的每一个tag的文本
            item_field['text'] = text  # 存储数据到item_field
            item_field['author'] = author
            item_field['tags'] = tags
            self.logger.info(item_field['text'])
            yield item_field  # 使用生成器，每次调用都会从结束处开始，会生成新的item_field，爬取和计算后会返回数据

        # 获取下一页，由于使用BeautifulSoup比较麻烦，而且错误处理比较不方便，所以使用css选择器
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        # 如果next_page存在
        if next_page:
            # 使用urljoin获取绝对地址
            next_url = response.urljoin(next_page)
            # 回调函数，继续调用该parse函数，传入next_url进行请求,生成一个新的Request加入队列
            yield scrapy.Request(url=next_url, callback=self.parse)

Esempio n. 15

0

Mostra file

File: first_spider.py Progetto: gogogo365/learning_spider

 def parse(self, response):
     item = TutorialItem()
     item["identification"] = response.xpath('//*[@id="b_footerItems"]//text()').extract()
     item["name"] = response.xpath('//*[@id="sbox"]//text()').extract()
     
     # print(next_page)
     yield item

Esempio n. 16

0

Mostra file

 def parse(self, response):
     ul = response.xpath('//ul[@class="gl-warp clearfix"]')
     items = []
     for li in ul.xpath('.//li[@class="gl-item"]'):
         item = TutorialItem()
         price = li.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()').extract()
         title = li.xpath(
             './/div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/em/text()').extract()
         description = li.xpath(
             './/div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/@title').extract()
         img = li.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/img/@src').extract()
         if img:
             pic1 = img
         else:
             pic1 = li.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/img/@data-lazy-img').extract()
         pic2 = pic1
         item['sourceName'] = "jd"
         sourceURL = li.xpath(
             './/div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/@href').extract()
         item['title'] = title[0].encode('utf-8')
         if price:
             item['price'] = price[0]
         else:
             item['price'] = 0
         item['description'] = description[0].encode('utf-8')
         item['pic1'] = pic1[0].encode('utf-8')
         item['pic2'] = pic2[0].encode('utf-8')
         item['sourceURL'] = sourceURL[0].encode('utf-8')
         items.append(item)
     return items

Esempio n. 17

0

Mostra file

 def parse(self, response):
     if response.status == '404':
         return
     # print(response.body.decode('utf-8'))
     baseurl = 'http://www.19371213.com.cn/collection/zdwwjs/201811'
     item = TutorialItem()
     item['midx'] = '44'
     item['name'] = response.xpath(
         '/html/body/div[6]/div/div/div/div[2]/div/div/article/section/div[2]/header/h4/text()'
     ).get().strip()
     t = ''
     for i in response.xpath(
             '/html/body/div[6]/div/div/div/div[2]/div/div/article/section/div[2]/div/div/div//img'
     ):
         t = i.xpath('@src').get()[1:]
     item['pic'] = baseurl + t
     t = ''
     for i in response.xpath(
             '/html/body/div[6]/div/div/div/div[2]/div/div/article/section/div[2]/div/div/div//p'
     ):
         if i.xpath('text()').get() != None:
             t = t + i.xpath('text()').get().strip()
     item['text'] = t
     print(item['name'])
     print(item['text'])
     print(item['pic'])
     yield item

Esempio n. 18

0

Mostra file

    def parse(self, response):
        item = TutorialItem()
        selector = Selector(response)
        movies = selector.xpath('//div[@class="info"]')
        for movie in movies:
            title = movie.xpath('div[@class="hd"]/a/span/text()').extract()
            fullTitle = ''
            for each in title:
                fullTitle += each
            movieInfo = movie.xpath('div[@class="bd"]/p/text()').extract()
            star = movie.xpath(
                'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()[0]
            quote = movie.xpath('div[@class="bd"]/p/span/text()').extract()
            if quote:
                quote = quote[0]
            else:
                quote = ''
            item['title'] = fullTitle
            item['movieInfo'] = ';'.join(movieInfo).replace(' ', '').replace(
                '\n', '')
            item['star'] = star[0]
            item['quote'] = quote
            yield item

        nextPage = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextPage:
            nextPage = nextPage[0]
            print(self.url + str(nextPage))
            yield Request(self.url + str(nextPage), callback=self.parse)

Esempio n. 19

0

Mostra file

File: spider.py Progetto: DakotaShen/NextGame-Game-Search-Engine

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//*[@id="js-sort-filter-results"]/section/article')
        items = []

        for site in sites:
            item = TutorialItem()
            # title = site.xpath('a/div[2]/h3/text()').extract()
            # content = site.xpath('a/div[2]/p/text()').extract() # //*[@id="js-sort-filter-results"]/section/article[2]/a/div[2]/p/text()
            # item['title'] = [t.encode('utf-8') for t in title]
            # item['content'] = [c.encode('utf-8') for c in content]
            url = site.xpath('a/@href').extract()
            title = site.xpath('a/div[2]/h3/text()').extract()
            item['url'] = [u.encode('utf-8') for u in url]
            item['title'] = [t.encode('utf-8') for t in title]
            items.append(item)
            # print "item appending"
            # log.msg("Appending Item",'INFO')

        # log.msg("Appending Done",'INFO')
        # print "item appended"

        # print items
        '''
        print 'len item is: ' + str(len(item['url']))
        for i in range(len(item['url'])):
            print item['title'][i]
        '''
        return items

Esempio n. 20

0

Mostra file

 def parse_next_again(self, response):
     item = TutorialItem()
     name = response.xpath(
         '//div[@class = "add-to-cart"]/div[1]/p[1]/text()').extract()[0]
     x = response.xpath(
         '//div[@class = "add-to-cart"]/span/text()').extract()
     y = response.xpath('//div[@class = "add-to-cart"]/a/text()').extract()
     if x != []:
         sale = x
     if y != []:
         sale = y
     if x == [] and y == []:
         sale = response.xpath(
             '//div[@class = "add-to-cart"]/div[2]/span/text()').extract()
     #color = response.xpath('//div[]')
     item['name'] = name
     item['sale'] = sale[0].strip()
     s = sale[0].strip().encode('unicode-escape').decode('string_escape')
     print s
     if s == "Notify me":
         item['count'] = 1
     else:
         item['count'] = 0
         #print name
         list.append(name)
     #print(list)
     yield item

Esempio n. 21

0

Mostra file

File: jd_spider.py Progetto: Karlzzb/scrapy_karl

 def parse_product(self, response):
     '商品页获取title,price,product_id'
     req = []
     phonename = response.xpath(
         '//div/ul[contains(@class, "parameter2 p-parameter-list")]/li[1]/@title'
     ).extract_first()
     issueyear = response.xpath(
         '//div[contains(@class, "Ptable-item")][1]/dl/dd/text()').re(
             u'[1-9][0-9][0-9][1-9]\u5e74')
     issuemonth = response.xpath(
         '//div[contains(@class, "Ptable-item")][1]/dl/dd/text()').re(
             u'[1-9][0-9]*\u6708')
     pattern = r"(\d+)\.html$"
     id = re.findall(pattern, response.url)
     priceUrl = "https://p.3.cn/prices/mgets?&skuIds=J_" + str(id[0])
     item = TutorialItem()
     item['phonename'] = phonename
     item['issueyear'] = issueyear
     item['issuemonth'] = issuemonth
     item['itemurl'] = response.url
     request = scrapy.Request(priceUrl,
                              method="GET",
                              callback=self.parse_price)
     request.meta['item'] = item
     logging.log(logging.DEBUG, request)
     yield request

Esempio n. 22

0

Mostra file

    def parse_next_one(self, response):
        name = response.xpath(
            '//div[@class = "add-to-cart"]/div[1]/p[1]/text()').extract(
            )[0].strip()

        if name.find('Lite') != -1:
            sale = response.xpath('//div[@class = "add-to-cart"]/span/text()'
                                  ).extract()[0].strip()
        else:
            sale = response.xpath(
                '//div[@class = "add-to-cart"]/div[2]/span/text()').extract(
                )[0].strip()
            if sale == []:
                sale = response.xpath(
                    '//div[@class = "add-to-cart"]/div[2]/text()').extract(
                    )[0].strip()

        item = TutorialItem()
        item['name'] = name
        item['sale'] = sale
        s = sale.encode('unicode-escape').decode('string_escape')
        print s
        if s == "BUY NOW":
            item['count'] = 0
        else:
            item['count'] = 1
            #print name
            list.append(name)
        yield item

Esempio n. 23

0

Mostra file

File: dmoz_spider.py Progetto: japheth-starita/TryingScrapy

 def parse(self, response):
     for sel in response.xpath('//ul/li'):
         item = TutorialItem()
         item['title'] = sel.xpath('a/text()').extract()
         item['link'] = sel.xpath('a/@href').extract()
         item['desc'] = sel.xpath('text()').extract()
         yield item

Esempio n. 24

0

Mostra file

File: quotes.py Progetto: Bruce979527682/ScrapyProject

 def parse(self, response):
     for quote in response.css('div.quote'):
         yield TutorialItem(text=quote.css('span.text::text').get(),
                            author=quote.css('small.author::text').get())
     next_page = response.css('li.next a::attr(href)').get()
     if next_page is not None:
         yield response.follow(next_page, self.parse)

Esempio n. 25

0

Mostra file

File: dmoz_spider.py Progetto: liustay/Python-Practice-2

 def parse(self, response):
     movie_list = response.xpath(
         "//div[@class='article']/ol[@class='grid_view']/li")
     for item in movie_list:
         movie_item = TutorialItem()
         movie_item["number"] = item.xpath(
             ".//div[@class='item']/div[@class='pic']/em/text()"
         ).extract_first()
         movie_item["title"] = item.xpath(
             ".//div[@class='info']/div[@class='hd']/a/span[@class='title'][1]/text()"
         ).extract_first()
         content = item.xpath(
             ".//div[@class='item']/div[@class='info']/div[@class='bd']/p[1]/text()"
         ).extract()
         for i_content in content:
             movie_item["desc"] = "".join(i_content.split())
         movie_item["star"] = item.xpath(
             ".//div[@class='item']/div[@class='info']/div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()"
         ).extract_first()
         movie_item["intro"] = item.xpath(
             ".//div[@class='item']/div[@class='info']/div[@class='bd']/p[@class='quote']/span[@class='inq']/text()"
         ).extract_first()
         movie_item["link"] = item.xpath(
             ".//div[@class='item']/div[@class='pic']/a/@href"
         ).extract_first()
         yield movie_item
         next_link = response.xpath(
             "//div[@class='paginator']/span[@class='next']/a/@href"
         ).extract()
         if next_link:
             next_link = next_link[0]
             yield scrapy.Request("https://movie.douban.com/top250" +
                                  next_link,
                                  callback=self.parse)

Esempio n. 26

0

Mostra file

    def parse(self, response):

        print "crwal: " + response.url

        tt_item = TutorialItem()
        page_url = response.url
        links = response.selector.re(
            r'href="(/item/[\w%]+)"')  # response自带正则解析
        for link in links:
            next_url = urlparse.urljoin(page_url, link)
            yield Request(next_url)

        # title = response.xpath('//dd[@class="lemmaWgt-lemmaTitle-title"]/h1/text()').extract_first('error')
        title = response.css('div .lemmaWgt-lemmaTitle-title h1::text'
                             ).extract_first('error: not found')
        # para_lst = response.xpath('//div[@class="lemma-summary"]/div[@class="para"]//text()').extract()
        para_lst = response.css('div .lemma-summary div.para ::text').extract()
        para = ''
        for i in para_lst:
            para += i

        tt_item['url'] = page_url
        tt_item['title'] = title
        tt_item['para'] = para

        yield tt_item

Esempio n. 27

0

Mostra file

File: quotes_spider.py Progetto: ahhzliuzg-git/tutorial

    def callmefordebug(self, response):
        self.log('call me')
        x = response.xpath('//a[@class="bizDownload"]')

        if x:
            #x1=x.xpath('./text()').extract()

            #x1=''.join(x)
            filen = response.xpath(
                '//a[@class="bizDownload"]/text()').extract()
            filen = ''.join(filen)
            self.log('---***---download file ： %s***---s' % filen)
            file = TutorialItem()  #here hook to filepipleline lzg
            for lu in x:
                filen = lu.xpath(
                    '//a[@class="bizDownload"]/text()').extract()[0]
                #x=''.join(x)
                #filen=''.join(filen)
                urlid = lu.xpath('//a[@class="bizDownload"]/@id').extract()[0]
                self.log('*****download file urlid %s' % urlid)
                url = 'http://www.ccgp.gov.cn/oss/download?uuid=' + urlid
                #file=TutorialItem() #here hook to filepipleline lzg
                #first get the outline part file href
                file['file_urls'] = [url]
                file['file_name'] = filen
                #file[name]=filen
                self.log('*****download file Saved file %s' % filen)
                yield file

Esempio n. 28

0

Mostra file

File: movie_spider.py Progetto: denmouse/smtcrawel

 def parse(self, response):
     # print response.body
     item = TutorialItem()
     selector = Selector(response)
     Movies = selector.xpath('//div[@class="info"]')
     for eachMoive in Movies:
         title = eachMoive.xpath('div[@class="hd"]/a/span/text()').extract()
         fullTitle = ''
         for each in title:
             fullTitle += each
         movieInfo = eachMoive.xpath('div[@class="bd"]/p/text()').extract()
         star = eachMoive.xpath(
             'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract()
         quote = eachMoive.xpath(
             'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
         if quote:
             quote = quote[0]
         else:
             quote = ''
         item['title'] = fullTitle
         item['movieInfo'] = ';'.join(movieInfo)
         item['star'] = star
         item['quote'] = quote
         yield item
     nextLink = selector.xpath('//span[@class="next"]/a/@href')
     if nextLink:
         url = response.urljoin(nextLink[0].extract())
         print url
         yield Request(url, self.parse)

Esempio n. 29

0

Mostra file

File: jobs.py Progetto: RoodrigoRoot/scraping_scrapy

 def parse(self, response):
     item = TutorialItem()
     item["job_title"] = response.css(
         "span.listing-company-name a::text").getall()
     item["location"] = response.css(
         "span.listing-location a::text").getall()
     yield item

Esempio n. 30

0

Mostra file

 def parse(self,response):
     try:
         page_number = response.meta['page_number']
     except:
         page_number = 2
     try:
         house_list = response.xpath("//div[@class='resblock-list-container clearfix']//ul[2]/li");
         for i_items in house_list:
             #item文件导入
             item = TutorialItem()
             #数据处理
             item['title'] = i_items.xpath(".//div//div[1]//a/text()").extract_first()
             item['address'] = i_items.xpath(".//div//div[2]//span[1]/text()").extract_first() + i_items.xpath(".//div//div[2]//span[2]/text()").extract_first() + i_items.xpath(".//div//div[2]//a/text()").extract_first()
             try:
                 item['total'] = i_items.xpath(".//div//div[@class='resblock-price']//div[@class='second']/text()").extract_first()
             except:
                 item['total'] = 'none'
             item['unitprice'] = i_items.xpath(".//div//div[@class='resblock-price']//div[@class='main-price']//span[1]//text()").extract_first()
             try:
                 item['buildingface'] = i_items.xpath(".//div//div[@class='resblock-area']//span//text()").extract_first()
             except:
                 item['buildingface'] = 'none'
             #数据导入到pipeline
             yield item
         #解析下一页的规则
         #next_link = response.xpath("//div[@class='page-box']//a[@class='next']/@href")
         print(page_number)
         if page_number < 5:
             url = "https://sz.fang.lianjia.com/loupan/pg" + str(page_number)
             print(url)
             page_number = page_number + 1
             yield scrapy.Request(url=url,meta={"page_number": page_number},callback=self.parse)
         
     except Exception as e:
         self.logger.error("error=" + str(response.url) + ", " + str(e))