Code example #1
    def parse_forum_block_list(self, response: HtmlResponse):
        post_list = response.css('table#threadlisttableid')
        post_urls = []

        for post in post_list.css('tbody[id!="separatorline"]'):
            loader = ItemLoader(item=LyCommunityPostItem(),
                                selector=post,
                                base_url='http://www.lysq.com/')
            loader.add_value('block_name', response.meta['forum_block'])
            loader.add_css('title', 'a.s.xst::text')
            loader.add_css('url', 'a.s.xst::attr("href")')
            loader.add_css('author_username', 'td.by:nth-child(3) a::text')
            loader.add_css('created_time', 'td.by:nth-child(3) span::text')
            loader.add_css('last_comment_username',
                           'td.by:nth-child(5) a::text')
            loader.add_css('last_comment_time',
                           'td.by:nth-child(5) em span::attr("title")')
            loader.add_css('last_comment_time',
                           'td.by:nth-child(5) em span::text')
            loader.add_css('last_comment_time',
                           'td.by:nth-child(5) em a::text')
            loader.add_css('comment_count', 'td.num a::text')
            loader.add_css('view_count', 'td.num em::text')
            has_image = bool(post.css('th.common img[alt="attach_img"]'))
            loader.add_value('has_image', has_image)
            has_attachment = bool(post.css('th.common img[alt="attachment"]'))
            loader.add_value('has_attachment', has_attachment)

            item = loader.load_item()
            post_urls.append(item['url'])
            yield item

        for post_url in post_urls:
            yield Request(
                response.urljoin(post_url),
                callback=self.parse_forum_post,
                # post_url is relative and points to the post's first page, so response.url cannot be used here
                meta={
                    'post_url': post_url,
                    'page': response.meta['page']
                })

        next_page = response.css('div.pg a.nxt::attr("href")').extract_first()
        if next_page:
            yield Request(response.urljoin(next_page),
                          callback=self.parse_forum_block_list,
                          meta={
                              **response.meta,
                              'page': response.meta['page'] + 1
                          })
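
The loader above assumes a LyCommunityPostItem whose fields mirror the add_css/add_value calls. A minimal sketch of that item class, written purely as an assumption derived from the field names used in example #1 (the real project may declare more fields or attach input/output processors):

import scrapy


class LyCommunityPostItem(scrapy.Item):
    # One Field per value populated by the loader in example #1 (assumed definition).
    block_name = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    author_username = scrapy.Field()
    created_time = scrapy.Field()
    last_comment_username = scrapy.Field()
    last_comment_time = scrapy.Field()
    comment_count = scrapy.Field()
    view_count = scrapy.Field()
    has_image = scrapy.Field()
    has_attachment = scrapy.Field()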
Code example #2
 def parse_main_page(self, response: HtmlResponse):
     car_types = response.xpath(
         "//select[@id='make_id']/option[position()>1]/text()").getall()
     for car in car_types:
         car = car.replace(' ', '-').replace('&', 'and')
         car = response.urljoin('/tuning-specs/' + car)
         yield scrapy.Request(car, callback=self.parse_models_page)
Code example #3
	def parse_json(self,response):
		data = response.body[1:-1]
		js = json.loads(data)
		response = HtmlResponse(url=response.url,body=js['data'].encode('utf8'))
		for href in response.css(settings["el_nacional"]['links']):
			full_url = response.urljoin(href.extract())
			yield scrapy.Request(full_url, callback=self.parse_links)
Code example #4
 def parse(self, response):
     urls = response.css('li.lista-noticia-item > a::attr(href)').extract()
     for url in urls:
         url = response.urljoin(url)
         yield scrapy.Request(url=url,
                              callback=self.detalhes_noticia,
                              dont_filter=True)
     i = 0
     while True:
         i += 1
         self.driver.find_element_by_link_text('Próxima').click()
         response = HtmlResponse(self.driver.current_url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')
         urls = response.css(
             'li.lista-noticia-item > a::attr(href)').extract()
         for url in urls:
             url = response.urljoin(url)
             yield scrapy.Request(url=url,
                                  callback=self.detalhes_noticia,
                                  dont_filter=True)
         print(i)
         time.sleep(3)
         if i > 500:
             break
     self.escrever_dados()
Code example #5
File: detail.py  Project: hfutxqd/NicotvParser
def get_video_urls(url):
    rsp = requests.get(url)
    response = HtmlResponse(body=rsp.content, url=rsp.url)
    video_urls = response.xpath('//ul[contains(@class, "ff-playurl") and contains(@class, "active")]/li/a/@href').extract()
    for i in range(0, len(video_urls)):
        video_urls[i] = response.urljoin(video_urls[i])
    return video_urls
Code example #6
 def _parse_links(self, item):
     """Parse or generate links."""
     res = HtmlResponse(url=item["url"], body=item["description"], encoding="utf-8")
     links = []
     for link in res.css("a"):
         link_href = res.urljoin(link.attrib["href"])
         link_title = " ".join(link.css("* ::text").extract()).strip()
         if "viewform" not in link_href and link_title:
             links.append({"href": link_href, "title": link_title})
     return links
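
A quick usage sketch for example #6; the item values below are invented for illustration. Given a stored description fragment, the helper re-parses it with HtmlResponse and absolutizes every href against the item's url:

item = {
    "url": "https://example.org/meetings/2021-01-01",  # hypothetical page URL
    "description": '<p><a href="/docs/agenda.pdf">Agenda packet</a></p>',
}
links = self._parse_links(item)  # called from another method of the same spider/mixin
# -> [{"href": "https://example.org/docs/agenda.pdf", "title": "Agenda packet"}]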
Code example #7
    def se_res_parser(self, response: HtmlResponse):
        """
        Reference: https://docs.scrapy.org/en/latest/intro/tutorial.html#more-examples-and-patterns
        :param response: the response returned for each requested url, passed in by the framework via an asynchronous callback
        :return:
        """
        try:
            filter_: Filter = Filter(self.cursor, PLATFORM_CODE)
            all_ori_urls: List[str] = filter_.fetch_urls_from_detail_url_tb()
        except MySQLError:
            import traceback
            traceback.print_exc()
            self.cursor.close()
            self.conn.close()
            self.log('The DB connection has closed!')
            return  # all_ori_urls is undefined if the query failed, so stop here

        soup_god: Soup = Soup(response.text, 'lxml')
        # all property <a> tags on the search-results page, as a list of Soup objects
        all_comm_soup: List[Soup] = soup_god.select('p.house-name a')
        for comm in all_comm_soup:
            # build the full url with the framework's urljoin()
            href: str = response.urljoin(comm.get('href'))
            if href in all_ori_urls:
                self.log('Hit the crawled community of {}!'.format(href))
                continue
            self.detail_item['comm_url'] = href
            self.detail_item['city_id'] = self.helper.get_city_id_by_url(href)
            self.detail_item['commit_time'] = self.helper.date_getter()
            self.detail_item['comm_from'] = PLATFORM_CODE
            self.detail_item['is_new'] = 0
            yield self.detail_item

        next_page_wrapper = re.search(r'尾页(.*)下一页', response.text)
        if next_page_wrapper is not None:
            next_page_wrapper_txt: str = next_page_wrapper.group().replace(
                '"', '')
            next_page_reg = re.search(r'href=(.+) class',
                                      next_page_wrapper_txt)
            if next_page_reg is not None:
                next_page: str = response.urljoin(next_page_reg.group(1))
                yield scrapy.Request(next_page, callback=self.se_res_parser)
                self.helper.avoid_anti_crawling(
                    20, 'before the next searching page', self)
Code example #8
File: detail.py  Project: hfutxqd/NicotvParser
def get_video_channels(url):
    rsp = requests.get(url)
    response = HtmlResponse(body=rsp.content, url=rsp.url)
    channels = response.xpath('//div[contains(@class, "tab-content")]//ul[contains(@class, "ff-playurl")]')
    video_channels = []
    for channel in channels:
        video_urls = channel.xpath('./li/a/@href').extract()
        for i in range(0, len(video_urls)):
            video_urls[i] = response.urljoin(video_urls[i])
        video_channels.append(video_urls)
    return video_channels
Code example #9
 def parse(self, response: HtmlResponse):
     li_eles = response.selector.css('.container li')
     for ele in li_eles:
         cmd_group = ele.attrib['data-group']
         cmd_name = ele.attrib['data-name']
         detail_url = ele.css('a').attrib['href']
         yield Request(
             response.urljoin(detail_url),
             callback=self.parse_details,
             meta={'cmd_group': cmd_group, 'cmd_name': cmd_name}
         )
Code example #10
File: qb.py  Project: ly803744/spiders
    def parse(self, response: HtmlResponse):
        # print('*'*100)
        # # response is the response object and exposes the common attributes
        # print(response.encoding)   # cannot be modified
        # print(response.headers)
        # print(response.status)
        # print(response.url)
        # print(response.request.url)
        # #  query the page's title tag directly; xpath() returns a selector object
        # print(response.xpath('//title/text()').extract())
        # print(response.selector.xpath('//title/text()').extract())
        # print(response.css('div[class="author clearfix"] a'))
        # print(response.text)   # the text data
        # print(response.body)   # the raw bytes of the page
        articles = response.xpath('//div[starts-with(@class,"article")]')
        # print(articles)
        for art in articles:
            # print(art)
            # article is a Selector object
            try:
                name = art.xpath('./div[1]//img/@alt').extract()[0]
                img = art.xpath('./div[1]//img/@src').extract()[0]
                content = art.xpath(
                    './/div[@class="content"]/span[1]/text()').extract()
                # read the next page's data
            except:
                pass
            else:
                # print(name,img)
                # print(''.join(content).replace('\n', ''))

                from qiubai.items import QiubaiItem
                item = QiubaiItem()
                item['name'] = name
                item['img'] = 'http:' + img
                item['content'] = ''.join(content).replace('\n', '')
                # hand the item's data over to the pipeline
                # via the coroutine (generator) mechanism
                # yield {
                #     'name':name,
                #     'img':'http'+img,
                #     'content':''.join(content).replace('\n', '')
                #     }

                yield item

        next_url = response.xpath(
            '//ul[@class="pagination"]/li[last()]/a/@href').extract()[0]
        next_page_url = response.urljoin(next_url)
        if next_page_url:
            # issue the request for the next page
            yield scrapy.Request(next_page_url, callback=self.parse)
Code example #11
File: qb.py  Project: mmllyy/SpiderProject
    def parse(self, response:HtmlResponse):
        # print('---'*100)
        # response is the response object; common attributes:
        # print(response.encoding)   # cannot be modified
        # print(response.headers)
        # print(response.status)
        # print(response.url)
        # print(response.request.url)

        # query the page's title tag directly
        # print(response.xpath('//title/text()'))
        # print(response.text)  # the text data
        # print(response.body)
        # print(response.selector.xpath('//div[starts-with(@class,"author")]/a'))
        # print(response.css('div[class="author clearfix"] a'))
        # from qiubai.qiubai.items import QiubaiItem

        articles = response.xpath('//div[starts-with(@class,"article")]')
        for article in articles:
            try:
                name = article.xpath('./div[1]//img/@alt').extract()[0]
                img = article.xpath('./div[1]//img/@src').extract()[0]
                content = article.xpath('.//div[@class="content"]/span[1]/text()').extract()
            except:
                pass
            else:
                # print(name,img)
                # print(''.join(content).replace('\n',''))
                # item = QiubaiItem()
                # item.name = name
                # item.img = 'http:'+img
                # item.content = ''.join(content).replace('\n','')
                # hand the item over
                yield {
                    "name":name,
                    "img":'http:'+img,
                    'content':''.join(content).replace('\n','')
                }

        # print('---'*100)

# //div[starts-with(@class,"article")]/div[1]//img/@src
# //div[starts-with(@class,"article")]//div[@class="content"]
# //div[starts-with(@class,"article")]//div[@class="thumb"]//img/@src

        # read the next page's data
        next_url = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').extract()[0]
        next_page_url = response.urljoin(next_url)
        print(next_page_url)
        yield scrapy.Request(next_page_url,callback=self.parse)
        # print(next_url)
        print('---'*100)
Code example #12
File: wienerlinien_at.py  Project: misspenalty/feeds
 def parse(self, response):
     # Wiener Linien returns HTML with an XML content type which creates an
     # XmlResponse.
     response = HtmlResponse(url=response.url, body=response.body)
     for item in response.css('.block-news-item'):
         il = FeedEntryItemLoader(response=response,
                                  timezone=self._timezone,
                                  base_url='http://{}'.format(self.name))
         link = response.urljoin(item.css('a::attr(href)').extract_first())
         il.add_value('link', link)
         il.add_value('title', item.css('h3::text').extract_first())
         il.add_value('updated', item.css('.date::text').extract_first())
         yield scrapy.Request(link, self.parse_item, meta={'il': il})
Code example #13
File: search.py  Project: hfutxqd/NicotvParser
def search(keyword):
    url = SEARCH_URL.format(keyword=keyword)
    rsp = requests.get(url)
    response = HtmlResponse(body=rsp.content, url=rsp.url)
    search_result = response.xpath('//div[@class="container ff-bg"]/ul[contains(@class, "list-unstyled")]/li')
    videos = []
    for item in search_result:
        status = item.xpath('.//span[@class="continu"]/text()').extract_first()
        videos.append({
            'name': item.xpath('./h2/a/text()').extract_first(),
            'url': response.urljoin(item.xpath('./h2/a/@href').extract_first()),
            'status': status.strip() if status else ''
        })
    return videos
Code example #14
File: bikediscount.py  Project: oceancloud82/scraping
    def parse_product_list(self, response):
        if not isinstance(response, HtmlResponse):
            try:
                data = demjson.decode(response.body)
                response = HtmlResponse(response.url,
                                        body=data['data'],
                                        encoding='utf-8',
                                        request=response.request)
            except:
                self.log('No valid json found in %s' % response.url)
                return

        products = response.xpath(u'//a[@itemprop="url"]/@href').extract()

        for url in products:
            url = response.urljoin(url)
            yield Request(url, callback=self.parse_product)

        if products:
            pages = response.xpath(
                '//ul[contains(@class, "uk-pagination")]//a/@href').extract()
            for page in pages:
                yield Request(response.urljoin(page),
                              callback=self.parse_product_list)
Code example #15
    def parse(self, response:HtmlResponse):
        # response is the response object; common attributes:
        # print(response.encoding)    # cannot be modified
        # # byte data; returned as a dict whose values are lists, e.g. {b'Server': [b'openresty'], b'Date': [b'Wed, 20 Jun 2018 01:59:00 GMT']}
        # print(response.headers)
        # print(response.status)
        # print(response.url)     # the response url, same as response.request.url
        # print(response.request.url)
        # print(response.body)    # byte data
        # print(response.text)    # text data
        # xpath() can be called directly to query the page's title node; it returns a list of Selector objects
        # print(response.xpath('//div[@class="content"]/span/text()').extract_first())   # extract() pulls the content out of the Selector objects
        # t = response.xpath('//div[@class="content"]/span/text()').extract_first()
        # print(type(t))
        # p = response.xpath('//div[@class="content"]/span/text()').extract()
        # print(type(p))
        #print(response.css('div[class="author clearfix"] a'))

        articles = response.xpath('//div[starts-with(@class,"article")]')


        for article in articles:
            try:
                # article is a Selector object
                name = article.xpath('./div[1]//img/@alt').extract()[0]
                img = article.xpath('./div[1]//img/@src').extract()[0]
                content = article.xpath('.//div[@class="content"]/span[1]/text()').extract()
            except:
                pass
            else:
                print(name, img)
                print(''.join(content).replace('\n', ''))



                # hand the item data over to the item pipeline
                yield {
                    "name":name,
                    "img":'http:'+img,
                    "content":''.join(content).replace('\n','')
                }
        # read the next page's data
        next_url = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').extract()[0]
        next_page_url = response.urljoin(next_url)  # joined with start_url, e.g. https://www.qiushibaike.com/8hr/page/2/

        # issue the request for the next page; callback is the callback function
        yield scrapy.Request(next_page_url,callback=self.parse)
        print(next_url)
Code example #16
File: wienerlinien_at.py  Project: marcelogp/PyFeeds
 def parse(self, response):
     # Wiener Linien returns HTML with an XML content type which creates an
     # XmlResponse.
     response = HtmlResponse(url=response.url, body=response.body)
     for item in response.css(".block-news-item"):
         il = FeedEntryItemLoader(
             response=response,
             timezone="Europe/Vienna",
             ignoretz=True,
             base_url="https://www.{}".format(self.name),
         )
         link = response.urljoin(item.css("a::attr(href)").extract_first())
         il.add_value("link", link)
         il.add_value("title", item.css("h3::text").extract_first())
         il.add_value("updated", item.css(".date::text").extract_first())
         yield scrapy.Request(link, self.parse_item, meta={"il": il})
Code example #17
File: MyFollow.py  Project: lizhaode/Scrapy91
    def parse(self, response: HtmlResponse):
        # get the page number of the current page
        url_list = response.url.split('page=')
        if len(url_list) == 1:
            current_page_num = 1
        else:
            current_page_num = int(url_list[1])
        # handle the case where the cookie is missing or expired
        if 'login.php' in response.url:
            raise ValueError('cookie 未设置或失效')
        else:
            myvideo_list = response.css('div.maindescwithoutborder')
            video_info_list = myvideo_list.css('a')
            self.logger.warn('解析{0}成功,存在{1}个视频'.format(response.url,
                                                       len(video_info_list)))

            link_and_title_dict = {}
            for item in video_info_list:
                title = item.css('::text').extract_first()
                link = item.css('a::attr(href)').extract_first()
                # not sure why, but 'email protected' sometimes shows up in the title
                if 'email protected' in title:
                    continue
                link_and_title_dict[link] = title
                # hand off to another callback to resolve the real video url
                yield scrapy.Request(url=link,
                                     callback=self.parse_my_follow_real_link)
            self.logger.warn('最终解析{0}个视频'.format(len(link_and_title_dict)))

            # record the content of the current page
            yield SaveMovieInfoItem(page_number=current_page_num,
                                    movie_link_and_name=link_and_title_dict)

            self.logger.warn('解析完毕,检查是否存在下一页'.format(response.url))
            next_page_tag = response.css('a[href*="?&page="]')
            for i in next_page_tag:
                if '»' == i.css('a::text').extract_first():
                    ori_link = i.css('a::attr(href)').extract_first()
                    next_link = response.urljoin(ori_link)
                    self.logger.warn('存在下一页')
                    next_headers = {
                        'Cookie': self.cookie,
                        'Referer': response.url
                    }
                    yield scrapy.Request(url=next_link,
                                         callback=self.parse,
                                         headers=next_headers)
Code example #18
File: pipelines.py  Project: ipylei/zhiku_hoover
    def push_to_mq(self, item):
        """推送到RabbitMQ
        :param item:
        """

        # push search results
        if isinstance(item, SearchItem):
            if item.get('Content'):
                body = self.packaged_search(item)
                self.channel.basic_publish(exchange='', routing_key=self.news_queue, body=body)

                # push everything else
                url = item.get("Url")
                # 1. push the images embedded in the content
                content = item.get("Content")
                if content:
                    response = HtmlResponse(url=url, body=content, encoding='utf8')
                    img_list = response.xpath("//img/@src").extract()
                    if img_list:
                        image_list = [response.urljoin(img_url) for img_url in img_list]
                        body = self.packaged_data(website=self.website, url=url, resource_urls=image_list,
                                                  resource_type="Picture", content=content)
                        self.channel.basic_publish(exchange='', routing_key=self.image_queue, body=body)

                # 2. push the attachments of the search content to MQ
                pdf_file = item.pop("pdf_file") if item.get('pdf_file') else None
                if pdf_file:
                    pdf_file_list = json.loads(pdf_file).get("附件")
                    body = self.packaged_data(website=self.website, url=url, resource_urls=pdf_file_list,
                                              resource_type="Pdf")
                    self.channel.basic_publish(exchange='', routing_key=self.file_queue, body=body)

        # push experts
        elif isinstance(item, ExpertItem):
            body = self.packaged_expert(item)
            self.channel.basic_publish(exchange='', routing_key=self.expert_queue, body=body)

            # 3. push the expert's avatar (image)
            head_portrait = item.get("img_url")
            url = item.get("experts_url")
            if head_portrait:
                body = self.packaged_data(website=self.website, url=url, resource_urls=[head_portrait],
                                          resource_type="Picture")
                self.channel.basic_publish(exchange='', routing_key=self.expert_img_queue, body=body)
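
The image-handling branch of example #18 relies on a pattern several snippets here share: wrapping already-downloaded HTML in an HtmlResponse so Scrapy selectors and urljoin() work outside a live crawl. A minimal, self-contained sketch of that pattern, with an invented URL and markup:

from scrapy.http import HtmlResponse

html = '<p><img src="/img/a.png"><img src="//cdn.example.com/b.jpg"></p>'  # made-up fragment
resp = HtmlResponse(url='http://example.com/article/1', body=html, encoding='utf8')
img_urls = [resp.urljoin(src) for src in resp.xpath('//img/@src').extract()]
# img_urls == ['http://example.com/img/a.png', 'http://cdn.example.com/b.jpg']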
Code example #19
    def parse(self, response: HtmlResponse):
        json_resp = self._process_dirty_json(response)

        for hero in json_resp['soul']:
            yield SanguoOlHeroItem(
                name=hero['name'],
                pic=response.urljoin(hero['pic']),
                name_pinyin=hero['pinyin'],
                sex=hero['sex'],
                name_zi=hero['zi'],
                life_range=hero['shengsi'],
                come_from=hero['jiguan'],
                brief=hero['content'],
                cata=hero['cata']
            )

        if json_resp['page'] < json_resp['mpage']:
            yield response.follow(
                url=self.LIST_API.format(page=json_resp['page'] + 1),
                callback=self.parse
            )
Code example #20
File: detail.py  Project: hfutxqd/NicotvParser
def get_video_detail(detail_url):
    video_detail = {}
    try:
        rsp = requests.get(detail_url)
        response = HtmlResponse(body=rsp.content, url=rsp.url)
        video_detail['title'] = response.xpath('//div[@class="media"]//a[@class="ff-text"]/text()').extract_first()
        # video_detail['actors'] = response.xpath('//div[@class="media"]//dt[contains(text(), "主演")]/following-sibling::dd[1]//text()').extract()
        #
        # video_detail['directors'] = response.xpath(
        #     '//div[@class="media"]//dt[contains(text(), "导演")]/following-sibling::dd[1]//text()').extract()
        # video_detail['categories'] = response.xpath(
        #     '//div[@class="media"]//dt[contains(text(), "类型")]/following-sibling::dd[1]//text()').extract()
        # video_detail['area'] = response.xpath(
        #     '//div[@class="media"]//dt[contains(text(), "地区")]/following-sibling::dd[1]//text()').extract_first()
        # video_detail['year'] = response.xpath(
        #     '//div[@class="media"]//dt[contains(text(), "年份")]/following-sibling::dd[1]//text()').extract_first()
        data_active = response.xpath('//ul[contains(@class, "ff-playurl") and contains(@class, "active")]/@data-active').extract_first()
        video_detail['episode'] = response.xpath('//li[@data-id="{data_active}"]//text()'.format(data_active=data_active)).extract_first()
        player_url = response.xpath('//div[@id="cms_player"]/script[1]/@src').extract_first()
        rsp = requests.get(response.urljoin(player_url))
        video_info = json.loads(re.findall(r'var cms_player = (\{[\s\S]+?\});', rsp.text)[0])
        # print(video_info)
        req = PreparedRequest()
        url = video_info['url']
        if video_info['name'] == 'haokan_baidu':
            params = {}
            req.prepare_url(url, params)
        elif video_info['name'] == '360biaofan':
            params = {'time': video_info['time'], 'auth_key': video_info['auth_key']}
            req.prepare_url(url, params)
        rsp = requests.get(req.url)
        script_text = Selector(text=rsp.text).xpath('//script/text()').extract_first()
        # print(script_text)
        video_detail['video_url'] = re.findall(r'url: *\"(\S+?)\"', script_text)[0]
        return video_detail
    except KeyboardInterrupt:
        print('Interrupted')
        exit(0)
    except Exception:
        return video_detail
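
Example #20 uses requests' PreparedRequest only to append query parameters to the player URL before fetching it. A tiny sketch of that call with placeholder values (the real url, time and auth_key come from the cms_player JSON):

from requests.models import PreparedRequest

req = PreparedRequest()
req.prepare_url('http://example.com/play', {'time': '1600000000', 'auth_key': 'abc123'})  # placeholder values
print(req.url)  # http://example.com/play?time=1600000000&auth_key=abc123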
Code example #21
File: movie.py  Project: FeJQ/Spider-Douban-Movie
    def parse(self, response: HtmlResponse):
        reque = response.request  # type:scrapy.Request

        print()
        # fetch new proxies when fewer than 3 are available
        # (the trailing "& 1 == 0" makes this condition always False, so the block below never runs)
        while len(PROXY_LIST) < 3 & 1 == 0:
            get_proxy_url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=0&city=0&yys=0&port=1&pack=126415&ts=1&ys=1&cs=1&lb=1&sb=0&pb=45&mr=1&regions='
            proxy_obj = requests.get(get_proxy_url).content.decode()
            # proxy_response = scrapy.Request(url=get_proxy_url, )
            # print(proxy_response.body)
            proxy_obj = json.loads(proxy_obj)
            if (proxy_obj['success'] == True):
                temp = {}
                temp['ip'] = proxy_obj['data'][0]['ip']
                temp['port'] = proxy_obj['data'][0]['port']
                temp['expire_time'] = proxy_obj['data'][0]['expire_time']
                temp['city'] = proxy_obj['data'][0]['city']
                temp['isp'] = proxy_obj['data'][0]['isp']
                PROXY_LIST.append(temp)

        # print(response.request.meta)
        # print(response.request.headers['User-Agent'])

        movie_el_list = response.xpath('//*[@class="grid_view"]/li')
        for movie_el in movie_el_list:
            item = DoubanmovieItem()
            item['movie_name_zh'] = movie_el.xpath(
                './/span[@class="title"][1]/text()').extract_first()
            item['score'] = movie_el.xpath(
                './/span[@class="rating_num"]/text()').extract_first()
            item['description'] = movie_el.xpath(
                './/span[@class="inq"]/text()').extract_first()
            yield item

        next_url = response.xpath(
            '//span[@class="next"]/a/@href').extract_first()
        if (next_url != None):
            url = response.urljoin(next_url)
            yield scrapy.Request(url=url)
Code example #22
    def parse(self, response):
        href = response.selector.xpath('//div[@id="sogou_vr_11002301_box_0"]/@href').extract()[0]
        cmd="~/bin/phantomjs ./getBody.js '%s'" % href
        time.sleep(1)
        stdout, stderr = subprocess.Popen(cmd, shell=True, stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()
        print(stderr)
        response = HtmlResponse(url=href, body=stdout)

        for selector in Selector(response=response).xpath('//*[@id="history"]/div/div/div/div'):
            href = selector.xpath('h4/@hrefs').extract()[0].strip()
            title = ""
            for elem in selector.xpath('h4/text()').extract():
                if len(elem.strip()) > 0:
                    title = elem.strip()
            abstract = selector.xpath('//*[contains(@class, "weui_media_desc")]/text()').extract()[0].strip()
            pubtime = selector.xpath('//*[contains(@class, "weui_media_extra_info")]/text()').extract()[0].strip()
            full_url = response.urljoin(href)
            n = 0
            if len(title) != 0:
                sql = "select * from CrawlPage where title='%s'" % title
                n = self.cursor.execute(sql)
            if len(title) == 0 or n == 0:
                yield scrapy.Request(full_url, callback=self.parse_profile)
Code example #23
# -*- coding: utf-8 -*-

from scrapy.http import HtmlResponse

url = 'http://www.mzitu.com/32288'
response = HtmlResponse(url=url)

# url
assert response.url == url
assert response.urljoin(url='2') == 'http://www.mzitu.com/2'
assert response.url + '/2' == 'http://www.mzitu.com/32288/2'

response = HtmlResponse(url=url + '/2')
photo_number, album_number = response.url.split('/')[:2:-1]
assert photo_number == '2'
assert album_number == '32288'
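
A short follow-up sketch (not part of the original snippet) showing how a trailing slash changes the way urljoin() resolves a relative path:

from scrapy.http import HtmlResponse

# Without a trailing slash, the last path segment is replaced...
assert HtmlResponse(url='http://www.mzitu.com/32288').urljoin('2') == 'http://www.mzitu.com/2'
# ...with a trailing slash, the relative path is appended instead.
assert HtmlResponse(url='http://www.mzitu.com/32288/').urljoin('2') == 'http://www.mzitu.com/32288/2'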