Example 1
    def parse(self, response):  # parse the responses for start_urls
        # with open('error.html', 'w') as f:
        #     f.write(response.body.decode())
        print(response.url + '*****')
        tr_list = response.xpath('//*[@class="tablelist"]//tr')[1:-1]
        print(len(tr_list))

        for tr in tr_list:
            item = {}
            # extract part of the data
            item['name'] = tr.xpath('./td[1]/a/text()')[0]
            item['address'] = tr.xpath('./td[4]/text()')[0]
            item['time'] = tr.xpath('./td[5]/text()')[0]
            # build the detail-page url and send a request for it
            detail_url = 'https://hr.tencent.com/' + tr.xpath('./td[1]/a/@href')[0]
            print(detail_url)
            yield Request(
                detail_url,
                parse='parse_detail',
                meta=item  # meta takes a dict
            )
        # Pagination: follow the "下一页" (next page) link unless it is a dead link
        next_href = response.xpath('//a[text()="下一页"]/@href')[0]
        print(next_href)
        if next_href != 'javascript:;':
            next_url = 'https://hr.tencent.com/' + next_href
            yield Request(next_url, parse='parse')
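Example 1 passes the partially filled item to the detail page through meta. The parse_detail method is not shown; as a minimal sketch, assuming the framework exposes the passed dict as response.meta (as Scrapy does), it might look like this, with a purely illustrative extra field:

    def parse_detail(self, response):
        item = response.meta  # the dict that was passed as meta on the Request
        item['detail'] = response.xpath('//title/text()')[0]  # illustrative field
        yield item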
Example 2
 def start_request(self):  # send requests for the URLs in start_urls
     headers = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
     }
     url_temp = "https://movie.douban.com/top250?start={}&filter="
     for i in [page * 25 for page in range(10)]:
         yield Request(url_temp.format(i), headers=headers)
Example 3
 def start_requests(self):
     # Override start_requests to yield multiple requests
     # base_url = 'http://movie.douban.com/top250?start='
     url = 'http://www.heimahui.club:8081/?a='
     for i in range(100):  # yield one request per value of the query parameter
         yield Request(url + str(i))
Example 4
 def start_request(self):
     """
     构造请求对象
     :return:
     """
     for url in self.start_urls:
         yield Request(url)
Example 5
    def start_requests(self):
        start_urls = [
            "https://movie.douban.com/top250?start=" + str(page)
            for page in range(0, 226, 25)
        ]

        for start_url in start_urls:
            yield Request(start_url, headers=self.headers)
Example 6
 def start_requests(self):
     # request_list = []
     # for start_url in self.start_urls:
     #     request_list.append(Request(start_url))
     # return request_list
     # Implement start_requests as a generator; the Engine iterates it to pull out each request
     for start_url in self.start_urls:
         yield Request(start_url)
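The comment above assumes the framework's Engine consumes start_requests() lazily as a generator. A minimal sketch of that consumption loop, with hypothetical Engine and Scheduler names (not the framework's actual classes):

class Engine:
    def __init__(self, spider, scheduler):
        self.spider = spider
        self.scheduler = scheduler

    def _add_start_requests(self):
        # start_requests() returns a generator, so requests are
        # created and scheduled one at a time rather than all at once.
        for request in self.spider.start_requests():
            self.scheduler.add_request(request)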
Example 7
 def parse(self, response):
     title_list = []
     for li in response.xpath("//ol[@class='grid_view']/li"):
          # title = li.xpath(".//span[@class='title'][1]/text()")    # extract the title under this li tag
         # title_list.append(title[0])
         detail_url = li.xpath(
             ".//div[@class='info']/div[@class='hd']/a/@href")[0]
         yield Request(url=detail_url, parse='parse_detail')
Example 8
    def start_requests(self):

        url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php'

        while True:

            yield Request(url, dont_filter=True)
            # the sleep must come after the yield, otherwise the program would block here
            time.sleep(10)
Example 9
 def parse(self, response):
     self.total += 1
     time.sleep(2)
     if self.total > 10:
         return
     yield Request(
         'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1',
         filter=False,
         parse='parse')
Example 10
 def parse(self, response):
     print(response.body)
      '''The response body is JavaScript code'''
      # use the js2py module to execute the JS code and get the data
      ret = js2py.eval_js(
          response.body.decode("gbk"))  # the site serves GBK-encoded data, so decode it first
      for news in ret.list:
         yield Request(news["url"],
                       headers=self.headers,
                       parse='parse_detail',
                       meta={"type": news["channel"]["title"]})
Example 11
 def parse(self, response):
     divs = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
     for div in divs:
         dic = {}
         #dic['url'] = response.url
         dic['name'] = div.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0]
         item = Item(dic)
         detail_url = div.xpath('./div/div[2]/div[1]/a/@href')[0]
         #yield item
         yield Request(detail_url,
                       callback=self.parse_detail,
                       meta={'item': item})
Example 12
    def parse(self, response):
        '''Parse the Douban Movie Top 250 list page'''

        # yield {'i':'同意!'}
        for li in response.xpath("//ol[@class='grid_view']/li"):    # iterate over each li tag
            item = {}
            item["title"] =  li.xpath(".//span[@class='title'][1]/text()")[0]    # 提取该li标下的 标题
            # print(item)
            # yield item

            detail_url = li.xpath(".//div[@class='info']/div[@class='hd']/a/@href")[0]
            yield Request(detail_url, parse="parse_detail", meta={"item": item})    # request the detail page; parse_detail handles its response
Example 13
    def parse(self, response):

        a_s = response.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a')
        for a in a_s:
            data = {}
            data['movie_name'] = a.xpath('./span[1]/text()')[0]
            data['movie_url'] = a.xpath('./@href')[0]
            # print(data)
            # yield Item(data)
            yield Request(data['movie_url'],
                          callback=self.parse_detail,
                          meta={'data': data})
Example 14
    def parse(self, response):  # extract the page data
        li_list = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
            "Host": "movie.douban.com"
        }

        for li in li_list[0:1]:
            item = {}
            item['movie_name'] = li.xpath('.//div/div[2]/div[1]/a/span[1]/text()')[0]
            item['movie_actor'] = li.xpath('.//div/div[2]/div[2]/p[1]/text()')[0]
            item['detail_url'] = li.xpath('.//div/div[2]/div[1]/a/@href')[0]
            print(item['detail_url'])
            yield Request(item['detail_url'], parse='parse_detail', meta={'item': item}, headers=headers)
Example 15
    def parse(self, response):
        # item = {}
        # item['title'] = response.xpath("//head/title/text()")[0]
        # yield Item(item)

        node_list = response.xpath("//div[@class='hd']")[:3]
        for node in node_list:
            item = {}
            item['page_title'] = node.xpath("./a/span/text()")[0]
            item['page_link'] = node.xpath("./a/@href")[0]
            # Item data is handed to the pipelines
            yield Item(item)
            # Request object: the Engine sends it and the named callback parse_page parses the response
            yield Request(item['page_link'], callback="parse_page")
Example 16
    def parse(self, response):
        # extract the list of a tags
        a_s = response.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a')
        for a in a_s:
            data = {}
            data['movie_name'] = a.xpath('./span[1]/text()')[0]
            data['movie_url'] = a.xpath('./@href')[0]
            # yield Item(data)

            # 2.1.2-1: build the detail-page request and hand it to the engine
            yield Request(data['movie_url'],
                          callback=self.parse_detail,
                          meta={'data': data},
                          headers=self.headers)
Example 17
    def parse(self, response):
        node_list = response.xpath("//div[@class='hd']")

        for node in node_list:
            data = {}
            # movie title
            data['title'] = node.xpath("./a/span[1]/text()")[0]
            # detail-page link
            data['url'] = node.xpath("./a/@href")[0]

            # Yield both Item objects and Request objects to the engine:
            # an Item object is handed to the pipelines;
            # for a Request object, the parse method looked up via getattr handles its response
            yield Item(data)
            yield Request(data['url'], callback="parse_page")
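The comments in Example 17 assume the engine type-checks each yielded object and resolves a request's parse method by name with getattr. A minimal sketch of that dispatch, using hypothetical engine attribute and method names:

# Hypothetical engine-side dispatch for objects yielded by a spider.
def _handle_spider_result(self, spider, results):
    for result in results:
        if isinstance(result, Request):
            # Resolve the parse method named on the request,
            # falling back to the spider's default parse method.
            result.parse_func = getattr(spider, result.callback or 'parse')
            self.scheduler.add_request(result)
        elif isinstance(result, Item):
            # Items go straight to the pipelines.
            self.pipeline.process_item(result, spider)
        else:
            raise TypeError('spiders must yield Request or Item objects')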
Example 18
    def start_requests(self):

        # Prepare the URL
        url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php'
        # Request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'Referer': 'http://roll.news.sina.com.cn/s/channel.php?ch=01'
        }

        while True:
            # Build the request.
            # By default, any request that repeats an earlier one is filtered out;
            # a request created with dont_filter=True is not filtered.
            # 1. Change the framework's Request to accept dont_filter
            # 2. Change the framework's scheduler to check, when a request is added, whether it should be filtered
            yield Request(url, headers=headers, dont_filter=True)
            # Sleep only after the yield, so the coroutine switches first; otherwise the program would block here
            time.sleep(2)
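A minimal sketch of the scheduler-side change those comments describe, assuming deduplication by a set of request fingerprints (the Scheduler shape and _fingerprint helper are hypothetical):

import hashlib

class Scheduler:
    def __init__(self):
        self.queue = []
        self._fingerprints = set()

    def add_request(self, request):
        # Requests marked dont_filter bypass the duplicate check.
        fp = self._fingerprint(request)
        if request.dont_filter or fp not in self._fingerprints:
            self._fingerprints.add(fp)
            self.queue.append(request)

    def _fingerprint(self, request):
        # A real implementation would also normalize the method and body.
        return hashlib.sha1(request.url.encode()).hexdigest()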
Example 19
    def parse(self, response):
        '''Parse the Douban Movie Top 250 list page'''
        # title_list = []    # stores all the titles
        for li in response.xpath("//ol[@class='grid_view']/li"):  # iterate over each li tag
            item = {}
            item["title"] = li.xpath(".//span[@class='title'][1]/text()")[
                0]  # 提取该li标下的 标题
            # title_list.append(title[0])

            detail_url = li.xpath(
                ".//div[@class='info']/div[@class='hd']/a/@href")[0]

            self.page += 1
            if self.page > 2:
                break

            yield Request(detail_url,
                          parse="parse_detail",
                          meta={"item": item})  # request the detail page; parse_detail handles its response
Example 20
    def parse(self, response):  # extract the page data
        # Group first, then extract
        div_list = response.xpath("//div[@id='content-left']/div")

        for div in div_list[:1]:
            item = {}
            item['name'] = div.xpath(".//h2/text()")[0].strip()
            item['age'] = div.xpath(
                ".//div[contains(@class,'articleGender')]/text()")
            item['age'] = item['age'][0] if len(item['age']) > 0 else None
            item['gender'] = div.xpath(
                ".//div[contains(@class,'articleGender')]/@class")
            item['gender'] = item['gender'][0].split(' ')[-1].replace(
                'Icon', '') if len(item['gender']) > 0 else None
            item['href'] = urllib.parse.urljoin(response.url,
                                                div.xpath("./a/@href")[0])
            # yield Item(item)
            yield Request(item['href'],
                          parse='parse_detail',
                          meta={'item': item})
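This example (and Example 21 below) uses urllib.parse.urljoin to turn the relative href into an absolute URL. A quick standalone illustration:

from urllib.parse import urljoin

# urljoin resolves an href against the URL of the page it came from.
print(urljoin('https://www.qiushibaike.com/hot/page/1/', '/users/123/'))
# -> https://www.qiushibaike.com/users/123/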
Example 21
    def parse(self, response):
        """提取页面的数据"""

        # Group first, then extract the data

        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list[:1]:
            item = {}
            item["name"] = div.xpath(".//h2/text()")[0].strip()
            item["age"] = div.xpath(
                ".//div[contains(@class,'articleGender')]/text()")
            item["age"] = item["age"][0] if len(item["age"]) > 0 else None
            item["gender"] = div.xpath(
                ".//div[contains(@class,'articleGender')]/@class")
            # item["gender"] = item["gender"][0].split(' ')[-1].replace("Icon", "") if len(["gender"]) > 0 else None
            item["gender"] = item["gender"][0].split(" ")[-1].replace(
                "Icon", "") if len(item["gender"]) > 0 else None
            item["href"] = urllib.parse.urljoin(response.url,
                                                div.xpath("./a/@href")[0])
            # print(item)
            yield Item(item)
            yield Request(item["href"],
                          parse="parse_detail",
                          meta={"item": item})
Example 22
 def start_requests(self):
     url_temp = 'https://www.qiushibaike.com/hot/page/{}/'
     for i in range(1, 14):
         yield Request(url_temp.format(i))
Example 23
 def start_requests(self):
     for url in self.start_urls:
          # With no callback specified, the response is parsed by parse by default
          # 2. dont_filter lets the request skip deduplication
         yield Request(url, dont_filter=True)
Example 24
 def start_requests(self):
     # Override start_requests to yield multiple requests
     base_url = 'http://movie.douban.com/top250?start='
     for i in range(0, 250, 25):    # yield the request objects for pages 1-10 one by one
         url = base_url + str(i)
         yield Request(url)
Example 25
 def start_requests(self):
     """构建初始请求对象"""
     for url in self.start_urls:
         yield Request(url)
Example 26
 def start_requests(self):
     while True:
         # this request has to be sent to get the list-page data; the response body is a JS statement
         url = "http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=89&spec=&type=&ch=&k=&offset_page=0&offset_num=0&num=120&asc=&page=1&r=0.5559616678192825"
         yield Request(url, parse='parse', filter=False)
         time.sleep(10)  # send a request every 10 seconds
Example 27
 def start_requests(self):
     for start_url in self.start_urls:
         yield Request(start_url, filter=False)
Example 28
 def start_request(self):
     return Request(self.start_url)
Example 29
 def start_requests(self):
     for url in self.start_urls:
         yield Request(url)
Example 30
 def start_requests(self):
     for url in self.start_urls:
         # With no callback specified, the response is parsed by parse by default
         yield Request(url)