Example #1
	def start_requests(self):
		with open('/home/aleksey/Downloads/vsm_links.txt') as handle:
			for url in handle:
				# strip only the trailing newline before appending to the base URL
				req = SplashRequest(self.base + url.rstrip('\n'))
				# tune the Splash arguments directly on the request meta
				req.meta['splash'].setdefault('args', {})['timeout'] = 220
				req.meta['splash']['endpoint'] = 'render.html'
				yield req
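The same options can also be passed to the SplashRequest constructor instead of being patched onto req.meta afterwards; a minimal sketch of the equivalent request:

    # equivalent construction, passing endpoint and args up front
    req = SplashRequest(self.base + url.rstrip('\n'),
                        endpoint='render.html',
                        args={'timeout': 220})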
Example #2
    def mainParse(self, response):
        '''
        Main parse function.
        :param response: the crawled page's response object
        :return:
        '''

        # containers with the scraped entries
        style_find = response.css('div.col-sm-4')
        # loop over the containers
        for i in style_find:
            content_url = i.css('.movie-item > a::attr(href)').extract_first()
            # only follow entries that actually link somewhere
            if content_url is not None:
                # join the route into an absolute crawl URL
                content_url = response.urljoin(content_url)
                # pass the URL on to contextParse to scrape the detail page
                str_url = """
                    --splash.response_body_enabled = true
                    splash.private_mode_enabled = false
                    splash:set_user_agent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")
                    assert(splash:go("%s"))
                    splash:wait(3)
                    return {html = splash:html()}
                """ % content_url
                splash_args = {"lua_source": str_url}
                # endpoint='run' executes the Lua; the default render.html endpoint does not run lua_source
                yield SplashRequest(content_url, endpoint='run', args=splash_args, callback=self.contextParse)


        # crawl the next page
        page_content = response.css('div.container > div:nth-of-type(3)')
        # read the text of the last pager link to detect the final page
        weiye_text = page_content.css('a:last-of-type::text').extract_first()
        # '尾页' ("last page") in the pager means there are more pages to walk
        if weiye_text == '尾页':
            if page_content.css('a::text').extract()[-2] == '下一页':
                next_url = page_content.css('a::attr(href)').extract()[-2]
                # only proceed when the path is not None
                if next_url is not None:
                    # join into the absolute next-page URL
                    next_url = response.urljoin(next_url)
                    # schedule the next page through the same pipeline
                    str_url = """
                        --splash.response_body_enabled = true
                        splash.private_mode_enabled = false
                        splash:set_user_agent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")
                        assert(splash:go("%s"))
                        splash:wait(3)
                        return {html = splash:html()}
                    """ % next_url
                    splash_args = {"lua_source": str_url}
                    yield SplashRequest(next_url, endpoint='run', args=splash_args, callback=self.parse)
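String-formatting the target URL into the Lua source breaks as soon as a URL contains a quote character. A safer sketch, assuming the execute endpoint is available, lets Splash receive the URL through its args table (scrapy-splash prefills args.url from the request URL):

    lua_script = """
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))  -- no string interpolation needed
        splash:wait(3)
        return {html = splash:html()}
    end
    """
    yield SplashRequest(content_url, endpoint='execute',
                        args={'lua_source': lua_script},
                        callback=self.contextParse)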
Example #3
 def _parse_listing_results_page(self, response):
     """Yield a separate request for each listing on the results page."""
     links = response.xpath('//a[starts-with(@target, "listing") and @rel="noopener"]')
     for link in links:
         # get all href of the specified kind and join them to be a valid url
         href = link.xpath('@href').extract()
         url = response.urljoin(href[0])
         text_nodes = link.xpath('.//text()').extract()
         matches = list(filter(lambda x: re.match(r'\$[\d,]+', x), text_nodes))
         # request the url and pass the response to final listings parsing function
         request = SplashRequest(url, self._parse_listing_contents, args={'wait': 15})
         # strip '$' and the thousands separator before converting, e.g. '$1,200' -> 1200
         request.meta['search_price'] = int(matches[0].replace('$', '').replace(',', '')) if len(matches) == 1 else None
         yield request
Example #4
    def parse(self, response):

        # note: timedelta(days=-4) actually selects four days ago, despite the variable name
        yesterday = (date.today() + timedelta(days=-4)).strftime("%Y-%m-%d")
        article_list = (response.xpath(
            '//div[@class="dataList"]//ul[@class="articleList"]'))
        for article in article_list:
            list_li = (article.xpath('li'))
            for i in list_li:
                article_time = (
                    i.xpath('span[@class="postTime"]/text()').extract()[0])
                article_time_split = article_time.split(' ')[0]
                if yesterday == article_time_split:
                    item = ZhiboccItem()
                    article_title = i.xpath('span[@class="articleTitle"]')
                    news_link = "https:{}".format(
                        article_title.xpath('a/@href').extract()[0])
                    title = article_title.xpath('a/text()').extract()[0]
                    item['link'] = news_link
                    item['time'] = article_time
                    item['today'] = article_time.split(' ')[0]
                    item['title'] = title
                    lua_source = """
                    --splash.response_body_enabled = true
                    splash.private_mode_enabled = false
                    splash:set_user_agent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")
                    assert(splash:go("%s"))
                    splash:wait(3)
                    return {html = splash:html()}
                    """ % news_link
                    splash_args = {"lua_source": lua_source}
                    yield SplashRequest(news_link,
                                        meta={'item': item},
                                        endpoint='run',
                                        args=splash_args,
                                        callback=self.parse2)
Example #5
    def parse(self, response):

        html = response.text
        select = Selector(text=html)
        url = response.url
        print("{} responded successfully".format(url))
        # if url.startswith("http://app1.sfda.gov.cn/datasearchcnda/face3/search.jsp"):
        a_el_list = select.xpath(
            "/html/body/table[2]/tbody/tr/td/p/a/@href").extract()
        if a_el_list is None or len(a_el_list) == 0:
            writeFile(
                url=url,
                # raw string keeps the backslashes in the Windows path literal
                fileName=r"D:\workspace\country_medical\country_medical\exception-file\exception.txt"
            )
        else:
            for a_el in a_el_list:
                u = "http://app1.sfda.gov.cn/datasearchcnda/face3/" + a_el.split(
                    ",")[1].replace("'", "")
                self.log("detail_url------------->>>{}".format(u))
                yield SplashRequest(url=u,
                                    callback=self.parse_item,
                                    dont_filter=True,
                                    args={
                                        "images": 0,
                                        'timeout': 10,
                                        "wait": "5"
                                    })
Example #6
 def start_requests(self):
     # override start_requests to build the SplashRequest
     yield SplashRequest(
         url='https://channel.jd.com/food.html',
         callback=self.parse,
         args={'wait': 0.5},  # Splash rendering takes time, so wait 0.5s here
     )
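None of these spiders work without the scrapy-splash glue in settings.py. A minimal sketch, assuming a Splash instance on localhost:8050, following the scrapy-splash README:

    SPLASH_URL = 'http://localhost:8050'
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }
    SPIDER_MIDDLEWARES = {
        'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    }
    DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'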
Example #7
 def start_requests(self):
     yield SplashRequest(
         endpoint='execute',
         args={
             'lua_source': script,
             'url': self.url
         },
     )
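The script variable is not shown in this example. The execute endpoint requires a Lua script with a main entry point; a minimal sketch of what it has to provide (the body here is an assumption):

    script = """
    function main(splash, args)
        assert(splash:go(args.url))
        splash:wait(1)
        return splash:html()
    end
    """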
Example #8
 def parse(self, response):
     LOG.info("Got response --> %s." % response)
     for next_page in response.css('li>a'):
         relative_path = next_page.xpath('@href').extract_first()
         if not relative_path or 'void' in relative_path:
             continue
         url = response.urljoin(relative_path)
         yield SplashRequest(url=url, callback=self.parse_notice)
Example #9
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url=url,
                             dont_filter=True,
                             args={
                                 "images": 0,
                                 'timeout': 10,
                                 "wait": "5"
                             })
Example #10
 def get_pages(self, response):
     # work out the page count and build a link for every results page
     page_li = response.xpath('//*[@id="videolist_box"]/div[2]/div[2]/ul/li/button/text()')
     if len(page_li) > 5:
         max_page = page_li[5].extract()
     else:
         max_page = page_li[-1].extract()
     # range() excludes the stop value, so add 1 to include the last page
     for i in range(1, int(max_page) + 1):
         page_url = "https://www.bilibili.com/v/ad/ad/#/all/default/0/{}".format(i)
         yield SplashRequest(page_url, callback=self.parse_list, args={'wait': '0.5'})
Example #11
 def parse_category(self, response):
     # collect the sub-category URLs under the main category
     res_all = response.xpath('//div[@id="subnav"]/ul/li/a/@href').extract()
     # extract() returns an empty list, never None, so test for emptiness
     if not res_all:
         res = [response.url]
     else:
         res = res_all[1:]
     for mctg in res:
         url = 'https://www.bilibili.com' + mctg
         yield SplashRequest(url, callback=self.get_pages, args={'wait': '3'})
Example #12
    def start_requests(self):  # entry point: crawl the page behind the link below
        cookie = get_valid_cookie()

        if cookie:
            # the page is rendered dynamically with JS
            yield SplashRequest(url=LEVEL_GRADE,
                                callback=self.parse,
                                cookies=cookie)
        else:
            pass  # TODO: cookie handling still to be added
Example #13
 def start_requests(self):
     yield SplashRequest(
         endpoint='execute',
         args={
             'lua_source': script,
             'phone': '1835362****',
             'wait': 5,
             'url': 'https://www.baidu.com/s?wd=1835362****'
         },
     )
Example #14
 def parse(self, response):
     """Determine number of pages in search results, and iterate through each page of results."""
     # get the last page number on the page
     last_page_number = self._last_page_number_in_search(response)
     if last_page_number < 1:
         # abort the search if there are no results
         return
     # otherwise loop over all pages and scrape!
     page_urls = [response.url + "&section_offset=" + str(pageNumber) for pageNumber in range(0, last_page_number)]
     for page_url in page_urls:
         yield SplashRequest(page_url, self._parse_listing_results_page, args={'wait': 5})
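The _last_page_number_in_search helper is not shown on this page. A plausible sketch (the pagination selector below is an assumption, not the original code):

     def _last_page_number_in_search(self, response):
         # hypothetical selector: take the last numeric label in the pager
         numbers = response.css('ul.pagination li a::text').re(r'\d+')
         return int(numbers[-1]) if numbers else 0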
Example #15
 def start_requests(self):
     """
     搜索京东手机入口
     :return:
     """
     for url in self.start_urls:
         yield SplashRequest(
             url=url,
             callback=self.parse,
             args={'wait': 0.5},
         )
Example #16
 def parse_notice(self, response):
     for item in response.css('.nph_photo_view'):
         img_path = item.css('.nph_cnt')
         img_item = pipline.ImgItem()
         relative_path = img_path.xpath('img/@src').extract_first()
         img_item['image_urls'] = [response.urljoin(relative_path)]
         yield img_item
     for next_page in response.css('.newlistcontentright a'):
         relative_path = next_page.xpath('@href').extract_first()
         url = response.urljoin(relative_path)
         yield SplashRequest(url=url, callback=self.parse_notice)
Example #17
 def parse(self, response):
     """获取医院信息"""
     all_hospital_links = response.xpath(
         '//div[@id="fl_yiyuan_nr"]/div/ul/li/a['
         'not(contains(text(),"升级中")) and not(contains(text(),"建设中"))]')
     try:
         for each_hospital_link in all_hospital_links:
             # hospital_name = each_link.xpath('text()').extract_first('')
             data_info = each_hospital_link.xpath('@onclick').extract_first(
                 '')
             if data_info:
                 data_info = ''.join(re.findall(r'\S+', data_info))
                 is_sp_time = re.search(r'isSpTime:\'(.*?)\'', data_info)
                 plat_form_hos_id = re.search(r'.*platformHosId:\'(.*?)\'',
                                              data_info, S)
                 pay_mode = re.search(r'paymode:\'(.*?)\'', data_info, S)
                 org_name = re.search(r'orgname:\'(.*?)\'', data_info, S)
                 if is_sp_time and plat_form_hos_id and pay_mode and org_name:
                     is_sp_time = is_sp_time.group(1)
                     plat_form_hos_id = plat_form_hos_id.group(1)
                     pay_mode = quote(pay_mode.group(1))
                     org_name = quote(org_name.group(1))
                     data = {
                         'isSpTime': is_sp_time,
                         'platformHosId': plat_form_hos_id,
                         'paymode': pay_mode,
                         'orgname': org_name
                     }
                     self.headers.update({
                         'Content-Type':
                         'application/x-www-form-urlencoded',
                         'Origin': 'http://www.hnyygh.com',
                         'Referer': 'http://www.hnyygh.com/'
                     })
                     splash_args = {
                         'url': self.hospital_post_url,
                         'headers': self.headers,
                         'lua_source': self.dept_script,
                         'data': data
                     }
                     yield SplashRequest(self.hospital_post_url,
                                         endpoint='execute',
                                         headers=self.headers,
                                         args=splash_args,
                                         dont_filter=True,
                                         callback=self.parse_hospital_info,
                                         meta={'hospital_name': org_name})
     except Exception as e:
         self.logger.error(repr(e))
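The dept_script executed above is not included either. Assuming it forwards the form fields that parse() packed into the args, a hypothetical sketch using Splash's go options could look like this:

     dept_script = """
     function main(splash, args)
         -- hypothetical: POST the packed form fields and return the rendered page
         assert(splash:go{args.url, headers=args.headers,
                          http_method="POST", formdata=args.data})
         splash:wait(1)
         return {html = splash:html()}
     end
     """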
Example #18
 def parse_list(self, response):
     # collect the video URLs
     site = Selector(response)
     video_urls = site.xpath('//*[@id="videolist_box"]/div[2]/ul/li/div/div[2]/a/@href').extract()
     for v_url in video_urls:
         r_url = 'https:' + v_url
         splash_args = {"lua_source": """
                         --splash.response_body_enabled = true
                         splash.private_mode_enabled = false
                         splash:set_user_agent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")
                         assert(splash:go("%s"))
                         splash:wait(3)
                         return {html = splash:html()}
                         """ % r_url}
         yield SplashRequest(r_url, endpoint='run', callback=self.parse_product, args=splash_args)
Example #19
 def start_requests(self):
     splash_args = {
         "lua_source":
         """
                 --splash.response_body_enabled = true
                 splash.private_mode_enabled = false
                 splash:set_user_agent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")
                 assert(splash:go("https://item.jd.com/5089239.html"))
                 splash:wait(3)
                 return {html = splash:html()}
                 """
     }
     yield SplashRequest("https://item.jd.com/5089239.html",
                         endpoint='run',
                         args=splash_args,
                         callback=self.onSave)
Example #20
 def start_requests(self):
     splash_args = {
         "lua_source":
         """
                 --splash.response_body_enabled = true
                 splash.private_mode_enabled = false
                 splash:set_user_agent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")
                 assert(splash:go("https://www.zhipin.com/job_detail/?query=python&scity=101010100&industry=&position="))
                 splash:wait(3)
                 return {html = splash:html()}
                 """
     }
     yield SplashRequest(
         "https://www.zhipin.com/job_detail/?query=python&scity=101010100&industry=&position=",
         endpoint='run',
         args=splash_args,
         callback=self.onSave)
Example #21
    def start_requests(self):
        '''
        Spider entry point.
        :return: hands off to the main parse function
        '''

        splash_args = {
            "lua_source": """
                            --splash.response_body_enabled = true
                            splash.private_mode_enabled = false
                            splash:set_user_agent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")
                            assert(splash:go("http://www.k2938.com/type/1.html"))
                            splash:wait(3)
                            return {html = splash:html()}
                        """
        }
        yield SplashRequest("http://www.k2938.com/type/1.html", args=splash_args, callback=self.mainParse)
Example #22
    def parse(self, response):
        # open the database, read the rows, and build the elements list
        self.elements = GetListFromMySQL()
        # iterate over a copy: removing items while iterating the same list skips entries
        for element in list(self.elements):
            if element.ID == 0:
                self.Url = element.Xpath
                #self.allowed_domains.append(element.Xpath)  # whitelist the domain, or scrapy filters the request
                self.elements.remove(element)

        yield SplashRequest(self.Url,
                            callback=self.parse_detail,
                            endpoint='execute',
                            args={
                                'lua_source': lua_script,
                                'images': 0,
                                'wait': 5
                            },
                            dont_filter=True)
Example #23
    def next_requests(self):
        fetch_one = self.redis_conn.spop if self.use_set else self.redis_conn.lpop
        found = 0
        while found < self.redis_batch_size:
            data = fetch_one(self.task_queue)
            if not data:
                break
            url = data.decode()
            req = SplashRequest(
                url,
                args={
                    'wait': 2,  # Splash's argument is named 'wait'; 'await' is not recognized
                    'timeout': 90
                },
            )
            if req:
                yield req
                found += 1

        self.logger.info('Read {} requests from {}'.format(
            found, self.task_queue))
Example #24
    def parse(self, response):
        self.url = response.url
        # locate the iframe tag and splice its src onto the page URL
        iframe = response.selector.xpath('//iframe/@src')[0].extract()
        url = response.url + iframe

        splash_args = {
            "lua_source":
            """
                    --splash.response_body_enabled = true
                    splash.private_mode_enabled = false
                    splash:set_user_agent('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')
                    assert(splash:go('%s'))
                    splash:wait(3)
                    return {html = splash:html()}
                    """ % url
        }

        # the iframe content loads dynamically, hence the SplashRequest
        yield SplashRequest(url,
                            endpoint='run',
                            args=splash_args,
                            callback=self.parse_iframe)
Example #25
    def parse(self, response):

        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            yield {
                "text": quote.xpath('.//span[@class="text"]/text()').get(),
                "author": quote.xpath('.//small[@class="author"]/text()').get(),
                # render the tags as one '[a,b,c]' string
                "tags": '[' + ','.join(quote.xpath('.//a[@class="tag"]/text()').getall()) + ']'
            }

        next_page = response.xpath('(//ul[@class="pager"]/*/a/@href)[last()]').get()
        self.logger.info(response.request.url)

        if next_page:
            # urljoin raises a TypeError on None, so only join inside the guard
            next_url = urljoin('http://quotes.toscrape.com/js', next_page)
            self.logger.info(next_url)
            yield SplashRequest(url=next_url,
                                callback=self.parse,
                                endpoint="execute",
                                args={"lua_source": self.script})
Example #26
 def start_requests(self):
     # range replaces the Python 2-only xrange
     for aid in range(1, self.end_aid):
         url_mod = 'https://space.bilibili.com/{}'
         url = url_mod.format(aid)
         yield SplashRequest(url, callback=self.space_info, args={'wait': '3', 'aid': aid}, dont_filter=True)
Example #27
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url=url, callback=self.parse)
Example #28
 def start_requests(self):
     # start_urls is a list; take the first entry (or loop, as other examples do)
     url = self.start_urls[0]
     yield SplashRequest(url,
                         callback=self.parse,
                         args={'lua_source': script},
                         endpoint='execute')
Example #29
    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                        defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            # data is a raw bytes/str payload from redis, so json.loads, not json.load
            task_data = json.loads(data)
            task_url = task_data['url']
            url = bytes_to_str(task_url, self.redis_encoding)
            # req = Request(url, dont_filter=True)
            req = SplashRequest(
                url,
                self.parse_result,
                args={
                    # optional; parameters passed to Splash HTTP API
                    'wait': 0.5,

                    # 'url' is prefilled from request url
                    # 'http_method' is set to 'POST' for POST requests
                    # 'body' is set to request body for POST requests
                },
                endpoint='render.json',  # optional; default is render.html
                splash_url='<url>',  # optional; overrides SPLASH_URL
                slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN,  # optional
            )
            '''
            the second way to build a Splash request
            '''
            # req = scrapy.Request(url, self.parse_result, meta={
            #     'splash': {
            #         'args': {
            #             # set rendering arguments here
            #             'html': 1,
            #             'png': 1,
            #
            #             # 'url' is prefilled from request url
            #             # 'http_method' is set to 'POST' for POST requests
            #             # 'body' is set to request body for POST requests
            #         },
            #
            #         # optional parameters
            #         'endpoint': 'render.json',  # optional; default is render.json
            #         'splash_url': '<url>',  # optional; overrides SPLASH_URL
            #         'slot_policy': scrapy_splash.SlotPolicy.PER_DOMAIN,
            #         'splash_headers': {},  # optional; a dict with headers sent to Splash
            #         'dont_process_response': True,  # optional, default is False
            #         'dont_send_headers': True,  # optional, default is False
            #         'magic_response': False,  # optional, default is True
            #     }
            # })
            req.extra_data = task_data
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found,
                              self.redis_key)
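For completeness, the queue consumed above is just a redis list (or set) of JSON payloads. A sketch of the producer side, with a hypothetical key name:

    import json
    import redis

    r = redis.Redis(host='localhost', port=6379)
    # rpush + the spider's lpop gives FIFO order; next_requests() pops the
    # payload, decodes it, and builds the SplashRequest
    r.rpush('myspider:task_queue', json.dumps({'url': 'http://example.com/page1'}))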
Example #30
 def start_requests(self):
     yield SplashRequest(url='http://quotes.toscrape.com/js',
                         callback=self.parse,
                         endpoint="execute",
                         args={"lua_source": self.script})