Code example #1
        def test_xhtml(self):
            xhtml = b"""
    <?xml version="1.0"?>
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    <head>
        <title>XHTML document title</title>
    </head>
    <body>
        <div class='links'>
        <p><a href="/about.html">About us</a></p>
        </div>
        <div>
        <p><a href="/follow.html">Follow this link</a></p>
        </div>
        <div>
        <p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
        </div>
        <div>
        <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
        </div>
        <div>
        <p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
        </div>
    </body>
    </html>
            """

            response = HtmlResponse("http://example.com/index.xhtml",
                                    body=xhtml)

            lx = self.extractor_cls()
            self.assertEqual(
                lx.extract_links(response),
                [
                    Link(
                        url="http://example.com/about.html",
                        text="About us",
                        fragment="",
                        nofollow=False,
                    ),
                    Link(
                        url="http://example.com/follow.html",
                        text="Follow this link",
                        fragment="",
                        nofollow=False,
                    ),
                    Link(
                        url="http://example.com/nofollow.html",
                        text="Dont follow this one",
                        fragment="",
                        nofollow=True,
                    ),
                    Link(
                        url="http://example.com/nofollow2.html",
                        text="Choose to follow or not",
                        fragment="",
                        nofollow=False,
                    ),
                    Link(
                        url="http://google.com/something",
                        text="External link not to follow",
                        nofollow=True,
                    ),
                ],
            )

            response = XmlResponse("http://example.com/index.xhtml",
                                   body=xhtml)

            lx = self.extractor_cls()
            self.assertEqual(
                lx.extract_links(response),
                [
                    Link(
                        url="http://example.com/about.html",
                        text="About us",
                        fragment="",
                        nofollow=False,
                    ),
                    Link(
                        url="http://example.com/follow.html",
                        text="Follow this link",
                        fragment="",
                        nofollow=False,
                    ),
                    Link(
                        url="http://example.com/nofollow.html",
                        text="Dont follow this one",
                        fragment="",
                        nofollow=True,
                    ),
                    Link(
                        url="http://example.com/nofollow2.html",
                        text="Choose to follow or not",
                        fragment="",
                        nofollow=False,
                    ),
                    Link(
                        url="http://google.com/something",
                        text="External link not to follow",
                        nofollow=True,
                    ),
                ],
            )
Code example #2
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html" class='ding'>first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html" class='ding'>first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
        <div><a href="llink2.html">10</a></div>
    </body>
</html>
"""
# Build a Response object
response = HtmlResponse(url='', body=html, encoding='utf-8')
selector = Selector(response=response)
# Select all <a> tags
temp = selector.xpath('//a')
# Take the first <body> tag and search for <ul> tags below it (./ makes the path relative to that tag)
temp = selector.xpath('body')[0].xpath('.//ul')
print(temp)
# Select the <ul> children of <body>
temp = selector.xpath('body/ul')
# Select the <li> descendants of <body>
temp = selector.xpath('body//li')
# Empty [], because <li> is not a direct child of <body>
temp = selector.xpath('body/li')
# Select the parent tag of <body>
temp = selector.xpath('body')[0].xpath('..')
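
The example above walks through the main XPath axes: child (body/ul), descendant (body//li), and parent (..). A self-contained sketch of the same axes against a small inline document; the HTML string here is illustrative, not the truncated page above:

from scrapy.selector import Selector

doc = '<html><body><ul><li><a href="a.html">first</a></li></ul></body></html>'
sel = Selector(text=doc)

print(sel.xpath('body/ul'))              # child axis: <ul> directly under <body>
print(sel.xpath('body//li'))             # descendant axis: <li> anywhere under <body>
print(sel.xpath('body/li'))              # [] -- <li> is not a direct child of <body>
print(sel.xpath('body')[0].xpath('..'))  # parent axis: back up to <html>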
Code example #3
    def test_generic_form_requests_with_spider_args(self):
        name = "ebay3"
        args = {'search_string': 'Cars'}
        spider = self.smanager.create(name, **args)
        generic_form_request = list(spider.start_requests())[0]

        response = HtmlResponse(
            url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
            body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
        response.request = generic_form_request
        request_list = [
            request_to_dict(req, spider)
            for req in generic_form_request.callback(response)
        ]
        expected = [{
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
            'dont_filter': True,
            'priority': 0,
            'callback': 'parse',
            'method': 'GET',
            'errback': None
        }]
        self.assertEqual(request_list, expected)
Code example #4
 def process_request(self, request, spider):
     # Render the page in PhantomJS and hand the DOM back to Scrapy as an
     # HtmlResponse, removing the copyright sign (U+00A9) from the source first.
     dr = webdriver.PhantomJS()
     dr.get(request.url)
     time.sleep(2)
     body = dr.page_source
     return HtmlResponse(dr.current_url, body=body.replace(u'\xa9', u''),
                         encoding='utf-8', request=request)
Code example #5
 def test_meta_refresh(self):
     req = Request(url='http://example.org')
     rsp = HtmlResponse(req.url, body=self._body())
     req2 = self.mw.process_response(req, rsp, self.spider)
     assert isinstance(req2, Request)
     self.assertEqual(req2.url, 'http://example.org/newpage')
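
self._body() is defined elsewhere in the test class; a minimal sketch of the kind of markup it plausibly returns, assuming a standard meta-refresh redirect (the exact fixture in the real test may differ):

 def _body(self, interval=5, url='http://example.org/newpage'):
     # A page whose <meta http-equiv="refresh"> tag redirects to `url` after `interval` seconds.
     return ('<html><head>'
             '<meta http-equiv="refresh" content="{0};url={1}"/>'
             '</head></html>').format(interval, url)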
Code example #6
    def process_request(self, request, spider):
        '''
        :param request: the request being processed
        :param spider: the spider that issued the request
        :return:
        '''
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run Chrome in headless mode
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        # Determine which spider this request belongs to
        if spider.name == 'scjrm_zszq':
            # Check whether this is the login request
            # if request.url == "http://www.scjrm.com/site/login.html":
            print("<<<<<<<" + request.url)
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
            spider.driver.get("http://www.scjrm.com/site/login.html")
            # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
            time.sleep(2)
            # Enter the username and password
            username = spider.driver.find_element_by_id('phonenumber')
            password = spider.driver.find_element_by_id('password')
            username.send_keys('18030535053')
            password.send_keys('123456')
            # Click the "Login" button
            spider.driver.find_element_by_id('sub_bt').click()
            time.sleep(1)
            spider.driver.get(request.url)
            time.sleep(3)
            spider.cookies = spider.driver.get_cookies()
            time.sleep(1)
            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')
            # Not a login request:
            # else:
            #     req = requests.session()  # session
            #     for cookie in spider.cookies:
            #         req.cookies.set(cookie['name'], cookie["value"])
            #     req.headers.clear()  # clear the headers
            #     newpage = req.get(request.url)
            #     time.sleep(5)
            #     return HtmlResponse(url=request.url,  # current URL
            #                         body=newpage.text,  # page source
            #                         encoding="utf-8", request=request)  # return the page

        if spider.name == 'scjuchuang_yxzq':
            # Check whether this is the login request
            # if request.url.find('login') != -1:
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
            spider.driver.get('https://www.scjuchuang.com/login')
            # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
            time.sleep(2)
            # Enter the username and password
            username = spider.driver.find_element_by_class_name('loginName')
            password = spider.driver.find_element_by_class_name('loginPassword')
            username.send_keys('yczs123')
            password.send_keys('123456')
            # Click the "Login" button
            spider.driver.find_element_by_class_name('loginBtn').click()
            time.sleep(1)
            spider.driver.get('https://www.scjuchuang.com/goods?attr=1&page=1')
            # spider.driver.find_element_by_link_text('院线专区').click()
            spider.cookies = spider.driver.get_cookies()
            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')

        elif spider.name == 'rjyiyao_xpsj':
            # Check whether this is the login request
            # if request.url.find('login') != -1:
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe", chrome_options=chrome_options)
            spider.driver.get('http://new.rjyiyao.com/web/login')
            # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
            time.sleep(2)
            # Enter the username and password
            username = spider.driver.find_element_by_id('username')
            password = spider.driver.find_element_by_id('password')
            username.send_keys('18030535053')
            password.send_keys('123456')
            # Click the "Login" button
            spider.driver.find_element_by_id('btnLogin').click()
            time.sleep(1)
            # spider.driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[3]/div[2]/a[2]/img').click()  # new arrivals
            # windows = spider.driver.window_handles
            # spider.driver.switch_to.window(windows[1])  # switch to the second window
            spider.driver.get('http://new.rjyiyao.com/web/product/group/5?page=1')
            time.sleep(5)
            spider.cookies = spider.driver.get_cookies()
            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')

        elif spider.name == 'rjyiyao_zkzq':
            # Check whether this is the login request
            # if request.url.find('login') != -1:
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe", chrome_options=chrome_options)
            spider.driver.get('http://new.rjyiyao.com/web/login')
            # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
            time.sleep(1)
            # Enter the username and password
            username = spider.driver.find_element_by_id('username')
            password = spider.driver.find_element_by_id('password')
            username.send_keys('18030535053')
            password.send_keys('123456')
            # Click the "Login" button
            spider.driver.find_element_by_id('btnLogin').click()
            time.sleep(2)
            spider.driver.get('http://new.rjyiyao.com/web/product/sale/3?page=1')
            # spider.driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[3]/div[2]/a[2]/img').click()  # new arrivals
            # windows = spider.driver.window_handles
            # spider.driver.switch_to.window(windows[1])  # switch to the second window
            time.sleep(5)
            spider.cookies = spider.driver.get_cookies()
            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')

        elif spider.name == 'sckxyy_ypzq':
            # Check whether this is the login request
            # if request.url.find('login') != -1:
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
            spider.driver.get('http://www.sckxyy.com/Login.html')
            time.sleep(2)
            # Enter the username and password
            username = spider.driver.find_element_by_id('usernameLogin')
            password = spider.driver.find_element_by_id('passwordLogin')
            username.send_keys('bianyuantianshi')
            password.send_keys('123456')
            # Click the "Login" button
            spider.driver.find_element_by_id('userLogin').click()
            time.sleep(1)
            spider.cookies = spider.driver.get_cookies()
            spider.driver.get('http://www.sckxyy.com/Drug_zone.html#Monday-bg-two')
            # spider.driver.find_element_by_link_text('普药专区').click()  # general-medicine section
            # time.sleep(5)
            # windows = spider.driver.window_handles
            # spider.driver.switch_to.window(windows[1])  # switch to the second window

            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')
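
All five branches above repeat the same sequence: open the login page, fill in credentials, click the login button, navigate to a target page, save the cookies, and wrap the rendered DOM in an HtmlResponse. A minimal sketch of a shared helper that factors this out; the helper name and parameters are hypothetical, not part of the original middleware, and it assumes the same time and HtmlResponse imports as the code above:

from selenium.webdriver.common.by import By

def login_and_render(spider, login_url, user_field, pass_field, submit_field,
                     user, password, target_url, by=By.ID):
    # Hypothetical helper: perform the login flow once, then return the
    # rendered target page as an HtmlResponse.
    spider.driver.get(login_url)
    time.sleep(2)
    spider.driver.find_element(by, user_field).send_keys(user)
    spider.driver.find_element(by, pass_field).send_keys(password)
    spider.driver.find_element(by, submit_field).click()
    time.sleep(2)
    spider.driver.get(target_url)
    spider.cookies = spider.driver.get_cookies()
    return HtmlResponse(url=spider.driver.current_url,
                        body=spider.driver.page_source,
                        encoding='utf-8')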
Code example #7
 def setUp(self):
     body = get_testdata('link_extractor', 'sgml_linkextractor.html')
     self.response = HtmlResponse(url='http://example.com/index', body=body)
Code example #8
 def setUp(self) -> None:
     with RESPONSE_FAILED.open("rb") as file:
         self.s_response_failed = HtmlResponse(url="", body=file.read())
     with RESPONSE_SUCCEED.open("rb") as file:
         self.s_response_succeed = HtmlResponse(url="", body=file.read())
Code example #9
File: ind.py Project: axuaxu/scrapy-test
import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

response = HtmlResponse(url='http://ca.indeed.com')  # no body given, so there is nothing to select
print(response.selector.xpath('//span/text()').extract())
Code example #10
    def parse(self, response):
        print('start url:', response.url)
        self.driver.get(response.url)
        self.driver.maximize_window()

        # test with set timer
        start_time = time.time()
        counter = 0

        while True:
            last_height = self.driver.execute_script(
                "return document.body.scrollHeight")
            try:
                print('scroll')
                self.driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(3)
                new_height = self.driver.execute_script(
                    "return document.body.scrollHeight")
                if new_height == last_height:
                    break
                else:
                    last_height = new_height
                    continue
            except Exception:
                break

        # hide footer bar
        element = self.driver.find_element_by_id("jump_paging")
        self.driver.execute_script("arguments[0].style.visibility='hidden'",
                                   element)
        """"new recent Pantip error 31/07/19"""
        login_message = self.driver.find_element_by_xpath(
            "/html/body/div[4]/div/div/div[4]")
        self.driver.execute_script("arguments[0].style.visibility='hidden'",
                                   login_message)
        """"load sub comments"""

        try:
            more_buttons = self.driver.find_elements_by_class_name('see-more')

            for x in range(0, len(more_buttons)):
                if more_buttons[x].is_displayed():
                    more_buttons[x].click()
        except Exception:
            pass

        response = HtmlResponse(self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8',
                                request=response.request)
        response.selector.remove_namespaces()
        post_id = response.url.split('https://pantip.com/topic/')[1]
        title = response.xpath('//*[@id="topic-' + post_id +
                               '"]/div/div[2]/h2/node()').extract()
        post_story = response.xpath(
            '/html/body/div[4]/div/div/div[3]/div/div[4]/div[1]/div//text()'
        ).extract()
        post_date = response.xpath('//*[@id="topic-' + post_id + ''
                                   '"]/div/div[4]/div[2]/div[3]/div['
                                   '3]/div/span/abbr/@data-utime').extract()
        post_tags = response.xpath(
            '/html/body/div[4]/div/div/div[3]/div/div[3]/div/div[2]/a/text()'
        ).extract()

        post_comments_time = response.xpath(
            './/div[@class="display-post-avatar-inner"]/span/abbr/@data-utime'
        ).extract()
        post_comments_time.pop(0)

        post_comments_userID = response.xpath(
            './/div[@class="display-post-avatar-inner"]/a//text()').extract()

        comments = self.driver.find_elements_by_class_name(
            'display-post-story')

        # emotion_list = []
        #
        # emotions = response.xpath(
        #     '/html/body/div[4]/div/div/div[3]/div/div[4]/div[2]/div[4]/div[1]/a/span//text()').extract()
        #
        # emotions_count = response.xpath(
        #     '/html/body/div[4]/div/div/div[3]/div/div[4]/div[2]/div[4]/div[1]/span//text()').extract()
        # # remove first label of number of emotions
        # emotions_count.pop(0)
        # for i in range(len(emotions)):
        #     emotion_list.append({emotions[i]: emotions_count[i]})

        # print(emotion_list)

        # comments_emotions_list = []
        #
        # try:
        #     emotions = self.driver.find_element_by_class_name('/html/body/div[4]/div/div/div[6]/div/div/div[2]/div[3]/div[4]/div[1]/a/span')
        #     emotions_count = self.driver.find_elements_by_xpath('/html/body/div[4]/div/div/div[6]/div/div/div[2]/div[3]/div[4]/div[1]/span')
        #     emotions_count.pop(0)
        #     for i in range(len(emotions)):
        #         comments_emotions_list.append({emotions[i].text: emotions_count[i].text})
        # except:
        #     pass
        #
        # print(comments_emotions_list)

        # data_topic = {
        #     post_id: {
        #         'id': post_id,
        #         'user_id': post_comments_userID[0],
        #         'post_date': post_date[0],
        #         'post_tags': ','.join(str(c).strip() for c in post_tags),
        #         'post_title': title[0],
        #         'post_story': comments[0].text,
        #         'total_comment': int(len(post_comments_time)),
        #         # 'emotions': ''.join(str(c).strip() for c in emotion_list)
        #     }
        # }
        #
        # self.firebase.uploadDatabase('data/scraped/post', data_topic)
        #
        # comments.pop(0)
        # comments.pop(-1)
        # comments.pop(-1)
        # print(comments[0].text)
        #
        # post_comments_userID.pop(0)
        # if len(post_comments_userID) < len(comments):
        #     comments.pop(0)

        for i in range(len(post_comments_userID)):
            id = post_id + '_' + (post_comments_time[i].replace(
                '/', '_')).replace(' ', '_')
            data_comment = {
                id: {
                    "id": id,
                    "user_id": post_comments_userID[i],
                    "time": post_comments_time[i],
                    "comment": comments[i].text
                }
            }

            print(data_comment)

            self.firebase.uploadDatabase('data/model/comment', data_comment)
Code example #11
File: test.py Project: wangrongming/pythonBase
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
body = '<html><body><span>good</span></body></html>'
response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')
a = response.selector.xpath('//span/text()').extract()
print(a)
Code example #12
    def parse_scene(self, response):
        jsondata = response.json()
        htmlcode = jsondata['solution']['response']
        response = HtmlResponse(url=response.url,
                                body=htmlcode,
                                encoding='utf-8')
        response_url = jsondata['solution']['url']
        cookies = jsondata['solution']['cookies']
        scenedate = None
        performer = None
        for cookie in cookies:
            if cookie['name'] == 'mydate':
                scenedate = cookie['value']
            if cookie['name'] == 'performer':
                performer = cookie['value']
        item = SceneItem()
        if scenedate:
            item['date'] = self.parse_date(scenedate).isoformat()
        else:
            item['date'] = self.parse_date('today').isoformat()

        if performer:
            item['performers'] = [performer]
        else:
            item['performers'] = []

        item['title'] = self.get_title(response)
        item['description'] = self.get_description(response)
        item['image'] = self.get_image(response)
        item['image_blob'] = self.get_image_blob(response)
        item['tags'] = self.get_tags(response)
        if "" in item['tags']:
            item['tags'].remove("")
        item['id'] = re.search(r'/movie/(.*?)/',
                               jsondata['solution']['url']).group(1)
        item['trailer'] = self.get_trailer(response)
        item['url'] = jsondata['solution']['url']
        item['network'] = "ATK Girlfriends"

        if "atkarchives" in response_url:
            item['parent'] = "ATK Archives"
            item['site'] = "ATK Archives"
        if "atkexotics" in response_url:
            item['parent'] = "ATK Exotics"
            item['site'] = "ATK Exotics"
        if "atkpremium" in response_url:
            item['parent'] = "ATK Premium"
            item['site'] = "ATK Premium"
        if "atkpetites" in response_url:
            item['parent'] = "ATK Petites"
            item['site'] = "ATK Petites"
        if "atkhairy" in response_url:
            item['parent'] = "ATK Hairy"
            item['site'] = "ATK Hairy"
        if "amkingdom" in response_url:
            item['parent'] = "ATK Galleria"
            item['site'] = "ATK Galleria"

        days = int(self.days)
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = date.today() - timedelta(days)
            filterdate = filterdate.strftime('%Y-%m-%d')

        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    yield item
            else:
                yield item
Code example #13
File: test_http_request.py Project: zezhen/scrapy
def _buildresponse(body, **kwargs):
    kwargs.setdefault('body', body)
    kwargs.setdefault('url', 'http://example.com')
    kwargs.setdefault('encoding', 'utf-8')
    return HtmlResponse(**kwargs)
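
A hypothetical usage of the factory above; the bodies and overrides here are illustrative:

# Defaults fill in url and encoding; only the body is required.
resp = _buildresponse('<html><body><a href="/next">next</a></body></html>')
assert resp.url == 'http://example.com'

# Any HtmlResponse keyword argument can still be overridden per call.
resp2 = _buildresponse('<html></html>', url='http://example.org', status=404)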
Code example #14
File: middlewares.py Project: ipenn/jin10ToWX
 def process_request(self, request, spider):
     url = request.url
     spider.chrome.get(url)
     time.sleep(3)
     html = spider.chrome.page_source
     return HtmlResponse(url=url, body=html, request=request, encoding='utf-8')
Code example #15
    def process_request(self, request, spider):
        if spider.USE_SELENIUM:
            url = request.url
            self.driver.get(url)
            return HtmlResponse(url, body=self.driver.page_source, encoding='utf-8')
Code example #16
    print("############### START ###################")
    with open('source.json', 'r') as f:
        source_data = json.load(f)

    output_data = []
    errors = []
    sending_pattern = DEFUALT_SENDING_PATTERN.copy()
    for book in tqdm(source_data):
        txt_to_search = book['name']
        link_to_book = book['link']
        description = book['pargraph']
        sending_pattern['FreeText_1'] = txt_to_search
        try:
            r = requests.post(URL_TO_SEARCH_API, params=sending_pattern)
            response = HtmlResponse(url=URL_TO_SEARCH_API,
                                    body=r.text,
                                    encoding='utf-8')
            founded_books = response.selector.css('td[width="222"]')
            if len(founded_books) > 0:
                founded_books_title = []
                for book_html in founded_books:
                    book_title = ''.join(
                        book_html.css('*::text').extract()).replace('\n', '')
                    founded_books_title.append(book_title)
                founded_books_title_str = '|'.join(founded_books_title)
                output_data.append({
                    **book, 'founded_books_title':
                    founded_books_title_str
                })
        except Exception as e:
            errors.append({'book': book, 'error': str(e)})
            print(f'ERROR: {e}')
Code example #17
def load_html():
    # Read the stub page from disk and wrap it in an HtmlResponse for offline tests.
    with codecs.open("test/resources/covid_stub.html", 'r') as file:
        return HtmlResponse(url="my HTML string",
                            body=file.read(),
                            encoding='utf-8')
Code example #18
def fake_response(url: str) -> Response:
    body = bytes(requests.get(url).text, 'UTF-8')
    response = HtmlResponse(url, body=body)
    return response
Code example #19
 def test_restrict_xpaths_with_html_entities(self):
     html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
     response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='iso8859-15')
     links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
     self.assertEqual(links,
                      [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text=u'text')])
Code example #20
    def process_request(self, request, spider):
        """
        Fetch the page with PhantomJS.
        """
        # self.logger.debug('PhantomJS is Starting')
        page = request.meta.get('page', 1)
        print("got here")
        # m itself is unimportant; it only decides whether the page gets reloaded
        m = random.randint(2, 202)

        try:
            if page == 1:
                self.driver.get(request.url)
                time.sleep(random.uniform(1, 3))
                self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
                self.driver.execute_script('window.scrollBy(0, 1200)')
                time.sleep(random.uniform(0.5, 1.5))
                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                return HtmlResponse(url=request.url, body=self.driver.page_source, request=request, encoding='utf-8',
                                    status=200)
            if page <= m:
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                input.clear()
                input.send_keys(page)
                submit.click()

                time.sleep(random.uniform(1, 3))

                self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
                self.driver.execute_script('window.scrollBy(0, 1200)')
                time.sleep(random.uniform(0.5, 1.5))
                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                return HtmlResponse(url=request.url, body=self.driver.page_source, request=request, encoding='utf-8',
                                    status=200)

            if page > m:
                self.driver.get(request.url)
                self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')

                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                input.clear()
                input.send_keys(page)
                submit.click()

                time.sleep(random.uniform(1, 3))

                self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
                self.driver.execute_script('window.scrollBy(0, 1200)')
                time.sleep(random.uniform(0.5, 1.5))
                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                return HtmlResponse(url=request.url, body=self.driver.page_source, request=request, encoding='utf-8',
                                    status=200)
        except TimeoutException:
            self.driver.get(request.url)
            return HtmlResponse(url=request.url, status=500, request=request)
Code example #21
 def getScrapyResponse(self, url):
     response = self.downloadUsingSelenium(url)
     response = HtmlResponse(url=url, body=response, encoding='utf-8')
     return response
Code example #22

"""
Scrapy选择器是Selector通过传递文本或TextResponse 对象构造的类的实例。
它根据输入类型自动选择最佳解析规则(XML vs HTML)
"""

from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# Build from text
body = '<html><body><span>good</span></body></html>'
print(Selector(text=body).xpath('//span/text()').extract())

# Build from a response
response = HtmlResponse(url='https://sebastianraschka.com/blog/index.html', body=body, encoding='utf-8')
print(Selector(response=response).xpath('//*/h1[@class="post-title"]/text()').extract())
# The line above is equivalent to the line below
print(response.selector.xpath('//*/h1[@class="post-title"]/text()').extract())

response = r"""
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
Code example #23
 def test_priority_adjust(self):
     req = Request('http://a.com')
     rsp = HtmlResponse(req.url, body=self._body())
     req2 = self.mw.process_response(req, rsp, self.spider)
     assert req2.priority > req.priority
Code example #24
File: conv.py Project: ameetbora/facebook-comments
def body_html(response_body: bytes) -> HtmlResponse:
    return HtmlResponse(url="",
                        body=to_json(response_body)["domops"][0][3]["__html"],
                        encoding="utf-8")
Code example #25
def crawl_product_id():
    product_id_list = []
    i = 1
    while (i < 3):
        driver = webdriver.Chrome("C:/bin/chromedriver.exe",
                                  chrome_options=options)
        driver.get(laptop_page_url.format(i))
        if "https://shopee.vn/Laptop-cat.13030.13065" in laptop_page_url.format(
                i):
            y = 2300
            x = 1
            while y <= 4800:
                driver.execute_script("window.scrollTo(0, " + str(y) + ")")
                y += 1000
                # print("aaaaaaaaaaa")
                # try:
                #     print("bbbbbbb" ,WebDriverWait(driver, 1).until(EC.presence_of_element_located(
                #         (By.XPATH, '//*[@class="row shopee-search-item-result__items"]/div[{}]/div/a/div/div[2]/div[1]/div'.format({x})))))
                #     print("Page is ready!")
                # except TimeoutException:
                #     print("cccccccc")
                #     print("Loading took too much time!")
                x += 10
            body = driver.page_source
            abc = driver.current_url
            response = HtmlResponse(abc, body=body, encoding='utf8')
            print(body)
            if response is None:
                break

            for product in response.css(
                    "div.col-xs-2-4.shopee-search-item-result__item"):
                try:
                    url = product.css("div a::attr(href)").get()
                    print("link ok: ", url)

                    product_key = url.rsplit("-i.", 1)[1]
                    # product_id_dict = {"shop_id": product_key[0], "item_id": product_key[1]}
                    # shop_id = product_key[0]
                    # item_id = product_key[1]
                    # parser = BeautifulSoup(body, 'html.parser')
                    # product_box = parser.findAll(class_="col-xs-2-4 shopee-search-item-result__item", )
                    # if (len(product_box) == 0):
                    #     break
                    # print(product_box[0])
                    # for product in product_box:
                    #     # href = product.get("href").rsplit("-i.", 1)[1]
                    #     # product_id = href.split(".html")[0]
                    #     product_id = product.get("div a::attr(href)")
                    #     # product_id = product.css("div a::attr(href)").get()
                    #     # product_id = product.get("href")
                    product_id_list.append(product_key)
                except Exception:
                    print("no!")
        driver.close()
        print("Crawl page: ", i)
        print(product_id_list)
        # response = requests.get(laptop_page_url.format(i), params=params, headers=headers)
        # parser = BeautifulSoup(response.text, 'html.parser')
        # # print(response.content)
        # product_box = parser.findAll('a', class_="col-xs-2-4 shopee-search-item-result__item")
        #
        # if (len(product_box) == 0):
        #     break
        #
        # for product in product_box:
        #     href = product.get("href")
        #     print(href)

        i += 1

    return product_id_list, i
Code example #26
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

body = '<html><body><span>good</span></body></html>'
p = Selector(text=body).xpath('//span/text()').extract()

print(p)

response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')
print(Selector(response=response).xpath('//span/text()').extract())
Code example #27
    def test_generic_form_requests_with_file_field(self):
        name = "ebay2"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        self.assertEqual(generic_form_request.url,
                         'file://tmp/test_params.txt')
        response = HtmlResponse(url='file://tmp/test_params.txt',
                                body=open(
                                    join(_PATH, "data",
                                         "test_params.txt")).read())
        response.request = generic_form_request
        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        expected = [{
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {
                u'xpath':
                u"//form[@name='adv_search_from']",
                u'form_url':
                u'http://*****:*****@name='_nkw']",
                    'file_values': ['Cars', 'Boats'],
                    u'type': u'inurl',
                    u'value': u'file://tmp/test_params.txt'
                }, {
                    u'type': u'inurl',
                    u'name': u'_nkw2',
                    u'value': u'file://tmp/test_params.txt'
                }, {
                    u'xpath': u".//*[@name='_in_kw']",
                    u'type': u'iterate'
                }]
            },
            'headers': {},
            'url': u'file://tmp/test_params.txt',
            'dont_filter': True,
            'priority': 0,
            'callback': 'parse_field_url_page',
            'method': 'GET',
            'errback': None
        }]
        self.assertEqual(request_list, expected)

        generic_form_request = requests[0]
        self.assertEqual(generic_form_request.url,
                         'file://tmp/test_params.txt')
        response = HtmlResponse(url='file://tmp/test_params.txt',
                                body=open(
                                    join(_PATH, "data",
                                         "test_params.txt")).read())
        response.request = generic_form_request

        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        expected = [{
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {
                u'xpath':
                u"//form[@name='adv_search_from']",
                u'fields': [{
                    u'xpath': u".//*[@name='_nkw']",
                    'file_values': ['Cars', 'Boats'],
                    u'type': u'inurl',
                    u'value': u'file://tmp/test_params.txt'
                }, {
                    'file_values': ['Cars', 'Boats'],
                    u'type': u'inurl',
                    u'name': u'_nkw2',
                    u'value': u'file://tmp/test_params.txt'
                }, {
                    u'xpath': u".//*[@name='_in_kw']",
                    u'type': u'iterate'
                }],
                u'type':
                u'form',
                'field_index':
                1
            },
            'headers': {},
            'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
            'dont_filter': True,
            'priority': 0,
            'callback': 'parse_form_page',
            'method': 'GET',
            'errback': None
        }]
        self.assertEqual(request_list, expected)

        generic_form_request = requests[0]
        self.assertEqual(
            generic_form_request.url,
            'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc')
        response = HtmlResponse(
            url="http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
            body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
        response.request = generic_form_request
        request_list = [
            request_to_dict(req, spider)
            for req in generic_form_request.callback(response)
        ]
        expected = [{
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
            'dont_filter': True,
            'priority': 0,
            'callback': 'parse',
            'method': 'GET',
            'errback': None
        }]
        self.assertEqual(request_list, expected)
Code example #28
 def process_request(self, request, spider):
     driver = webdriver.PhantomJS()
     driver.get(request.url)
     return HtmlResponse(request.url,
                         encoding='utf-8',
                         body=driver.page_source.encode('utf-8'))
Code example #29
File: test_contrib_loader.py Project: zrbruce/scrapy
class SelectortemLoaderTest(unittest.TestCase):
    response = HtmlResponse(url="", body="""
    <html>
    <body>
    <div id="id">marta</div>
    <p>paragraph</p>
    <a href="http://www.scrapy.org">homepage</a>
    <img src="/images/logo.png" width="244" height="65" alt="Scrapy">
    </body>
    </html>
    """)

    def test_constructor(self):
        l = TestItemLoader()
        self.assertEqual(l.selector, None)

    def test_constructor_errors(self):
        l = TestItemLoader()
        self.assertRaises(RuntimeError, l.add_xpath, 'url', '//a/@href')
        self.assertRaises(RuntimeError, l.replace_xpath, 'url', '//a/@href')
        self.assertRaises(RuntimeError, l.get_xpath, '//a/@href')
        self.assertRaises(RuntimeError, l.add_css, 'name', '#name::text')
        self.assertRaises(RuntimeError, l.replace_css, 'name', '#name::text')
        self.assertRaises(RuntimeError, l.get_css, '#name::text')

    def test_constructor_with_selector(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assert_(l.selector is sel)

        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_selector_css(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assert_(l.selector is sel)

        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)

        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response_css(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)

        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

        l.add_css('url', 'a::attr(href)')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])

        # combining/accumulating CSS selectors and XPath expressions
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta', u'Marta'])

        l.add_xpath('url', '//img/@src')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org', u'/images/logo.png'])

    def test_add_xpath_re(self):
        l = TestItemLoader(response=self.response)
        l.add_xpath('name', '//div/text()', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

    def test_replace_xpath(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath('name', '//p/text()')
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

        l.replace_xpath('name', ['//p/text()', '//div/text()'])
        self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])

    def test_get_xpath(self):
        l = TestItemLoader(response=self.response)
        self.assertEqual(l.get_xpath('//p/text()'), [u'paragraph'])
        self.assertEqual(l.get_xpath('//p/text()', TakeFirst()), u'paragraph')
        self.assertEqual(l.get_xpath('//p/text()', TakeFirst(), re='pa'), u'pa')

        self.assertEqual(l.get_xpath(['//p/text()', '//div/text()']), [u'paragraph', 'marta'])

    def test_replace_xpath_multi_fields(self):
        l = TestItemLoader(response=self.response)
        l.add_xpath(None, '//div/text()', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath(None, '//p/text()', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

    def test_replace_xpath_re(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath('name', '//div/text()', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

    def test_add_css_re(self):
        l = TestItemLoader(response=self.response)
        l.add_css('name', 'div::text', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

        l.add_css('url', 'a::attr(href)', re='http://(.+)')
        self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])

    def test_replace_css(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_css('name', 'p::text')
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

        l.replace_css('name', ['p::text', 'div::text'])
        self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])

        l.add_css('url', 'a::attr(href)', re='http://(.+)')
        self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])
        l.replace_css('url', 'img::attr(src)')
        self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])

    def test_get_css(self):
        l = TestItemLoader(response=self.response)
        self.assertEqual(l.get_css('p::text'), [u'paragraph'])
        self.assertEqual(l.get_css('p::text', TakeFirst()), u'paragraph')
        self.assertEqual(l.get_css('p::text', TakeFirst(), re='pa'), u'pa')

        self.assertEqual(l.get_css(['p::text', 'div::text']), [u'paragraph', 'marta'])
        self.assertEqual(l.get_css(['a::attr(href)', 'img::attr(src)']),
            [u'http://www.scrapy.org', u'/images/logo.png'])

    def test_replace_css_multi_fields(self):
        l = TestItemLoader(response=self.response)
        l.add_css(None, 'div::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_css(None, 'p::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

        l.add_css(None, 'a::attr(href)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        l.replace_css(None, 'img::attr(src)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])

    def test_replace_css_re(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('url', 'a::attr(href)')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        l.replace_css('url', 'a::attr(href)', re=r'http://www\.(.+)')
        self.assertEqual(l.get_output_value('url'), [u'scrapy.org'])
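
TestItemLoader is defined elsewhere in this test module; the assertions above (input 'marta', output [u'Marta']) imply a title-casing input processor on the name field. A minimal sketch of such a loader, assuming the old scrapy.contrib.loader API this file uses; the real definition may differ:

from scrapy.item import Item, Field
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import MapCompose

class TestItem(Item):
    name = Field()
    url = Field()

class TestItemLoader(ItemLoader):
    default_item_class = TestItem
    name_in = MapCompose(lambda v: v.title())  # 'marta' -> 'Marta'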
Code example #30
 def setUp(self):
     body = get_testdata("link_extractor", "linkextractor.html")
     self.response = HtmlResponse(url="http://example.com/index",
                                  body=body)