Example #1
    def parse(self, response):
        print "test point"
        response = HtmlResponse(url=response.url, status=response.status, headers=response.headers, body=response.body)
        url = response.url
        #first_active = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a').extract()
        
#        active_page_list = response.xpath('//*[@id="zh-list-answer-wrap"]/div/h2/a/text()').extract()
        active_page_list = response.xpath('//*[@id="zh-list-answer-wrap"]/div')
        file_obj = open('collection_now.log', 'w')

        for active_block in active_page_list:
            #active = active_block.xpath('.//div[1]/text()').extract()[1].strip()
            #question = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/text()').extract()
            #answer_link = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/@href').extract()[0]
            
            #if 'http' not in answer_link:
            #    answer_link = "http://www.zhihu.com" + answer_link
            question = active_block.xpath('.//h2/a/text()').extract()[0]
#            answer_link = active_block.xpath('.//div/div[1]/div[4]/div/a[@href="toggle-expand"]/@href').extract()
            answer_link = active_block.xpath('.//div/div[1]/div[4]/div/a/@href').extract()
            if len(answer_link) > 0:
                answer_link_str = answer_link[0]
                if 'http' not in answer_link_str:
                    answer_link_str = "http://www.zhihu.com" + answer_link_str
#                print question, answer_link_str
                file_obj.write(question.encode('utf-8') + '\t' + answer_link_str.encode('utf-8') + '\n')

#            file_obj.write('\n')

        file_obj.close()
Example #2
    def parse_kb(self, response):
        # initial html tokenization to find regions segmented by e.g. "======"
        # or "------"
        filtered = response.xpath(
            "//div[@class='sfdc_richtext']").extract()[0].split("=-")

        for entry in [x and x.strip() for x in filtered]:
            resp = HtmlResponse(url=response.url, body=entry,
                                encoding=response.encoding)

            for link in resp.xpath("//a"):
                href = link.xpath("@href").extract()[0]
                if "cache-www" in href:
                    text = resp.xpath("//text()").extract()
                    text_next = link.xpath("following::text()").extract()

                    item = FirmwareLoader(item=FirmwareImage(),
                                          response=response,
                                          date_fmt=["%b %d, %Y", "%B %d, %Y",
                                                    "%m/%d/%Y"])

                    version = FirmwareLoader.find_version_period(text_next)
                    if not version:
                        version = FirmwareLoader.find_version_period(text)

                    item.add_value("version", version)
                    item.add_value("date", item.find_date(text))
                    item.add_value("url", href)
                    item.add_value("product", response.meta["product"])
                    item.add_value("vendor", self.name)
                    yield item.load_item()
	def parse_json(self,response):
		data = response.body[1:-1]
		js = json.loads(data)
		response = HtmlResponse(url=response.url,body=js['data'].encode('utf8'))
		for href in response.css(settings["el_nacional"]['links']):
			full_url = response.urljoin(href.extract())
			yield scrapy.Request(full_url, callback=self.parse_links)
Example #4
 def parse_shop(self, response):
     print '\r\n\t======== Page Crawl Start - Company -----------'
     hxs = HtmlXPathSelector(response)
     item = init_item('shop') # initialize the shop item
     try :
         if conf['show_messages'] : print '----Company Fetch Start----'
     #-- parsing code: start ------------------------------------------------------------------------------
         item['url'] = response.url
         item['logo_src'] = 'http://baidu.com/abc/ddd.jpg'
         item['photo_src'] = '/image/abcd.jpg'
         newurl = 'http://cn.china.cn' # build the company-profile page URL
         try : # try to load the new page through a proxy IP
             proxy_handle = urllib2.ProxyHandler({ 'http' : get_proxy() })
             opener = urllib2.build_opener(proxy_handle)
             temp = opener.open(newurl,timeout=30) # send the request
         except : # retry once; if it still fails, give up
             proxy_handle = urllib2.ProxyHandler({ 'http' : get_proxy() })
             opener = urllib2.build_opener(proxy_handle)
             temp = opener.open(newurl,timeout=30) # send the request
         temp = temp.read() # read the response data
         newresponse = HtmlResponse(newurl)
         newresponse._set_body(temp)
         hxs = HtmlXPathSelector(newresponse) # build a new XPath selector
         #print temp
     #-- parsing code: end --------------------------------------------------------------------------------
         if conf['show_messages'] : print '---- Fetch Success ----'
     except EOFError,e :
         if conf['show_messages'] : print '----Company Fetch Error Start----'
         print e
         if conf['show_messages'] : print '----Company Fetch Error End----'
Example #5
    def test_login_requests(self):
        name = "pinterest.com"
        spider = self.smanager.create(name)
        login_request = list(spider.start_requests())[0]

        response = HtmlResponse(url="https://pinterest.com/login/", body=open(join(_PATH, "data", "pinterest.html")).read())
        response.request = login_request
        form_request = login_request.callback(response)
        expected = {'_encoding': 'utf-8',
            'body': 'email=test&password=testpass&csrfmiddlewaretoken=nLZy3NMzhTswZvweHJ4KVmq9UjzaZGn3&_ch=ecnwmar2',
            'callback': 'after_login',
            'cookies': {},
            'dont_filter': True,
            'errback': None,
            'headers': {'Content-Type': ['application/x-www-form-urlencoded']},
            'meta': {},
            'method': 'POST',
            'priority': 0,
            'url': u'https://pinterest.com/login/?next=%2F'}

        self.assertEqual(request_to_dict(form_request, spider), expected)

        # simulate a simple response to login post from which extract a link
        response = HtmlResponse(url="http://pinterest.com/", body="<html><body><a href='http://pinterest.com/categories'></body></html>")
        result = list(spider.after_login(response))
        self.assertEqual([r.url for r in result], ['http://pinterest.com/categories', 'http://pinterest.com/popular/'])
Example #6
    def parse(self, response):
        print "test point"
        response = HtmlResponse(url=response.url, status=response.status, headers=response.headers, body=response.body)
        url = response.url
        #first_active = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a').extract()
        
        #active_page_list = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a[@class="question_link" or @class="post-link"]/text()').extract()
        active_page_list = response.xpath('//*[@id="zh-profile-activity-page-list"]/div')

        file_obj = open('active_now.log', 'w')

        for active_block in active_page_list:
            active = active_block.xpath('.//div[1]/text()').extract()[1].strip()
            question = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/text()').extract()
            answer_link_list = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/@href').extract()
            answer_link = ""
            if len(answer_link_list) > 0:
                answer_link = answer_link_list[0]
            
            question_txt = ""
            if len(question) > 0:
                question_txt = question[0] 
            if 'http' not in answer_link:
                answer_link = "http://www.zhihu.com" + answer_link
            file_obj.write(active.encode('utf-8') + '\t' + question_txt.encode('utf-8') + '\t' + answer_link.encode('utf-8') + '\n')

#            file_obj.write('\n')
            print answer_link
        file_obj.close()
Example #7
    def parse(self, response):
        marker_txt = re.findall(re.compile("markerData.*\}", re.MULTILINE), response.body_as_unicode())
        if not len(marker_txt):
            return
        markers_json = "{\"" + marker_txt[0]
        markers = list(json.loads(markers_json).values())[0]

        if not len(markers):
            return
        for marker in markers:
            marker_response = HtmlResponse(url="", body=marker["info"].encode("utf-8"))
            hours = re.findall(r"\{\"label.*\}", marker["info"])
            hours = hours[0]
            parsed_hours = json.loads(hours)

            addr_parts = marker_response.css(".address span:not(.phone)::text").extract()
            url = marker_response.css("header a").xpath("@href").extract_first()
            city, state = addr_parts[-1].split(",")

            yield GeojsonPointItem(lat=marker.get("lat"), lon=marker.get("lng"),
                                   name=marker_response.css("header a::text").extract_first(default=None),
                                   addr_full=", ".join(addr_parts),
                                   city=city.strip(),
                                   state=state.strip(),
                                   country="United States",
                                   phone=marker_response.css(".phone::text").extract_first(),
                                   website=url,
                                   opening_hours=get_hours(parsed_hours["days"]),
                                   ref=url.split("/")[-1].split(".")[0])
    def crawlListPage(self):
        print 'Start crawling the hotel list pages'
        self.openPage(
            "http://hotel.elong.com/nanjing/"
        )
        # loop counter within the current page (starts at 0)
        loop_num = 0
        # whether the current page has been handled: False = not yet, True = done
        if_handle = False

        # total number of pages
        page_num = 0
        hotel_num = int(self.driver.find_element_by_xpath("//span[@class='t24 mr5']").text)
        if hotel_num % 20==0:
            page_num = hotel_num/20
        else:
            page_num = hotel_num/20 + 1

        # for testing: crawl only 5 pages
        #page_num = 5

        while page_num>=1:
            loop_num += 1
            self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
            #self.driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_UP)
            if u"返后价" in self.driver.page_source:
                if if_handle == False:
                    self.__parseUrls(self.driver.page_source)
                    print u"Number of hotels collected: %d" % len(self.listPageInfo)
                    if_handle = True
                try:
                    # if the page is still loading, wait 0.1s and check again
                    response = HtmlResponse(url="My HTML String",body=self.driver.page_source,encoding="utf-8")
                    _loading = response.xpath("//div[@id='_loading_']/@style").extract()
                    while 1:
                        if _loading == []:
                            break
                        if u'none' in _loading[0]:
                            break
                        else:
                            #print 'still loading......'
                            time.sleep(0.1)
                            response = HtmlResponse(url="My HTML String",body=self.driver.page_source,encoding="utf-8")
                            _loading = response.xpath("//div[@id='_loading_']/@style").extract()
                    if u"下一页" in self.driver.page_source:
                        self.driver.find_element_by_xpath("//div[@class='paging1']/a[@class='page_next']").click()
                        page_num -= 1
                        if_handle = False
                        loop_num = 0
                        time.sleep(random.uniform(1, 3))
                except Exception, e:
                    print "error happened while clicking next-page"
                    print e

            if loop_num != 0:
                if loop_num < 15:
                    time.sleep(1)
                    continue
                else:
                    break
Example #9
    def parse_kb(self, response):
        mib = None

        # need to perform some nasty segmentation because different firmware versions are not clearly separated
        # reverse order to get MIB before firmware items
        for entry in reversed(response.xpath(
                "//div[@id='support-article-downloads']/div/p")):
            for segment in reversed(entry.extract().split("<br><br>")):
                resp = HtmlResponse(
                    url=response.url, body=segment, encoding=response.encoding)
                for href in resp.xpath("//a/@href").extract():
                    text = resp.xpath("//text()").extract()

                    if "MIBs" in href:
                        mib = href

                    elif "firmware" in href:
                        text = resp.xpath("//text()").extract()

                        item = FirmwareLoader(
                            item=FirmwareImage(), response=resp, date_fmt=["%m/%d/%Y"])
                        item.add_value("date", item.find_date(text))
                        item.add_xpath("url", "//a/@href")
                        item.add_value("mib", mib)
                        item.add_value("product", response.meta["product"])
                        item.add_value("vendor", self.name)
                        item.add_value(
                            "version", FirmwareLoader.find_version_period(text))
                        yield item.load_item()
 def parse_lobbyist(self, html, fn):
     response = HtmlResponse('http://localhost/test.html',
                             body='<book>%s</book>' % html)
     rows = response.css('row')
     item = fn(response, rows)
     actual = dict(item)
     return actual
 def parse_row(self, html, fn):
     response = HtmlResponse('http://localhost/test.html',
                             body='<table>%s</table>' % html)
     row = response.css('tr')[0]
     item = fn(response, row)
     actual = dict(item)
     return actual
 def __crawlHotelComment(self,driver,hotel_id ,pagenum):
     pagenum = int(pagenum)
      # iterate over every page
     while pagenum>=1:
         response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
         loading = response.xpath("//div[@id='commentLoading']/@style").extract()[0]
          # only scrape once the loading indicator is hidden
         while loading!=u'display: none;':
              print 'still loading......'
             time.sleep(0.1)
             response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
             loading = response.xpath("//div[@id='commentLoading']/@style").extract()[0]
         itemlist =  response.xpath("//ul[@class='dcomt_list']/li")
         for item in itemlist:
             username = item.xpath(".//div[@class='dcomt_head left']/div[2]/span/text()").extract()[0]
             remarkText = item.xpath(".//p[@class='dcomt_con_txt']/text()").extract()[0]
              # TODO: filter out non-Chinese characters; needs rework
             remarkText = remarkText.encode("gbk",'ignore')
             remarkText = remarkText.decode("gbk")
             remark = ''
             for string in remarkText:
                 remark = remark + re.sub("\s+", "", string)
             user_type = item.xpath(".//div[@class='dcomt_head_pic']/p/text()").extract()[0]
             comm_time = item.xpath(".//span[@class='dcomt_con_time']/text()").extract()[0]
             goodorbad = item.xpath(".//p[@class='mb5']/i/@class").extract()[0]
             comm_type = ''
             if u'good' in  goodorbad:
                 comm_type = "值得推荐"
             if u'bad' in goodorbad:
                 comm_type = "有待改善"
             senti_value = self.hotelNLP.sentiment(remark.encode("utf-8"))
             viewpoint = json.dumps(self.hotelNLP.viewpoint(remark.encode("utf-8"),decoding="utf-8"))
             comm ={
                 "guid":uuid.uuid1(),
                 "username":username,
                 "remark":remark,
                 "comm_time":comm_time,
                 "user_type":user_type,
                 "comm_type":comm_type,
                 "senti_value":senti_value,
                 "viewpoint":viewpoint,
                 "baseinfo_id":hotel_id
             }
             if self.__is_exist_in_comment_list(comm) is False:
                 self.commList.append(comm)
             else:
                 #print comm['remark']
                 pass
         if pagenum == 1:
             break
          # click "next page"
         self.scroll_and_click_by_xpath("//div[@id='comment_paging']/a[@class='page_next']")
         pagenum  -= 1
         time.sleep(random.uniform(1,4))
         print pagenum
     return True
 def parse_field(self, html, fn):
     response = HtmlResponse('http://localhost/test.html',
                             body='<table><tr>%s</tr></table>' % html)
     row = response.css('tr')[0]
     node = response.css('td')[0]
     lobbyist = Loader(self.spider, response, Lobbyist(), row)
     lobbyist.add_value(None, fn(node))
     item = lobbyist.load_item()
     actual = dict(item)
     return actual
    def test_caching(self):
        r1 = HtmlResponse('http://www.example.com', body=b'<html><head></head><body></body></html>')
        r2 = r1.copy()

        doc1 = LxmlDocument(r1)
        doc2 = LxmlDocument(r1)
        doc3 = LxmlDocument(r2)

        # make sure it's cached
        assert doc1 is doc2
        assert doc1 is not doc3
Example #15
    def test_generic_form_requests_with_spider_args(self):
        name = "ebay3"
        args = {'search_string': 'Cars'}
        spider = self.smanager.create(name, **args)
        generic_form_request = list(spider.start_requests())[0]

        response = HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
        response.request = generic_form_request
        request_list = [request_to_dict(req, spider)
                             for req in generic_form_request.callback(response)]
        expected = [{'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'dont_filter': True, 'priority': 0, 'callback': 'parse', 'method': 'GET', 'errback': None}]
        self.assertEqual(request_list, expected)
Example #16
def get_response(url, meta={}):
    url = canonicalize_url(url)
    r = requests.get(url)

    res = r.text
    final_url = r.url

    to_encoding = 'utf-8'
    response = HtmlResponse(url=final_url, body=res, encoding=to_encoding)
    response.request = Request(url, meta=meta)

    return response
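A short, hypothetical usage sketch for the helper above; the URL, the XPath and the meta key are illustrative only and not taken from the original project:

# illustrative use of get_response(); selectors and meta behave as in a spider callback
resp = get_response('http://example.com/', meta={'depth': 0})
print(resp.xpath('//title/text()').extract_first())  # selectors work on the wrapped body
print(resp.meta['depth'])                             # meta rides on the attached Request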
Example #17
 def process_response(self, request, response, spider):
     #log.msg('%s is type %s' % (response.url, type(response)), level=log.DEBUG)
     if type(response) is Response and not _file_pattern.match(response.url):
         response = HtmlResponse(response.url, body=response.body)
         
     if hasattr(response, 'body_as_unicode'):
         hdoc = html.fromstring(response.body_as_unicode())
         links = hdoc.xpath('//a')
         for link in links:
             href = link.get('href')
             link.set('href', urlparse.urljoin(get_base_url(response), href) )    
         return response.replace(body=html.tostring(hdoc, encoding='unicode'))            
     return response
    def test_caching(self):
        r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
        r2 = r1.copy()

        doc1 = LxmlDocument(r1)
        doc2 = LxmlDocument(r1)
        doc3 = LxmlDocument(r2)

        # make sure it's cached
        assert doc1 is doc2
        assert doc1 is not doc3

        # don't leave documents in memory to avoid wrong libxml2 leaks reports
        del doc1, doc2, doc3
Example #19
 def parse(self, response):
    #pprint.pprint("------------------------------")
    cadena_temp_1 = response.body.split("<TABLE  CELLSPACING=1>")
    cadena_temp_1 = cadena_temp_1[1].split("</TABLE>")
    cadena_temp_1[0] = '<HTML><BODY><TABLE  CELLSPACING=1>'.lower() + cadena_temp_1[0].lower() + '</TABLE></BODY></HTML>'.lower()
    response_2 = HtmlResponse(url="http://nuforc.org/webreports/ndxevent.html", body=cadena_temp_1[0])
    for registro in response_2.xpath('.//body/table/tbody/tr'):
         item_tabla = CrawlerUfoItem()
         item_tabla['report_href'] = registro.xpath('td[1]/font/a/@href').extract()[0]
         item_tabla['report_text'] = registro.xpath('td[1]/font/a/text()').extract()[0]
         item_tabla['count'] = registro.xpath('td[2]/font/text()').extract()[0]
         #pprint.pprint(item_tabla)
         url_nuevo =	'http://nuforc.org/webreports/' + item_tabla['report_href']
         yield scrapy.Request(url_nuevo, body = "", method = 'GET', headers={"content-type":"application/x-www-form-urlencoded"}, callback = self.parse_2, dont_filter = True)
 def __parseUrls(self, page_source):
     response = HtmlResponse(url="my HTML string",body=page_source,encoding="utf-8")
      # extract this page's hotel URLs into urlList
     urlList = response.xpath("//a[@class='name']/@href").extract()
     commnumList = response.xpath("//div[@class='comment']/a/span/text()").extract()
     name_list = response.xpath("//a[@class='name']/text()").extract()
     if len(urlList) == len(commnumList) == len(name_list):
         for i in range(0,len(urlList)):
             self.listPageInfo.append({
                 "guid":uuid.uuid1(),
                 "url":urlList[i],
                 "hotel_name":name_list[i],
                 "OTA":"途牛",
                 "comm_num":int(commnumList[i]),
             })
Example #21
    def _get_url(url, request_kwargs={}):
        '''Returns a scrapy.http.HtmlResponse with the contents of the received
        url.

        Note that the session is kept intact among multiple calls to this
        method (i.e. cookies are passed over).
        '''
        response = betamax_session.get(url)
        scrapy_response = HtmlResponse(
            url=str(response.url),
            body=response.content,
        )
        scrapy_response.request = Request(url, **request_kwargs)

        return scrapy_response
Example #22
 def parse(self, response):
     # Wiener Linien returns HTML with an XML content type which creates an
     # XmlResponse.
     response = HtmlResponse(url=response.url, body=response.body)
     for item in response.css(".block-news-item"):
         il = FeedEntryItemLoader(
             response=response,
             timezone="Europe/Vienna",
             ignoretz=True,
             base_url="https://www.{}".format(self.name),
         )
         link = response.urljoin(item.css("a::attr(href)").extract_first())
         il.add_value("link", link)
         il.add_value("title", item.css("h3::text").extract_first())
         il.add_value("updated", item.css(".date::text").extract_first())
         yield scrapy.Request(link, self.parse_item, meta={"il": il})
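Rebuilding the response through the constructor, as above, keeps only the attributes that are passed in explicitly. A minimal alternative sketch, assuming a reasonably recent Scrapy version, is to re-type the response in place with Response.replace, which carries over url, status, headers and body:

from scrapy.http import HtmlResponse

# sketch: turn the XmlResponse into an HtmlResponse while keeping its other attributes
response = response.replace(cls=HtmlResponse)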
    def test_response_libxml2_caching(self):
        r1 = HtmlResponse("http://www.example.com", body="<html><head></head><body></body></html>")
        r2 = r1.copy()

        doc1 = Libxml2Document(r1)
        doc2 = Libxml2Document(r1)
        doc3 = Libxml2Document(r2)

        # make sure it's cached
        assert doc1 is doc2
        assert doc1.xmlDoc is doc2.xmlDoc
        assert doc1 is not doc3
        assert doc1.xmlDoc is not doc3.xmlDoc

        # don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
        del doc1, doc2, doc3
Example #24
 def parse_3(self, response, item):
    cadena_temp_2 = response.body.lower().replace("<p>","")
    response = HtmlResponse(url=response.url, body=cadena_temp_2)
    #pprint.pprint(response.xpath('.//body/table/font/caption/b/text()').extract()[0])
    #pprint.pprint("******************************")
    pprint.pprint(item)
    for registro in response.xpath('.//body/table/tbody'):
        if not registro.xpath('tr[1]/td/font').extract():
            item["detalle1"] = ""
        else:
            item["detalle1"] = registro.xpath('tr[1]/td/font').extract()[0]
        if not registro.xpath('tr[2]/td/font').extract():
            item["detalle2"] = ""
        else:
            item["detalle2"] = registro.xpath('tr[2]/td/font').extract()[0]
        #pprint.pprint(item_tabla_3)
        yield item
Example #25
def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """Ensure spider always picks up relevant links to HTML pages"""
    # Use only 1 user agent for easier counting
    ua = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [ua]

    # Generate a mock response based on html containing two links
    mock_response = HtmlResponse(url='http://test:12345',
                                 body=mock_html_twolinks,
                                 encoding='utf-8')
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.meta['user_agent'] = ua
    mock_response.meta['sitescan'] = factories.SiteScanFactory()
    mock_response.status = 200
    mock_response.flags = []

    # Call spider on the mock response
    pipeline_generator = spider.parse(mock_response)

    # We should have two new requests and one MarkupItem
    sites_expected = set([
        mock_response.url + '/link1.html',
        mock_response.url + '/link2.html',
    ])

    sites_collected = []
    for elem in pipeline_generator:
        if isinstance(elem, Request):
            sites_collected.append(elem.url)
        else:
            assert isinstance(elem, MarkupItem)

    assert sites_expected == set(sites_collected)
Example #26
 def parse_2(self, response):
    cadena_temp_1 = response.body.split("<TABLE  CELLSPACING=1>")
    cadena_temp_1 = cadena_temp_1[1].split("</TABLE>")
    cadena_temp_1[0] = ('<HTML><BODY><TABLE  CELLSPACING=1>' + cadena_temp_1[0] + '</TABLE></BODY></HTML>').lower()
    response = HtmlResponse(url=response.url, body=cadena_temp_1[0])
    #pprint.pprint("++++++++++++++++++++++++++++++")
    for registro in response.xpath('.//body/table/tbody/tr'):
        item = Crawler_2Item()
        if not registro.xpath('td[1]/font/a/text()').extract():
            item["date_text"] = ""
        else:
            item["date_text"] = registro.xpath('td[1]/font/a/text()').extract()[0]
        if not registro.xpath('td[1]/font/a/@href').extract():
            item["date_href"] = ""
        else:
            item["date_href"] = registro.xpath('td[1]/font/a/@href').extract()[0]
        if not registro.xpath('td[2]/font/text()').extract():
            item["city"] = ""
        else:
            item["city"] = registro.xpath('td[2]/font/text()').extract()[0]
        if not registro.xpath('td[3]/font/text()').extract():
            item["state"] = ""
        else:
            item["state"] = registro.xpath('td[3]/font/text()').extract()[0]
        if not registro.xpath('td[4]/font/text()').extract():
            item["shape"] = ""
        else:
            item["shape"] = registro.xpath('td[4]/font/text()').extract()[0]
        if not registro.xpath('td[5]/font/text()').extract():
            item["duration"] = ""
        else:
            item["duration"] = registro.xpath('td[5]/font/text()').extract()[0]
        if not registro.xpath('td[6]/font/text()').extract():
            item["summary"] = ""
        else:
            item["summary"] = registro.xpath('td[6]/font/text()').extract()[0]
        if not registro.xpath('td[7]/font/text()').extract():
            item["posted"] = ""
        else:
            item["posted"] = registro.xpath('td[7]/font/text()').extract()[0]
        #pprint.pprint(item_tabla_2) 
        url_nuevo =	'http://nuforc.org/webreports/' + item["date_href"]
        item["detalle1"] = ""
        item["detalle2"] = ""
        yield scrapy.Request(url_nuevo , body = "", method = 'GET', headers={"content-type":"application/x-www-form-urlencoded"}, dont_filter = True, callback = lambda r : self.parse_3(r, item) )
	def parse_links(self, response):
		try:
			response.css
		except:
			response = HtmlResponse(url=response.url,body=response.body)
		fecha = limpiar_autor_tc(response.css(settings[self.name]['fecha']).extract()[0].split('|')[1])
		current_date = True
		if(len(fecha)>10):
			current_date = obtener_fecha_tipo6(fecha.split(" ")[0])
		if(current_date):
			titulo = limpiar_autor_tc(response.css(settings[self.name]['titulo']).extract()[0])
			body = limpiar_ult_n(response.css(settings[self.name]['body']).extract())
			yield {
			'titulo': titulo,
			'autor': response.css(settings[self.name]['autor']).extract()[0],
			'fecha': fecha,
			'body': [body],
			'link': response.url,
			}
Example #28
    def parse_page(self, response):
        #print "test point"
        response = HtmlResponse(url=response.url, status=response.status, headers=response.headers, body=response.body)
        url = response.url
        name = response.xpath('//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
        context_list = response.xpath('//div[@class="zm-editable-content"]/text()').extract()
        print name[0]
        for context in context_list:
            print context

        answer_num = response.xpath('//h3/@data-num').extract()
        if len(answer_num) == 0:
            print 1
        else:
            print answer_num[0]

        author_list = response.xpath('//*[@class="author-link"]/text()').extract()
        for author in author_list:
            print author
Example #29
    def test_parse_drug_details_or_overview_generates_new_request_if_redirected_to_search_page(self):
        url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Search_Drug_Name'
        meta = {
            'original_url': 'http://www.accessdata.fda.gov/somewhere.cfm',
            'original_cookies': {
                'foo': 'bar',
            },
        }
        mock_response = HtmlResponse(url=url)
        mock_response.request = Request(url, meta=meta)

        with mock.patch('random.random', return_value='random_cookiejar'):
            spider = Spider()
            request = spider.parse_drug_details_or_overview(mock_response)

        assert request.url == meta['original_url']
        assert request.cookies == meta['original_cookies']
        assert request.dont_filter
        assert request.callback == spider.parse_drug_details_or_overview
        assert request.meta['cookiejar'] == 'random_cookiejar'
    def test_start_url_matcher(self):
        url = 'http://example.org'
        spider = self.spider_factory(start_urls=[url])

        response = HtmlResponse(url)

        rule = spider._rulesman.get_rule_from_response(response)
        self.failUnless(isinstance(rule.matcher, UrlListMatcher))

        response = HtmlResponse(url + '/item.html')

        rule = spider._rulesman.get_rule_from_response(response)
        self.failUnless(rule is None)

        # TODO: remove this block
        # in previous version get_rule returns rule from response.request
        response.request = Request(url)
        rule = spider._rulesman.get_rule_from_response(response.request)
        self.failUnless(isinstance(rule.matcher, UrlListMatcher))
        self.failUnlessEqual(rule.follow, True)
Example #31
def load_html():
    file = codecs.open("test/resources/covid_stub.html", 'r')
    response = HtmlResponse(url="my HTML string",
                            body=file.read(),
                            encoding='utf-8')
    return response
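A hedged sketch of how a test might consume the fixture above; it only assumes that the stub file parses as HTML and that HtmlResponse is imported as in the snippet:

def test_load_html_parses_as_html():
    response = load_html()
    # the canned response behaves like any other HtmlResponse
    assert isinstance(response, HtmlResponse)
    assert response.xpath('//body')  # an empty list would mean the stub has no <body>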
Example #32
    def parse1(self, response):
        rs = response.xpath('//div[@class="left"]//div[@class="sons"]')

        title = rs.xpath('//div[@class="cont"]//h1/text()').extract_first()
        print(title)
        quanwen = rs.xpath('//div[@class="cont"]//div[@class="contson"]')
        quanwen1 = quanwen[0].xpath('./text()').extract()
        print(quanwen1)

        qw = ""
        for item in quanwen1:
            if len(item) < 5:
                continue
            qw += item.strip()
            qw += '\r\n'

        if len(qw) < 12:
            quanwen = response.xpath('//div[@class="main3"]/div[@class="left"]/div[@class="sons"]/div[@class="cont"]/div[@class="contson"]').extract()[0]
            temp = re.sub(r'<p>', '', quanwen)
            temp = re.sub(r'</p>', '', temp)
            temp = re.sub(r'</div>', '', temp)
            temp = re.sub(r'(<div\s+class=\s*\".*?\">)', '', temp)
            qw1 = re.split('\<br>|\n', temp)
            for item in qw1:
                if len(item) < 5:
                    continue
                qw += item.strip()
                qw += '\r\n'
            # quanwen2 = quanwen[0].xpath('/p')
            # quanwen1 = quanwen2.xpath('./text()').extract()
            #quanwen1 = quanwen[0].xpath('/p/./text()').extract()
            # for item in quanwen1:
            #     qw += item.strip()
            #qw = '\r\n'.join(temp)
        print(qw)
        cd_zz = rs.xpath('//div[@class="cont"]//p//a/text()').extract()
        cd = str(cd_zz[0])
        zz = str(cd_zz[1])

        rs_all  = rs.xpath('//div[@class="contyishang"]')
        rs_h = rs_all.css('a::attr(href)')
        yw=""
        yuny=""
        zy=""
        yiny=""
        zs=""

        byw=True

        for href in rs_h:
            hreftxt = href.extract()
            if ("javascript:PlayFanyi" in hreftxt) and byw:
                print(hreftxt)
                byw=False
                yw_num = re.sub("\D", "", hreftxt)
                print(str(yw_num))
                yw_url = "https://so.gushiwen.org/fanyi_" + str(yw_num) +".aspx"
                html_requests = requests.get(yw_url).text.encode('utf-8')
                html_response = HtmlResponse(url=yw_url, body=html_requests, headers={'Connection': 'close'})
                rs = html_response.xpath(
                    '//div[@class="main3"]//div[@class="left"]//div[@class="contyishang"]/p[not(@style or contains(text(),"参考资料:"))]').extract()
                for temp1 in rs:
                    temp = re.sub(r'<p>', '', temp1)
                    temp = re.sub(r'</p>', '', temp)
                    temp = re.sub(r'<strong>', '', temp)
                    temp = re.sub(r'</strong>', '', temp)
                    temp = re.sub(r'<a>', '', temp)
                    temp = re.sub(r'</a>', '', temp)
                    temp = re.sub(r'\u3000', '', temp)
                    temp = re.sub(r'(<a\s+href=\s*\".*?\">)', '', temp)
                    yw1 = re.split('\<br>', temp)
                    if yw1[0] == "译文":
                        del yw1[0]
                        yw2 = '\r\n'.join(yw1)
                        yw +=yw2
                    if yw1[0] == "韵译":
                        del yw1[0]
                        yw2 = '\r\n'.join(yw1)
                        yuny +=yw2
                    if yw1[0] == "直译":
                        del yw1[0]
                        yw2 = '\r\n'.join(yw1)
                        zy +=yw2
                    if yw1[0] == "音译":
                        del yw1[0]
                        yw2 = '\r\n'.join(yw1)
                        yiny +=yw2
                    if yw1[0] == "注释":
                        del yw1[0]
                        yw2 = '\r\n'.join(yw1)
                        zs +=yw2

        c1 = self.conn.cursor()
        c1.execute("INSERT INTO guwen VALUES (?,?,?,?,?,?,?,?,?)",(title,zz,cd,qw,yw,yuny,zy,yiny,zs))

        self.conn.commit()
# end of the famous-quotes section
Example #33
    def parse_product_reviews(self, response):

        for line in response.body.split('\n'):
            if line.startswith('var materials='):
                body = line.lstrip('var materials=').rstrip(',')
                break

        try:
            body = eval(body)
        except:
            logging.error('Failed to parse: ' + repr(response.body))
            body = ''

        # Emulate "normal" HTML response
        if body:
            body = ('<html><body>' + '%s' + '</body></html>') % (
                body['BVRRSourceID'].replace('\\/', '/'))

        response2 = HtmlResponse(url=response.url, body=body)
        response2.request = response.request

        hxs = HtmlXPathSelector(response2) if body else None
        base_url = self.get_base_url(response)
        product = response.meta['product']
        product['metadata'].setdefault('reviews', [])

        box_spec = self.PRODUCT_REVIEW_BOX or {}

        review_hxs = xpath_select(
            hxs, box_spec.get('xpath')
        ) if 'xpath' in box_spec and box_spec.get('xpath') != "." else hxs

        for review_box in review_hxs:
            loader = ReviewLoader(item=Review(),
                                  selector=hxs,
                                  date_format=self.PRODUCT_REVIEW_DATE_FORMAT)
            loader.add_value('url', urlparse.urljoin(base_url, response.url))
            # review full text
            full_text_specs = box_spec.get('full_text', []) if hasattr(
                box_spec.get('full_text', []),
                'append') else [box_spec['full_text']]
            full_text_parts = []
            for xpath in full_text_specs:
                items = xpath_select(
                    review_box,
                    xpath).extract() if not callable(xpath) else [xpath(hxs)]
                if any(items):
                    item_text = self.REVIEW_TEXT_JOIN.join([
                        e.replace(u'\xa0', u' ').strip(self.REVIEW_TEXT_STRIP)
                        for e in items
                    ])
                    full_text_parts.append(item_text)

            review_text = self.REVIEW_PARAGRAPH_JOIN.join(full_text_parts)
            loader.add_value('full_text', review_text)

            if box_spec.get('date'):
                date = review_box.select(
                    box_spec.get('date')).extract() if not callable(
                        box_spec.get('date')) else [
                            box_spec['date'](review_box)
                        ]
                loader.add_value('date', date[0] if date else None)

            if box_spec.get('rating'):
                rating_text = review_box.select(
                    box_spec.get('rating')).extract() if not callable(
                        box_spec.get('rating')) else [
                            box_spec['rating'](review_box)
                        ]
                loader.add_value('rating',
                                 rating_text[0] if rating_text else None)

            review = loader.load_item()
            if review.get('full_text') or review.get('date'):
                product['metadata']['reviews'].append(review)

        next_page = xpath_select(hxs, box_spec.get('next_url')).extract() if (
            box_spec.get('next_url') and not callable(box_spec['next_url'])
        ) else [box_spec['next_url'](response, hxs)] if callable(
            box_spec.get('next_url')) else None
        next_page_url = urlparse.urljoin(
            base_url, next_page[0]) if any(next_page) else None

        if not next_page_url or next_page_url in self.visited_urls or not review_hxs:
            yield self.clean_product(product)
        else:
            self.visited_urls.add(next_page_url)
            yield Request(url=next_page_url,
                          meta=dict(**response.meta),
                          callback=self.parse_product_reviews)
Example #34
 def parse(self, response: HtmlResponse):
     next_page = response.xpath('//a[@class="catalog-pagination__item _text js-pagination-catalog-item"]/@href').extract_first()
     links = response.xpath('//a[@class="book__image-link js-item-element ddl_product_link"]/@href').extract()
     for link in links:
         yield response.follow(link, callback=self.handle_book_data)
     yield response.follow(next_page, callback=self.parse)
Example #35
def reparse_by_id(session, id):
    url = session.query(Article.url).filter(Article.id == id).first()[0]
    response = requests.get(url)
    response = HtmlResponse(url=url, body=response.content)
    WikiResponseProcessor.DBResponseProcessor().process(response, id_to_update=id)
Example #36
    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

    # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)
Example #37
 def process_request(self,request,spider):
     dr=webdriver.PhantomJS()
     dr.get(request.url)
     time.sleep(2)
     body=dr.page_source
     return HtmlResponse(dr.current_url,body=body.replace(u'\xa9',u''),encoding='utf-8',request=request)
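A downloader middleware like the one above (and the similar one in Example #43) only takes effect once it is enabled in the project settings; the dotted module path below is a placeholder, not from the original project:

# settings.py sketch -- the module path is hypothetical
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.PhantomJSMiddleware': 543,
}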
Example #38
    <a href='image00.html'>Name: Image 00 <br/><img src='image00.jpg' /></a>
    <a href='image01.html'>Name: Image 01 <br/><img src='image01.jpg' /></a>
    <a href='image02.html'>Name: Image 02 <br/><img src='image02.jpg' /></a>
    <a href='image03.html'>Name: Image 03 <br/><img src='image03.jpg' /></a>
    <a href='image04.html'>Name: Image 04 <br/><img src='image04.jpg' /></a>


    </div>

    </body>
</html>

'''

response = HtmlResponse(url='http://www.example.com',
                        body=body,
                        encoding='utf-8')

print(response.xpath('/html'))
print(response.xpath('/html/head'))
print(response.xpath('/html/body/div/a'))

print('\n//E : select every E in the document')
for a in response.xpath('//a'):
    print(a)

print('E1//E2 : select every E2 among the descendants of E1, wherever it appears')
for a in response.xpath('/html/body//img'):
    print(a)

print('E/text() : select the text children of E')
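# the listing appears to break off here; a loop matching the pattern above would be
for t in response.xpath('/html/body/div/a/text()'):
    print(t)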
 def vacancy_parce(self, response: HtmlResponse):
     name1 = response.css("div.vacancy-title h1::text").extract_first()
     salary1 = response.xpath("//span[@class='bloko-header-2 bloko-header-2_lite']/text()").extract()
     link1 = response.url
     src = "hh.ru"
     yield JobparserItem(name=name1, salary=salary1, link=link1, src=src)
Example #40
'''

"""
A Scrapy Selector is an instance of the Selector class, constructed by passing text or a TextResponse object.
It automatically picks the best parsing rules (XML vs HTML) based on the input type.
"""

from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# construct a Selector from text
body = '<html><body><span>good</span></body></html>'
print(Selector(text=body).xpath('//span/text()').extract())

# construct a Selector from a response
response = HtmlResponse(url='https://sebastianraschka.com/blog/index.html', body=body, encoding='utf-8')
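# note: HtmlResponse never fetches the URL -- the body here is still the tiny <span>
# document above, so the 'post-title' queries below return an empty list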
print(Selector(response=response).xpath('//*/h1[@class="post-title"]/text()').extract())
# the line above is equivalent to the line below
print(response.selector.xpath('//*/h1[@class="post-title"]/text()').extract())

response = r"""
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
Example #41
 def user_parse(self, response: HtmlResponse):
     j_body = json.loads(response.text)
     if j_body['authenticated']:
         yield response.follow(f'/{self.parse_user}',
                               callback=self.user_data_parse,
                               cb_kwargs={'username': self.parse_user})
Example #42
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

body = '<html><body><span>good</span></body></html>'
p = Selector(text=body).xpath('//span/text()').extract()

print(p)

response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')
print(Selector(response=response).xpath('//span/text()').extract())
Example #43
 def process_request(self, request, spider):
     driver = webdriver.PhantomJS()
     driver.get(request.url)
     return HtmlResponse(request.url,
                         encoding='utf-8',
                         body=driver.page_source.encode('utf-8'))
Example #44
 def parse_multiple_via_pages(self, response):
     response = HtmlResponse(url=self.shops_root_url, body=response.body)
     le = LinkExtractor(
         allow=[r"%s" % regex for regex in self.via_page_url_regex])
     for link in le.extract_links(response):
         yield scrapy.Request(url=link.url, callback=self.parse_single_page)
 def setUp(self):
     body = get_testdata('link_extractor', 'sgml_linkextractor.html')
     self.response = HtmlResponse(url='http://example.com/index', body=body)
 def test_priority_adjust(self):
     req = Request('http://a.com')
     rsp = HtmlResponse(req.url, body=self._body())
     req2 = self.mw.process_response(req, rsp, self.spider)
     assert req2.priority > req.priority
 def test_meta_refresh(self):
     req = Request(url='http://example.org')
     rsp = HtmlResponse(req.url, body=self._body())
     req2 = self.mw.process_response(req, rsp, self.spider)
     assert isinstance(req2, Request)
     self.assertEqual(req2.url, 'http://example.org/newpage')
Example #48
def crawl_product_id():
    product_id_list = []
    i = 1
    while (i < 3):
        driver = webdriver.Chrome("C:/bin/chromedriver.exe",
                                  chrome_options=options)
        driver.get(laptop_page_url.format(i))
        if "https://shopee.vn/Laptop-cat.13030.13065" in laptop_page_url.format(
                i):
            y = 2300
            x = 1
            while y <= 4800:
                driver.execute_script("window.scrollTo(0, " + str(y) + ")")
                y += 1000
                # print("aaaaaaaaaaa")
                # try:
                #     print("bbbbbbb" ,WebDriverWait(driver, 1).until(EC.presence_of_element_located(
                #         (By.XPATH, '//*[@class="row shopee-search-item-result__items"]/div[{}]/div/a/div/div[2]/div[1]/div'.format({x})))))
                #     print("Page is ready!")
                # except TimeoutException:
                #     print("cccccccc")
                #     print("Loading took too much time!")
                x += 10
            body = driver.page_source
            abc = driver.current_url
            response = HtmlResponse(abc, body=body, encoding='utf8')
            print(body)
            if (response == None):
                break

            for product in response.css(
                    "div.col-xs-2-4.shopee-search-item-result__item"):
                try:
                    url = product.css("div a::attr(href)").get()
                    print("link ok: ", url)

                    product_key = url.rsplit("-i.", 1)[1]
                    # product_id_dict = {"shop_id": product_key[0], "item_id": product_key[1]}
                    # shop_id = product_key[0]
                    # item_id = product_key[1]
                    # parser = BeautifulSoup(body, 'html.parser')
                    # product_box = parser.findAll(class_="col-xs-2-4 shopee-search-item-result__item", )
                    # if (len(product_box) == 0):
                    #     break
                    # print(product_box[0])
                    # for product in product_box:
                    #     # href = product.get("href").rsplit("-i.", 1)[1]
                    #     # product_id = href.split(".html")[0]
                    #     product_id = product.get("div a::attr(href)")
                    #     # product_id = product.css("div a::attr(href)").get()
                    #     # product_id = product.get("href")
                    product_id_list.append(product_key)
                except:
                    print("no!")
        driver.close()
        print("Crawl page: ", i)
        print(product_id_list)
        # response = requests.get(laptop_page_url.format(i), params=params, headers=headers)
        # parser = BeautifulSoup(response.text, 'html.parser')
        # # print(response.content)
        # product_box = parser.findAll('a', class_="col-xs-2-4 shopee-search-item-result__item")
        #
        # if (len(product_box) == 0):
        #     break
        #
        # for product in product_box:
        #     href = product.get("href")
        #     print(href)

        i += 1

    return product_id_list, i
Example #49
def crawllianjie(doc = driver.page_source):
    response1 = HtmlResponse(url="my HTML string", body=doc, encoding="utf-8")

    A = response1.xpath("//dl[@class='list-noimg job-j-list clearfix job-new-list']")
    
    for B in A:
        ID = B.xpath('@pt').extract()[0]
        
        href = B.xpath('a/@href').extract()[0]
        href = 'http://gz.ganji.com' + href
        name = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[@class="name"]/text()').extract()[0]
        sex = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[2]/text()').extract()[0]
        age = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[3]/text()').extract()[0]
        xueli = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[4]/text()').extract()
        if(xueli):
            xueli = xueli[0]
        else:
            xueli = 'none'
        jingli = B.xpath('a/dt[@class = "fl per-info"]/div/div[@class="basic-info"]/span[5]/text()').extract()
        if(jingli):
            jingli = jingli[0]
        else:
            jingli = 'none'
        
        address = B.xpath('a/div[@class="fl district-salary"]/p[@class="district"]/text()').extract()[0]
        salary = B.xpath('a/div[@class="fl district-salary"]/p[@class="salary"]/text()').extract()[0]
        address = address.strip()
        salary = salary.strip()
        time1 = B.xpath('a/div[@class="order fr"]/text()').extract()[0]
        time1 = time1.strip()
        
        
        time.sleep(7)
        res = requests.get(href, headers=head, cookies = Cookie)
        html = res.text
        
        doc = etree.HTML(str(html))
        try:
            A1 = doc.xpath('//div[@class="tend-line clearfix"]/b/a')
            qiuzhi = []
            for B1 in A1:
                i = B1.xpath('text()')
                if(i):
                    i = i[0]
                else:
                    i = 'none'
                
                qiuzhi.append(i)
            qiuzhi = ','.join(qiuzhi)
        except:
            qiuzhi = 'none'
        
        try:
            work_exp1 = doc.xpath('//div[@class="experience-block"]/p/text()')
            exp = []
            if(work_exp1):
                work_exp1 = work_exp1[0]
                exp.append(work_exp1)
                didian = doc.xpath('//div[@class="experience-block"]/b')
                content = doc.xpath('//div[@class="experience-block"]/ul')
                for m,n in zip(didian,content):
                    company = m.xpath('text()')[0]
                    time2 = n.xpath('li[1]/p/text()')[0]
                    time3 = n.xpath('li[1]/p/i/text()')[0]
                    
                    zhiwei = n.xpath('li[2]/p/text()')[0]
                    work_content = n.xpath('li[3]/p/text()')
                    if(work_content):
                        work_content = work_content[0]
                    else:
                        work_content = 'none'
                    he = ','.join([company,time2,time3,zhiwei,work_content])
                    exp.append(he)
                exp = ','.join(exp)
            else:
                exp = 'none'
        except:
            exp = 'none'
        #print(exp)
        
        try:
            edu = doc.xpath('//div[@class="education-block"]/table/tbody/tr')
            if(edu):
                edu1 = []
                for a in edu:
                    i1 = a.xpath('td[1]/text()')[0]
                    i2 = a.xpath('td[2]/text()')
                    if(i2):
                        i2 = i2[0]
                    else:
                        i2 = 'none'
                    i3 = a.xpath('td[3]/text()')
                    if(i3):
                        i3 = i3[0]
                    else:
                        i3 = 'none'
                    i4 = ','.join([i1,i2,i3])
                    edu1.append(i4)
                edu1 = ','.join(edu1)
            else:
                edu1 = 'none'
        except:
            edu1 = 'none'
            
        
        try:    
            zhengshu = doc.xpath('//div[@class="project-block"]/table/tbody/tr')
            if(zhengshu):
                zhengshu1 = []
                for a in zhengshu:
                    i1 = a.xpath('td[1]/text()')[0]
                    i2 = a.xpath('td[2]/text()')
                    if(i2):
                        i2 = i2[0]
                    else:
                        i2 = 'none'
                    i3 = a.xpath('td[3]/text()')
                    if(i3):
                        i3 = i3[0]
                    else:
                        i3 = 'none'
                    i4 = ','.join([i1,i2,i3])
                    zhengshu1.append(i4)
                zhengshu1 = ','.join(zhengshu1)
            else:
                zhengshu1 = 'none'
        except:
            zhengshu1 = 'none'
            
        self1 = doc.xpath('//div[@class="self-block"]/div/text()')
        if(self1):
            self1 = self1[0]
        else:
            self1 = 'none'
            
        print(name,time1,qiuzhi)
        #print(ID,name,sex,age,xueli,jingli,address,salary,time1, href,
          #    exp, edu1, zhengshu1, self1)
        
        cursor = connect.cursor()

       
        sql = "INSERT IGNORE INTO ganji(ID,name,sex,age,xueli,jingli,address,salary,time1, href, exp, edu1, zhengshu1, self1,qiuzhi) VALUES ( '%s', '%s', '%s', '%s', '%s','%s','%s', '%s', '%s', '%s', '%s','%s','%s', '%s','%s'  )"
        data = (ID,name,sex,age,xueli,jingli,address,salary,time1, href,
              exp, edu1, zhengshu1, self1, qiuzhi)
        try:
            cursor.execute(sql % data)
        except:
            print(ID)

        connect.commit()
    
        
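For context, the `connect` object and the `head`/`Cookie` globals used above are defined earlier in the spider. A minimal, hypothetical sketch of the database side (pymysql is an assumption, and every connection parameter below is a placeholder, not from the original source):

import pymysql

# hypothetical connection setup; host/user/password/db are placeholders
connect = pymysql.connect(host='localhost', user='root', password='******',
                          db='resume', charset='utf8mb4')

cursor = connect.cursor()
# %s placeholders are filled in by the driver, so values containing quotes are handled safely
cursor.execute("INSERT IGNORE INTO ganji(ID, name) VALUES (%s, %s)", ('1', 'example'))
connect.commit()
cursor.close()
connect.close()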
Example #50
0
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html" class='ding'>first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html" class='ding'>first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
        <div><a href="llink2.html">10</a></div>
    </body>
</html>
"""
# build the Response object
response = HtmlResponse(url='', body=html, encoding='utf-8')
selector = Selector(response=response)
# get every <a> tag
temp = selector.xpath('//a')
# get the first <body> node and search for <ul> starting from it; ./ul selects children relative to that node
temp = selector.xpath('body')[0].xpath('.//ul')
print(temp)
exit()
# get <ul> as a direct child of <body>
temp = selector.xpath('body/ul')
# get every <li> descendant of <body>
temp = selector.xpath('body//li')
# returns [] because <li> is not a direct child of <body>
temp = selector.xpath('body/li')
# get the parent of <body>
temp = selector.xpath('body')[0].xpath('..')
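The statements above assign Selector/SelectorList objects; to turn the matched nodes into plain strings you would normally finish with extract()/extract_first(), roughly like this (continuing with the same selector):

# text of every <a> tag, e.g. ['first item', 'first item', 'second item', ...]
texts = selector.xpath('//a/text()').extract()
# href attribute of the first matching <a>, e.g. 'link.html'
first_href = selector.xpath('//a/@href').extract_first()
print(texts, first_href)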
Example #51
0
from scrapy.selector import Selector
from scrapy.http import HtmlResponse


# construct from text
body = '<html><body><span>good</span></body></html>'
print(Selector(text=body).xpath('//span/text()').extract())

# construct from response (note: as written this response carries no body, so the
# selector has nothing to parse; in practice the response comes from a real download)
response = HtmlResponse(url='http://doc.scrapy.org/en/latest/_static/selectors-sample1.html')
print(Selector(response=response).xpath('//title/text()').extract())
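A variant of the response-based construction that runs offline: give the HtmlResponse an inline body so the selector has markup to work with (the URL is only metadata here, and the title text is made up):

from scrapy.http import HtmlResponse
from scrapy.selector import Selector

response = HtmlResponse(
    url='http://doc.scrapy.org/en/latest/_static/selectors-sample1.html',
    body=b'<html><head><title>Sample page</title></head><body></body></html>')
print(Selector(response=response).xpath('//title/text()').extract())  # [u'Sample page']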
Example #52
0
    def test_generic_form_requests_with_file_field(self):
        name = "ebay2"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        self.assertEqual(generic_form_request.url,
                         'file://tmp/test_params.txt')
        response = HtmlResponse(url='file://tmp/test_params.txt',
                                body=open(
                                    join(_PATH, "data",
                                         "test_params.txt")).read())
        response.request = generic_form_request
        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        expected = [{
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {
                u'xpath':
                u"//form[@name='adv_search_from']",
                u'form_url':
                u'http://*****:*****',
                u'fields': [{
                    u'xpath': u".//*[@name='_nkw']",
                    'file_values': ['Cars', 'Boats'],
                    u'type': u'inurl',
                    u'value': u'file://tmp/test_params.txt'
                }, {
                    u'type': u'inurl',
                    u'name': u'_nkw2',
                    u'value': u'file://tmp/test_params.txt'
                }, {
                    u'xpath': u".//*[@name='_in_kw']",
                    u'type': u'iterate'
                }]
            },
            'headers': {},
            'url': u'file://tmp/test_params.txt',
            'dont_filter': True,
            'priority': 0,
            'callback': 'parse_field_url_page',
            'method': 'GET',
            'errback': None
        }]
        self.assertEqual(request_list, expected)

        generic_form_request = requests[0]
        self.assertEqual(generic_form_request.url,
                         'file://tmp/test_params.txt')
        response = HtmlResponse(url='file://tmp/test_params.txt',
                                body=open(
                                    join(_PATH, "data",
                                         "test_params.txt")).read())
        response.request = generic_form_request

        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        expected = [{
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {
                u'xpath':
                u"//form[@name='adv_search_from']",
                u'fields': [{
                    u'xpath': u".//*[@name='_nkw']",
                    'file_values': ['Cars', 'Boats'],
                    u'type': u'inurl',
                    u'value': u'file://tmp/test_params.txt'
                }, {
                    'file_values': ['Cars', 'Boats'],
                    u'type': u'inurl',
                    u'name': u'_nkw2',
                    u'value': u'file://tmp/test_params.txt'
                }, {
                    u'xpath': u".//*[@name='_in_kw']",
                    u'type': u'iterate'
                }],
                u'type':
                u'form',
                'field_index':
                1
            },
            'headers': {},
            'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
            'dont_filter': True,
            'priority': 0,
            'callback': 'parse_form_page',
            'method': 'GET',
            'errback': None
        }]
        self.assertEqual(request_list, expected)

        generic_form_request = requests[0]
        self.assertEqual(
            generic_form_request.url,
            'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc')
        response = HtmlResponse(
            url="http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
            body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
        response.request = generic_form_request
        request_list = [
            request_to_dict(req, spider)
            for req in generic_form_request.callback(response)
        ]
        expected = [{
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
            'dont_filter': True,
            'priority': 0,
            'callback': 'parse',
            'method': 'GET',
            'errback': None
        }]
        self.assertEqual(request_list, expected)
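The test above relies on a common offline-testing pattern: load a saved HTML fixture into an HtmlResponse, attach the request that would have produced it, and call the spider callback directly so no network access is needed. A stripped-down sketch of that pattern (the helper name and paths are illustrative, not from the original suite):

from scrapy.http import HtmlResponse, Request

def fake_response_from_file(file_path, url):
    # wrap a saved HTML fixture in a response object so a callback
    # can be unit-tested without any network access
    with open(file_path, 'rb') as f:
        body = f.read()
    request = Request(url=url)
    return HtmlResponse(url=url, body=body, request=request)

# illustrative usage:
# response = fake_response_from_file('data/ebay_advanced_search.html',
#                                    'http://www.ebay.com/sch/ebayadvsearch/?rt=nc')
# requests = list(spider.parse_form_page(response))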
Example #53
0
    def process_request(self, request, spider):
        # 0_imo_org: page content is rendered dynamically with JS
        if (spider.name == "0_imo_org"
                and str(request.url).count("osssearchresults")):
            print("PhantomJS is starting...")
            driver = webdriver.PhantomJS(
                executable_path=settings['JS_BIN'])  # which browser/driver binary to use
            # driver = webdriver.Firefox()
            driver.get(request.url)
            body = driver.page_source
            return HtmlResponse(driver.current_url,
                                body=body,
                                encoding='utf-8',
                                request=request)

        # 0_worldcargonews: search results are paginated dynamically with JS
        if (spider.name == "0_worldcargonews"
                and (str(request.url).count("search"))):
            print("PhantomJS is starting...")
            driver = webdriver.PhantomJS(
                executable_path=settings['JS_BIN'])  # which browser/driver binary to use
            #driver = webdriver.Firefox()
            driver.get(request.url)
            time.sleep(5)
            # print(driver.page_source)
            # pagination (the spider runs every day, so there is no need to load further pages)
            worldcargonews_look_more = '//div[@class="aoci aos-searchc"]/button'
            for i in range(1, 1):
                try:
                    driver.find_element_by_xpath(
                        worldcargonews_look_more).click()  # the data is loaded by JS after the click
                    time.sleep(2)
                    print("more page")
                except:
                    print("get news data failed")
            body = driver.page_source
            print("final page")
            #print(driver.page_source)
            driver.close()
            return HtmlResponse(driver.current_url,
                                body=body,
                                encoding='utf-8',
                                request=request)

        # 2_Brunei_jpm: page content is rendered dynamically with JS
        if (spider.name == "2_Brunei_jpm"):
            print("PhantomJS is starting...")
            driver = webdriver.PhantomJS(
                executable_path=settings['JS_BIN'])  # which browser/driver binary to use
            # driver = webdriver.Firefox()
            driver.get(request.url)
            body = driver.page_source
            return HtmlResponse(driver.current_url,
                                body=body,
                                encoding='utf-8',
                                request=request)

        # 6_Malaysia_miti: search results are paginated dynamically with JS
        if (spider.name == "6_Malaysia_miti"
                and str(request.url).count("search")):
            print("PhantomJS is starting...")
            driver = webdriver.PhantomJS(
                executable_path=settings['JS_BIN'])  # which browser/driver binary to use
            #driver = webdriver.Firefox()
            driver.get(request.url)
            time.sleep(1)
            # pagination (note: range(1, 1) below is empty, so no extra pages are loaded)
            Malaysia_miti_look_more = '//*[@id="more"]'
            if (str(request.url).count("search")):
                for i in range(1, 1):
                    try:
                        driver.find_element_by_xpath(
                            Malaysia_miti_look_more).click(
                            )  # the data is loaded by JS after the click
                        time.sleep(2)
                        #true_page = driver.page_source
                        print("more page")
                    except:
                        print("get news data failed")
            body = driver.page_source
            print("final page")
            #print(driver.page_source)
            driver.close()
            return HtmlResponse(driver.current_url,
                                body=body,
                                encoding='utf-8',
                                request=request)

        # 10_Thailand_thaigov: switch the site language first
        if (spider.name == "10_Thailand_thaigov"):
            driver = webdriver.PhantomJS(
                executable_path=settings['JS_BIN'])  # which browser/driver binary to use
            # driver = webdriver.Firefox()
            driver.get(request.url)
            time.sleep(5)
            # switch the language
            change_lang = driver.find_element_by_xpath(
                '//*[@id="destop"]/div[@class="col-sm-8 col-md-8 remove-xs"]'
                '/div[@class="col-xs-6 col-md-2 remove-xs"]/a[2]')  # .click()
            driver.execute_script("arguments[0].click();", change_lang)
            #time.sleep(20)
            WebDriverWait(driver, 30).until(lambda x: x.find_element_by_xpath(
                "//*[contains(text(),'Change')]"))
            if ((str(request.body).count("Thursday 01 January 1970"))):
                return
            body = driver.page_source
            driver.close()
            return HtmlResponse(driver.current_url,
                                body=body,
                                encoding='utf-8',
                                request=request)

        # the default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape
        # for more user agent strings, see http://www.useragentstring.com/pages/useragentstring.php
        # (note: the original list was missing a comma after the first string, which silently
        # merged the first two user agents into one literal)
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        # pages that do not need dynamic rendering: just rotate the User-Agent
        ua = random.choice(user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)
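The middleware above starts a fresh PhantomJS instance for every matching request and only closes it where the close happens to be reachable. A leaner pattern, shown below as a sketch rather than the original code (the spider name, URL fragment and JS_BIN setting are carried over as assumptions), keeps one driver per middleware and shuts it down when the spider closes:

import time

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumRenderMiddleware(object):
    """Render JS-heavy pages with a single shared PhantomJS instance."""

    def __init__(self, js_bin):
        self.driver = webdriver.PhantomJS(executable_path=js_bin)

    @classmethod
    def from_crawler(cls, crawler):
        mw = cls(js_bin=crawler.settings.get('JS_BIN'))
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def spider_closed(self, spider):
        self.driver.quit()

    def process_request(self, request, spider):
        # only render the spiders/URLs that actually need JS
        if spider.name == "0_imo_org" and "osssearchresults" in request.url:
            self.driver.get(request.url)
            time.sleep(5)
            return HtmlResponse(self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8',
                                request=request)
        return None  # fall through to the default downloader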
Example #54
0
    def test_generic_form_requests_with_spider_args(self):
        name = "ebay3"
        args = {'search_string': 'Cars'}
        spider = self.smanager.create(name, **args)
        generic_form_request = list(spider.start_requests())[0]

        response = HtmlResponse(
            url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
            body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
        response.request = generic_form_request
        request_list = [
            request_to_dict(req, spider)
            for req in generic_form_request.callback(response)
        ]
        expected = [{
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url':
            u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
            'dont_filter': True,
            'priority': 0,
            'callback': 'after_form_page',
            'method': 'GET',
            'errback': None
        }, {
            'body': '',
            '_encoding': 'utf-8',
            'cookies': {},
            'meta': {},
            'headers': {},
            'url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
            'dont_filter': True,
            'priority': 0,
            'callback': 'parse',
            'method': 'GET',
            'errback': None
        }]
        self.assertEqual(request_list, expected)
Example #55
0
    def parse_doctor(self, response):
        response_url = response.url
        doctor_id = re.search('doctor/([^\.]*)\.htm', response_url).group(1)

        hxs = Selector(response)

        #parse doctor name
        name_list = hxs.xpath("//input[@name='doctor_name']/@value")
        doctor_name = ''
        if len(name_list) != 0:
            doctor_name = name_list[0].extract()

        #hospital department
        hospital_department_selectors = hxs.xpath("//meta[@name='keywords']/@content")
        hospital = ''
        department = ''
        if len(hospital_department_selectors) != 0:
            hospital_re = r',(?P<hospital>.*?)' + doctor_name
            hospital_match = re.search(hospital_re, hospital_department_selectors[0].extract())
            if hospital_match != None:
                hospital = hospital_match.group('hospital')

            department_re = hospital + r'(?P<department>.*?)' + doctor_name + ','
            department_match = re.search(department_re, hospital_department_selectors[0].extract())
            if department_match != None:
                department = department_match.group('department')

        #title
        title = ''
        title_selectors = hxs.xpath('//meta[@name="description"]/@content')
        if len(title_selectors) != 0:
            title_re_str = doctor_name + r'(?P<doctor_title>.*?)' + u'简介'
            title = re.search(title_re_str, title_selectors[0].extract()).group(1)

        doctor_about_dict = None
        tag_doctor_about_selectors = hxs.xpath('//div[@id="bp_doctor_about"]/div[@class="doctor_about"]')
        if len(tag_doctor_about_selectors) != 0:
            doctor_about_dict = self.parse_doctor_about(tag_doctor_about_selectors)
        else:
            doctor_about_match_list = hxs.xpath(
                '//script[@type="text/javascript"]/text()').re(
                'BigPipe.onPageletArrive\((?P<doctor_about>\{"id":"bp_doctor_about".*\})\);')
            if doctor_about_match_list:
                da_dict = json.loads(doctor_about_match_list[0])
                if 'content' in da_dict:
                    doctor_about_hxs = Selector(HtmlResponse(url=response.url, body=da_dict['content'].encode('utf-8')))
                    doctor_about_dict = self.parse_doctor_about(doctor_about_hxs)


        #schedule
        doctor_schedule = []
        trs = hxs.xpath("//table[@class='doctortimefrom1']/tr")
        day_part = 0
        for itr in trs:
            if 0 != day_part:
                doctor_schedule.extend(self.weekday_operation(itr, day_part))  # morning
            day_part += 1

        # #disease
        # disease_list = list()
        # disease_ht_selector = hxs.xpath('//div[@class="ltdiv"]//table[@class="jbsm"]//td')
        # if len(disease_ht_selector) == 1:
        #     disease_list = self.parse_disease_from_td_selector(disease_ht_selector, doctor_id=doctor_id)
        # else:
        #     disease_match_list = hxs.xpath(
        #         '//script[@type="text/javascript"]/text()').re(
        #         'BigPipe.onPageletArrive\((?P<dict_content>\{"id":"bp_doctor_getvote".*\})\);')

        #     if disease_match_list:
        #         disease_match = disease_match_list[0]
        #         d_dict = json.loads(disease_match)

        #         if 'content' in d_dict:
        #             disease_hxs = Selector(HtmlResponse(url=response.url, body=d_dict['content'].encode('utf-8')))
        #             disease_selector = disease_hxs.xpath('//div[@class="ltdiv"]//table[@class="jbsm"]//td')
        #             if len(disease_selector) == 1:
        #                 disease_list = self.parse_disease_from_td_selector(disease_selector, doctor_id=doctor_id)


        zanwu_re = re.compile(u'暂无')
        empty_sub_re = re.compile(r'(<!--.*?-->|\n|\t|\r|[ ])')

        item = XPathItemLoader(DoctorDetailItem(),hxs)
        item.add_value('doctor_id',doctor_id)
        if doctor_name:
            item.add_value('_name',doctor_name)
        if response.meta['city']:
            item.add_value('city',response.meta['city'])
        if hospital:
            item.add_value('hospital',hospital)
        if department:
            item.add_value('department',department)
        if title:
            item.add_value('title',title)
        if doctor_schedule:
            item.add_value('schedule',doctor_schedule)
        else:
            if len(hxs.xpath('//table[@class="doctortimefrom1"]')) == 0:
                for content in hxs.xpath('//script[@type="text/javascript"]/text()').extract():
                    if content.find('doctortimefrom1') != -1:
                        item.add_value('schedule','') # shouldn't exist in js
                        break


        if doctor_about_dict:
            if 'image_url' in doctor_about_dict:
                item.add_value('image',doctor_about_dict['image_url'])
            if 'bio' in doctor_about_dict:
                bio = doctor_about_dict['bio']
                if zanwu_re.search(bio) != None:
                    bio = ''
                if bio:
                    item.add_value('bio',empty_sub_re.sub('', bio))
            if 'feature' in doctor_about_dict:
                feature = doctor_about_dict['feature']
                if zanwu_re.search(feature) != None:
                    feature = ''
                if feature:
                    item.add_value('feature',empty_sub_re.sub('', feature))

        yield item.load_item()



        url=u'http://www.haodf.com/doctor/'+doctor_id+u'/jingyan/1.htm'

        l = LetterItem()
        l['doctor_id'] = doctor_id
        letter = []

        disease_item = DoctorDiseaseItem()
        disease_item['doctor_id'] = doctor_id

        req=Request(url,callback=self.parse_letter)
        req.meta['item']=l
        req.meta['letter']=letter
        req.meta['disease']=disease_item
        yield req
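One detail worth isolating from the spider above: when the page delivers a pagelet through BigPipe, the HTML fragment arrives inside a JSON blob embedded in a script tag, and the spider re-wraps it as an HtmlResponse so the usual XPath machinery still applies. A minimal sketch of that re-wrapping (the fragment content below is made up):

import json

from scrapy.http import HtmlResponse
from scrapy.selector import Selector

# pretend this was pulled out of a <script> tag with a regex, as in parse_doctor
pagelet_json = '{"id": "bp_doctor_about", "content": "<div class=\\"doctor_about\\"><p>bio text</p></div>"}'

da_dict = json.loads(pagelet_json)
fragment = Selector(HtmlResponse(url='http://www.haodf.com/doctor/example.htm',
                                 body=da_dict['content'].encode('utf-8')))
print(fragment.xpath('//div[@class="doctor_about"]/p/text()').extract())  # ['bio text']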
Example #56
0
class SelectortemLoaderTest(unittest.TestCase):
    response = HtmlResponse(url="", body="""
    <html>
    <body>
    <div id="id">marta</div>
    <p>paragraph</p>
    <a href="http://www.scrapy.org">homepage</a>
    <img src="/images/logo.png" width="244" height="65" alt="Scrapy">
    </body>
    </html>
    """)

    def test_constructor(self):
        l = TestItemLoader()
        self.assertEqual(l.selector, None)

    def test_constructor_errors(self):
        l = TestItemLoader()
        self.assertRaises(RuntimeError, l.add_xpath, 'url', '//a/@href')
        self.assertRaises(RuntimeError, l.replace_xpath, 'url', '//a/@href')
        self.assertRaises(RuntimeError, l.get_xpath, '//a/@href')
        self.assertRaises(RuntimeError, l.add_css, 'name', '#name::text')
        self.assertRaises(RuntimeError, l.replace_css, 'name', '#name::text')
        self.assertRaises(RuntimeError, l.get_css, '#name::text')

    def test_constructor_with_selector(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assert_(l.selector is sel)

        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_selector_css(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assert_(l.selector is sel)

        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)

        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response_css(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)

        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

        l.add_css('url', 'a::attr(href)')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])

        # combining/accumulating CSS selectors and XPath expressions
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta', u'Marta'])

        l.add_xpath('url', '//img/@src')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org', u'/images/logo.png'])

    def test_add_xpath_re(self):
        l = TestItemLoader(response=self.response)
        l.add_xpath('name', '//div/text()', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

    def test_replace_xpath(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath('name', '//p/text()')
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

        l.replace_xpath('name', ['//p/text()', '//div/text()'])
        self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])

    def test_get_xpath(self):
        l = TestItemLoader(response=self.response)
        self.assertEqual(l.get_xpath('//p/text()'), [u'paragraph'])
        self.assertEqual(l.get_xpath('//p/text()', TakeFirst()), u'paragraph')
        self.assertEqual(l.get_xpath('//p/text()', TakeFirst(), re='pa'), u'pa')

        self.assertEqual(l.get_xpath(['//p/text()', '//div/text()']), [u'paragraph', 'marta'])

    def test_replace_xpath_multi_fields(self):
        l = TestItemLoader(response=self.response)
        l.add_xpath(None, '//div/text()', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath(None, '//p/text()', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

    def test_replace_xpath_re(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_xpath('name', '//div/text()', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

    def test_add_css_re(self):
        l = TestItemLoader(response=self.response)
        l.add_css('name', 'div::text', re='ma')
        self.assertEqual(l.get_output_value('name'), [u'Ma'])

        l.add_css('url', 'a::attr(href)', re='http://(.+)')
        self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])

    def test_replace_css(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_css('name', 'p::text')
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

        l.replace_css('name', ['p::text', 'div::text'])
        self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])

        l.add_css('url', 'a::attr(href)', re='http://(.+)')
        self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])
        l.replace_css('url', 'img::attr(src)')
        self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])

    def test_get_css(self):
        l = TestItemLoader(response=self.response)
        self.assertEqual(l.get_css('p::text'), [u'paragraph'])
        self.assertEqual(l.get_css('p::text', TakeFirst()), u'paragraph')
        self.assertEqual(l.get_css('p::text', TakeFirst(), re='pa'), u'pa')

        self.assertEqual(l.get_css(['p::text', 'div::text']), [u'paragraph', 'marta'])
        self.assertEqual(l.get_css(['a::attr(href)', 'img::attr(src)']),
            [u'http://www.scrapy.org', u'/images/logo.png'])

    def test_replace_css_multi_fields(self):
        l = TestItemLoader(response=self.response)
        l.add_css(None, 'div::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Marta'])
        l.replace_css(None, 'p::text', TakeFirst(), lambda x: {'name': x})
        self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

        l.add_css(None, 'a::attr(href)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        l.replace_css(None, 'img::attr(src)', TakeFirst(), lambda x: {'url': x})
        self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])

    def test_replace_css_re(self):
        l = TestItemLoader(response=self.response)
        self.assert_(l.selector)
        l.add_css('url', 'a::attr(href)')
        self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
        l.replace_css('url', 'a::attr(href)', re='http://www\.(.+)')
        self.assertEqual(l.get_output_value('url'), [u'scrapy.org'])
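The tests above refer to a TestItemLoader defined elsewhere in the test module; judging from the expected outputs ('marta' becoming 'Marta'), it is roughly an ItemLoader whose name field is title-cased on input. A hedged reconstruction, only for reading the tests in isolation:

from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose


class NameUrlItem(Item):
    name = Field()
    url = Field()


class TestItemLoader(ItemLoader):
    default_item_class = NameUrlItem
    # title-case incoming name values: 'marta' -> 'Marta', 'paragraph' -> 'Paragraph'
    name_in = MapCompose(lambda v: v.title())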
Example #57
0
    def getScrapyResponse(self, url):
        response = self.downloadUsingSelenium(url)
        response = HtmlResponse(url=url, body=response, encoding='utf-8')
        return response
Example #58
0
    def process_request(self, request, spider):

        if spider.USE_SELENIUM:
            url = request.url
            self.driver.get(url)
            return HtmlResponse(url, body=self.driver.page_source, encoding='utf-8')

    def process_request(self, request, spider):
        '''
        :param request: the request being processed
        :param spider: the spider that issued it
        :return:
        '''
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run Chrome in headless mode
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        # decide which spider this request belongs to
        if spider.name == 'scjrm_zszq':
            # check whether this is the login request
            # if request.url == "http://www.scjrm.com/site/login.html":
            print("<<<<<<<" + request.url)
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
            spider.driver.get("http://www.scjrm.com/site/login.html")
            # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
            time.sleep(2)
            # fill in the account name and password
            username = spider.driver.find_element_by_id('phonenumber')
            password = spider.driver.find_element_by_id('password')
            username.send_keys('18030535053')
            password.send_keys('123456')
            # click the "login" button
            spider.driver.find_element_by_id('sub_bt').click()
            time.sleep(1)
            spider.driver.get(request.url)
            time.sleep(3)
            spider.cookies = spider.driver.get_cookies()
            time.sleep(1)
            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')
            # not a login request
            # else:
            #     req = requests.session()  # session
            #     for cookie in spider.cookies:
            #         req.cookies.set(cookie['name'], cookie["value"])
            #     req.headers.clear()  # clear the headers
            #     newpage = req.get(request.url)
            #     time.sleep(5)
            #     return HtmlResponse(url=request.url,  # current URL
            #                         body=newpage.text,  # page source
            #                         encoding="utf-8", request=request)  # return the page

        if spider.name == 'scjuchuang_yxzq':
            # check whether this is the login request
            # if request.url.find('login') != -1:
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
            spider.driver.get('https://www.scjuchuang.com/login')
            # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
            time.sleep(2)
            # fill in the account name and password
            username = spider.driver.find_element_by_class_name('loginName')
            password = spider.driver.find_element_by_class_name('loginPassword')
            username.send_keys('yczs123')
            password.send_keys('123456')
            # click the "login" button
            spider.driver.find_element_by_class_name('loginBtn').click()
            time.sleep(1)
            spider.driver.get('https://www.scjuchuang.com/goods?attr=1&page=1')
            # spider.driver.find_element_by_link_text('院线专区').click()
            spider.cookies = spider.driver.get_cookies()
            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')

        elif spider.name == 'rjyiyao_xpsj':
            # check whether this is the login request
            # if request.url.find('login') != -1:
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe", chrome_options=chrome_options)
            spider.driver.get('http://new.rjyiyao.com/web/login')
            # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
            time.sleep(2)
            # fill in the account name and password
            username = spider.driver.find_element_by_id('username')
            password = spider.driver.find_element_by_id('password')
            username.send_keys('18030535053')
            password.send_keys('123456')
            # click the "login" button
            spider.driver.find_element_by_id('btnLogin').click()
            time.sleep(1)
            # spider.driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[3]/div[2]/a[2]/img').click()  # "new arrivals" section
            # windows = spider.driver.window_handles
            # spider.driver.switch_to.window(windows[1])  # switch to the second window
            spider.driver.get('http://new.rjyiyao.com/web/product/group/5?page=1')
            time.sleep(5)
            spider.cookies = spider.driver.get_cookies()
            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')

        elif spider.name == 'rjyiyao_zkzq':
            # check whether this is the login request
            # if request.url.find('login') != -1:
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe", chrome_options=chrome_options)
            spider.driver.get('http://new.rjyiyao.com/web/login')
            # spider.driver.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a').click()
            time.sleep(1)
            # fill in the account name and password
            username = spider.driver.find_element_by_id('username')
            password = spider.driver.find_element_by_id('password')
            username.send_keys('18030535053')
            password.send_keys('123456')
            # click the "login" button
            spider.driver.find_element_by_id('btnLogin').click()
            time.sleep(2)
            spider.driver.get('http://new.rjyiyao.com/web/product/sale/3?page=1')
            # spider.driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[3]/div[2]/a[2]/img').click()  # "new arrivals" section
            # windows = spider.driver.window_handles
            # spider.driver.switch_to.window(windows[1])  # switch to the second window
            time.sleep(5)
            spider.cookies = spider.driver.get_cookies()
            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')

        elif spider.name == 'sckxyy_ypzq':
            # check whether this is the login request
            # if request.url.find('login') != -1:
            spider.driver = webdriver.Chrome(executable_path="C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe")
            spider.driver.get('http://www.sckxyy.com/Login.html')
            time.sleep(2)
            # fill in the account name and password
            username = spider.driver.find_element_by_id('usernameLogin')
            password = spider.driver.find_element_by_id('passwordLogin')
            username.send_keys('bianyuantianshi')
            password.send_keys('123456')
            # click the "login" button
            spider.driver.find_element_by_id('userLogin').click()
            time.sleep(1)
            spider.cookies = spider.driver.get_cookies()
            spider.driver.get('http://www.sckxyy.com/Drug_zone.html#Monday-bg-two')
            # spider.driver.find_element_by_link_text('普药专区').click()  # "general drugs" section
            # time.sleep(5)
            # windows = spider.driver.window_handles
            # spider.driver.switch_to.window(windows[1])  # switch to the second window

            return HtmlResponse(url=spider.driver.current_url,  # URL after login
                                body=spider.driver.page_source,  # HTML source
                                encoding='utf-8')
Example #60
0
    def test_restrict_xpaths_with_html_entities(self):
        html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='iso8859-15')
        links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
        self.assertEqual(links,
                         [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text=u'text')])
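For reference, SgmlLinkExtractor was deprecated and later removed from Scrapy; the same scenario can be expressed with the generic LinkExtractor. A sketch, not part of the original test suite:

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='iso8859-15')
links = LinkExtractor(restrict_xpaths='//p').extract_links(response)
print(links)  # the href entities are resolved and percent-encoded, as asserted above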