Example no. 1
0
 def parse_item(self, response):
     """Parse one Hexun blog article page into a HexunItem.

     Extracts the article title and URL from the page, then fetches the
     click-tracking endpoint embedded in the page source to obtain the
     article's click (read) count and comment count.

     Raises IndexError if the tracking URL or either counter cannot be
     found in the fetched data.
     """
     item = HexunItem()
     item["title"] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()
     item["link"] = response.url
     # The click/comment counters live behind a separate tracking URL that
     # is embedded in the page source; pull it out with a regex.
     pat_link = '(http://click.tool.hexun.com/click.aspx\?articleid=.*?)"'
     click_link = re.compile(pat_link, re.S).findall(str(response.body))[0]
     headers = {
         'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
         'Referer': response.url
     }
     opener = urllib.request.build_opener()
     # addheaders expects a list of (name, value) tuples — no manual loop needed.
     opener.addheaders = list(headers.items())
     urllib.request.install_opener(opener)
     click_data = urllib.request.urlopen(click_link).read().decode("utf-8", 'ignore')
     # The endpoint returns JavaScript that assigns the counters via innerHTML;
     # capture the assigned values.
     pat_click = '"articleClickCount"\).innerHTML = (.*?);'
     pat_comment = '"articleCommentCount"\).innerHTML = (.*?);'
     click = re.compile(pat_click, re.S).findall(click_data)[0]
     comment = re.compile(pat_comment, re.S).findall(click_data)[0]
     print(click, comment)
     item["click"] = click
     item["comment"] = comment
     return item
Example no. 2
0
    def detail_parse(self, response):
        """Parse a JSONP article-list response and yield a HexunItem of URLs.

        The response body is wrapped in a JSONP callback such as
        ``hx_json11587781686930(...)``; the fixed offsets below strip that
        wrapper before JSON-decoding.

        NOTE(review): the 23/-4 slice assumes the callback prefix/suffix
        lengths never change — confirm against the live endpoint.
        """
        item = HexunItem()
        # Drop the JSONP callback wrapper (23 leading and 4 trailing chars).
        payload = response.text[23:-4]
        json_obj = json.loads(payload)
        # Collect every result entry's article URL.
        item['url'] = [str(entry['entityurl']) for entry in json_obj['result']]
        yield item
Example no. 3
0
 def data(self, response):
     """Populate a HexunItem from an article page plus its request meta.

     Title and stock code are carried over from the requesting page via
     ``response.meta``; time, author and body text are scraped from the
     article page itself.
     """
     item = HexunItem()
     meta = response.meta
     item['url'] = response.url
     item['title'] = meta['title']
     item['stock'] = meta['stock']
     published = response.xpath("//span[@class='pr20']/text()").extract()
     item['time'] = published[0]
     authors = response.xpath("//*[@rel='nofollow']/text()").extract()
     item['author'] = authors[0]
     paragraphs = response.xpath("//div[@class='art_contextBox']/p/text()").extract()
     item['text'] = ''.join(paragraphs)
     yield item
Example no. 4
0
 def parse(self, response):
     """Yield one HexunItem per currency-pair row of the bank rate table."""
     soup = BeautifulSoup(response.body, "html.parser")
     rate_table = soup.body.find(id='BankNameList')
     for cell in rate_table.find_all('td', class_='fir_td'):
         pair_text = cell.div.string.strip()
         rate_text = cell.find(class_='pere').em.string.strip()
         print(pair_text)
         print(rate_text)
         # The pair reads like "FROM/TO"; split it into its two legs.
         currencies = pair_text.split('/')
         item = HexunItem()
         item['from_currency'] = currencies[0]
         item['to_currency'] = currencies[1]
         item['rate'] = rate_text
         yield item
Example no. 5
0
 def parseItem(self, response):
     """Yield one HexunItem per quote row of the JSONP contract feed.

     Each row is a positional record: time, price, amount, volume,
     average price, open interest — presumably in that order; verify
     against the feed if the mapping looks wrong.
     """
     # Peel the JSONP punctuation off both ends before JSON-decoding.
     payload = response.body_as_unicode().strip(';').strip('(').strip(')')
     rows = json.loads(payload)['Data'][0]
     contract = self.getContractName(response)
     for row in rows:
         record = HexunItem()
         record['product'] = contract
         record['dateTime'] = row[0]
         record['price'] = row[1]
         record['amount'] = row[2]
         record['volumn'] = row[3]
         record['avePrice'] = row[4]
         record['openInterest'] = row[5]
         yield record
Example no. 6
0
 def parse(self, response):
     """Parse one Hexun blog list page.

     Yields a HexunItem holding the post names and links plus the per-post
     hit and comment counts (fetched from the click-tracking endpoint),
     then follows every remaining list page of the blog.

     Fix: previously, when the tracking URL could not be extracted the code
     printed a warning but still called ``urlopen("")``, which raised
     ValueError and aborted the whole parse (no item, no pagination).  The
     fetch is now guarded; the item is yielded with empty counter lists.
     """
     item = HexunItem()
     body = str(response.body)
     # Post titles and links on this list page.
     item['name'] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()
     item["url"] = response.xpath("//span[@class='ArticleTitleText']/a/@href").extract()
     # The hit/comment counters are served by a separate tracking script;
     # extract its URL.  Edit the pattern if the page structure changes.
     pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
     urls = re.compile(pat1).findall(body)
     hcurl = urls[0] if urls else ""
     if hcurl:
         # Fetch the counter script while masquerading as a browser.
         headers2 = ("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
         opener = urllib.request.build_opener()
         opener.addheaders = [headers2]
         # Install the opener globally so urlopen below uses the headers.
         urllib.request.install_opener(opener)
         data = urllib.request.urlopen(hcurl).read()
         # Read counts ("click...','N'") and comment counts ("comment...','N'").
         pat2 = "click\d*?','(\d*?)'"
         pat3 = "comment\d*?','(\d*?)'"
         item["hits"] = re.compile(pat2).findall(str(data))
         item["comment"] = re.compile(pat3).findall(str(data))
     else:
         print("Extracted nothing!")
         item["hits"] = []
         item["comment"] = []
     yield item
     # Total number of list pages: the second-to-last "/pN/" link in the page.
     pat4 = "blog.hexun.com/p(.*?)/"
     data2 = re.compile(pat4).findall(body)
     if len(data2) >= 2:
         totalurl = data2[-2]
     else:
         totalurl = 1
     # Crawl the remaining list pages (this response is page 1).
     for i in range(2, int(totalurl) + 1):
         nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
         yield Request(nexturl, callback=self.parse, headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"})
Example no. 7
0
    def parse_item(self, response):
        """Build a HexunItem for one article, including click/comment counts.

        The counts come from a second HTTP request to the click-tracking
        endpoint, which is addressed by the blog id and article id scraped
        from the page.
        """
        item = HexunItem()
        title_nodes = response.xpath("//span[@class='ArticleTitleText']/a/text()")
        item['title'] = title_nodes.extract()[0]
        item['link'] = response.url
        item['blog_id'] = response.xpath("//script").re('ARecommend.aspx\?blogid=(.*?)&', re.S)[0]
        item['article_id'] = response.xpath("//span[@class='ArticleTitleText']/a/@href").re('blog.hexun.com/(.*?)_d.html')[0]

        cc_url = "http://click.tool.hexun.com/click.aspx?articleid=%s&blogid=%s" % (item['article_id'], item['blog_id'])
        print('url', cc_url)
        headers = {
            'Referer': response.url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        # The endpoint replies with JavaScript that assigns the counters.
        data = requests.get(cc_url, headers=headers).text
        print(data)
        item['comment'] = re.compile('articleCommentCount.*?= (.*?);', re.S).findall(data)[0]
        item['click'] = re.compile('articleClickCount.*?= (.*?);', re.S).findall(data)[0]
        return item