Example #1
def parse(self, response):
    item = SomenewItem()
    url = response.url
    item['article_id'] = get_md5(url)
    # Comment count as shown on the page, e.g. "(123)" or "1.2k"
    com_num = response.xpath(
        "//h2[@id='comm_span']/span/text()").extract_first()
    if com_num is None:
        com_num = 0
    else:
        com_num = com_num.replace("(", '').replace(")", '').replace(
            "\n", '').replace("\t", '')
    # Counts abbreviated as "1.2k" are expanded into an integer
    com_int = re.match(r"(.*)k$", str(com_num))
    if com_int is not None:
        item['comm_num'] = int(float(com_int.group(1)) * 1000)
    else:
        item['comm_num'] = int(com_num)
    item['read_num'] = '0'
    item['env_num'] = '0'
    conid = re.match(r'.*?(\d+)', url)
    if conid:
        conid = conid.group(1)
        fav_url = 'https://www.thepaper.cn/cont_vote_json.jsp?contid=' + str(conid)
        yield scrapy.Request(fav_url,
                             callback=self.get_fav_num,
                             meta={"item": item})
    else:
        item['fav_num'] = 0
        item['hot_value'] = int(item['fav_num']) + int(item['comm_num'])
        yield item
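The get_fav_num callback is not part of this snippet. A minimal sketch of what it might look like, mirroring the else branch above; the vote endpoint is assumed to return JSON, 'praiseTimes' is an assumed field name, and `import json` is needed at module level:

def get_fav_num(self, response):
    item = response.meta['item']
    try:
        data = json.loads(response.text)
        # 'praiseTimes' is a hypothetical key; adjust to the actual payload.
        item['fav_num'] = int(data.get('praiseTimes', 0))
    except (ValueError, TypeError):
        item['fav_num'] = 0
    item['hot_value'] = int(item['fav_num']) + int(item['comm_num'])
    yield item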
Example #2
def parse(self, response):
    # data-id of the article, used by the up-vote / comment-count API
    data_id = response.xpath(
        "//div[@class='attitude']/span/@data-id").extract_first()
    upnum_url = 'http://www.bjnews.com.cn/webapi/getupnum?id=' + str(data_id)
    item = SomenewItem()
    item['article_id'] = get_md5(response.url)
    yield scrapy.Request(upnum_url,
                         callback=self.get_comment_num,
                         meta={'item': item})
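All of these examples key articles by get_md5(url), but the helper itself is never shown. A minimal sketch, assuming it is simply an MD5 hex digest of the URL via hashlib:

import hashlib

def get_md5(url):
    # Hash the article URL so it can serve as a stable primary key.
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()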
Example #3
def parse(self, response):
    # The comment endpoint is embedded in the page source as "getcomments:'//...'"
    content = re.search(r"getcomments:'(.*?)'", response.text, re.S)
    if content is None:
        return
    comment_url = 'http:' + content.group(1)
    item = SomenewItem()
    item['article_id'] = get_md5(response.url)
    yield scrapy.Request(comment_url,
                         callback=self.get_comment_num,
                         meta={'item': item})
Example #4
def parse(self, response):
    item = SomenewItem()
    url = str(response.url)
    title = response.xpath("//div[@class='l_a']/h1/text()").extract_first()
    item['article_id'] = get_md5(url)
    sourceid = response.xpath(
        "//meta[@name='contentid']/@content").extract_first()
    # Build the comment-count API URL from the page's content id, url and title
    com_url = ('https://commentn.huanqiu.com/api/v2/async?a=comment'
               '&m=source_info&appid=e8fcff106c8f&sourceid=' + sourceid +
               '&url=' + url + '&title=' + title)
    yield scrapy.Request(com_url,
                         callback=self.get_com_num,
                         meta={'item': item})
Example #5
def parse(self, response):
    # Pull the productKey / docId pair that the page embeds in its inline JavaScript
    content = re.findall(r'("productKey".*)?"target"', response.text, re.S)
    content = ''.join(content).replace('\n', '').replace(' ', '')
    con = "{" + content + "}"
    # eval() of scraped text is risky; ast.literal_eval or json.loads would be safer.
    con = eval(
        con
    )  # {'productKey': 'a2869674571f77b5a0867c3d71db5856', 'docId': 'E9UT79BB0001875P'}
    item = SomenewItem()
    comment_url = ('https://comment.api.163.com/api/v1/products/{productKey}'
                   '/threads/{docId}?ibc=jssdk').format(
                       productKey=con['productKey'], docId=con['docId'])
    item['article_id'] = get_md5(response.url)
    yield scrapy.Request(url=comment_url,
                         callback=self.get_comment_num,
                         meta={'item': item})
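The get_comment_num callback is not shown here either. A minimal sketch under the assumption that the comment.api.163.com threads endpoint returns JSON and that 'tcount' holds the total comment count (field name assumed), filling the same default fields as Example #6; needs `import json`:

def get_comment_num(self, response):
    item = response.meta['item']
    try:
        data = json.loads(response.text)
        # 'tcount' (total comments) is an assumed field name; check the real payload.
        item['comm_num'] = int(data.get('tcount', 0))
    except ValueError:
        item['comm_num'] = 0
    item['read_num'] = 0
    item['fav_num'] = 0
    item['env_num'] = 0
    item['hot_value'] = item['comm_num']
    yield item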
Example #6
def parse(self, response):
    fullText = response.body.decode()
    item = SomenewItem()
    # The comment count is embedded in the page's inline JavaScript
    comm_re = re.search(r'comments_count: (\d+)', fullText, re.S)
    if comm_re:
        item['comm_num'] = int(comm_re.group(1))
    else:
        item['comm_num'] = 0
    item['read_num'] = 0
    item['fav_num'] = 0
    item['env_num'] = 0
    item['hot_value'] = item['comm_num']
    item['article_id'] = get_md5(response.url)
    yield item
Example #7
def parse(self, response):
    item = SomenewItem()
    url = response.url
    item['article_id'] = get_md5(url)
    # The sudameta meta tag carries the comment channel and id;
    # fall back to the first tag when the second one is the placeholder 'sinaog:0'.
    com_parm = response.xpath(
        "//meta[@name='sudameta'][2]/@content").extract_first()
    if com_parm == 'sinaog:0':
        com_parm = response.xpath(
            "//meta[@name='sudameta'][1]/@content").extract_first()
    com_parm_dic = {
        i.split(':')[0]: i.split(':')[1]
        for i in com_parm.split(';')
    }
    com_url = ('http://comment5.news.sina.com.cn/page/info?version=1&format=json'
               '&channel=' + com_parm_dic['comment_channel'] +
               '&newsid=' + com_parm_dic['comment_id'] +
               '&group=undefined&compress=0&ie=utf-8')

    yield scrapy.Request(com_url,
                         callback=self.get_com_num,
                         meta={"item": item})
Example #8
def parse(self, response):
    item = SomenewItem()
    html = response.body.decode()
    # Weibo-style counters embedded as JSON in the page source
    comm_num_re = re.search(r'"comments_count": (\d+)', html, re.S)
    if comm_num_re:
        item['comm_num'] = int(comm_num_re.group(1))
    else:
        item['comm_num'] = 0
    env_num_re = re.search(r'"reposts_count": (\d+)', html, re.S)
    if env_num_re:
        item['env_num'] = int(env_num_re.group(1))
    else:
        item['env_num'] = 0
    fav_num_re = re.search(r'"attitudes_count": (\d+)', html, re.S)
    if fav_num_re:
        item['fav_num'] = int(fav_num_re.group(1))
    else:
        item['fav_num'] = 0
    item['read_num'] = 0
    item['hot_value'] = item['comm_num'] + item['env_num'] + item['fav_num']
    item['article_id'] = get_md5(response.url)
    yield item
Example #9
def parse(self, response):
    item = SomenewItem()
    url = response.url
    item['article_id'] = get_md5(url)
    html = response.xpath(
        "//*[@id='Main-Article-QQ']/div/div[1]/div[2]/script/text()"
    ).extract_first()
    # If no cmt_id can be found, the article has no comment thread, so comm_num = 0
    cmt_id = None
    if html:
        html = html.replace("\n", '').replace(' ', '')
        cmt_id = re.match(r'.*?cmt_id=(\d+).*', html)
    if cmt_id:
        com_url = 'https://coral.qq.com/article/' + cmt_id.group(1) + '/commentnum'
        yield scrapy.Request(com_url,
                             callback=self.get_comm_num,
                             dont_filter=True,
                             meta={'item': item})
    else:
        item['comm_num'] = 0
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['hot_value'] = 0
        yield item
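The get_comm_num callback is again not shown. A minimal sketch, assuming the coral.qq.com commentnum endpoint returns JSON shaped roughly like {"data": {"commentnum": N}} (field names are assumptions) and mirroring the else branch above; needs `import json`:

def get_comm_num(self, response):
    item = response.meta['item']
    try:
        data = json.loads(response.text)
        # 'data' / 'commentnum' are assumed field names for this payload.
        item['comm_num'] = int(data.get('data', {}).get('commentnum', 0))
    except ValueError:
        item['comm_num'] = 0
    item['fav_num'] = '0'
    item['read_num'] = '0'
    item['env_num'] = '0'
    item['hot_value'] = int(item['comm_num'])
    yield item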