Esempio n. 1
0
 def parse(self, response):
     """Parse a thepaper.cn article page.

     Extracts the comment count (expanding abbreviated counts such as
     '1.2k') and, when the URL carries a numeric content id, follows the
     favourite-count JSON endpoint; otherwise yields the item directly.

     Fix: regex patterns are now raw strings — ``'.*?(\\d+)'`` in a plain
     literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
     """
     item = SomenewItem()
     url = response.url
     item['article_id'] = get_md5(url)
     com_num = response.xpath(
         "//h2[@id='comm_span']/span/text()").extract_first()
     if com_num is None:
         com_num = 0
     else:
         # Strip the "(...)" wrapper and layout whitespace, e.g. "(12)" -> "12".
         com_num = com_num.replace("(", '').replace(")", '').replace(
             "\n", '').replace("\t", '')
     # Counts like "1.2k" are abbreviated thousands; expand to an integer.
     com_int = re.match(r"(.*)k$", str(com_num))
     if com_int is not None:
         item['comm_num'] = int(float(com_int.group(1)) * 1000)
     else:
         item['comm_num'] = com_num
     item['read_num'] = '0'
     item['env_num'] = '0'
     # First run of digits in the URL is the content id used by the vote API.
     conid = re.match(r'.*?(\d+)', url)
     if conid:
         conid = conid.group(1)
         fav_url = 'https://www.thepaper.cn/cont_vote_json.jsp?contid=' + str(
             conid)
         yield scrapy.Request(fav_url,
                              callback=self.get_fav_num,
                              meta={"item": item})
     else:
         # No content id: no favourite count available, emit zeros now.
         item['fav_num'] = 0
         item['hot_value'] = int(item['fav_num']) + int(item['comm_num'])
         yield item
Esempio n. 2
0
    def get_detail(self, response):
        """Parse one 河北日报 (Hebei Daily) e-paper article into a SomenewItem.

        Yields the item only when both a title and body text were extracted.
        """
        print(response.url, '我是响应的rul')
        item = SomenewItem()
        item['title'] = response.xpath(
            '//*[@id="Title"]/text()').extract_first()
        # NOTE(review): this replaces EVERY '8' in the 9-char URL date slug
        # with '8/' — presumably meant to split something like '201908dd'
        # into a dated path, but it would also mangle days/years containing
        # an '8'. Confirm against real URLs before trusting 'time'.
        item['time'] = response.url.split('/paper/c/')[1][:9].replace(
            '8', '8/')
        item['content'] = response.xpath(
            '//*[@id="ozoom"]/founder-content/p/text()').extract()
        item['come_from'] = '河北日报'
        if item['title'] and item['content']:
            item['url'] = response.url
            # Join paragraphs and strip full-width / layout whitespace.
            item['content'] = ''.join(item['content']).replace(
                '\u3000',
                u' ').replace('\xa0', u' ').replace('\n', '').replace(
                    '\u2002', '').replace('\t', '').replace('\r', '').strip()
            # article_id is the MD5 of the article URL (dedup key).
            m = hashlib.md5()
            m.update(str(item['url']).encode('utf8'))
            item['article_id'] = m.hexdigest()
            item['media'] = '河北日报'
            item['create_time'] = datetime.datetime.now().strftime(
                '%Y/%m/%d %H:%M:%S')
            # Counters are unknown for e-paper pages; default to zero strings.
            item['comm_num'] = "0"
            item['fav_num'] = '0'
            item['read_num'] = '0'
            item['env_num'] = '0'
            item['media_type'] = '网媒'
            item['addr_province'] = '河北省'
            # print('燕赵都市报' * 100)

            yield item
Esempio n. 3
0
 def get_detail(self, response):
     """Parse a 邯郸新闻网 article page into a SomenewItem.

     Yields the item only when both title and content were found.

     Fix: the bare ``except:`` around the source extraction is narrowed to
     ``IndexError`` (the only exception ``extract()[0]`` raises), so real
     bugs are no longer silently swallowed.
     """
     print(response.url, '我是响应的rul')
     item = SomenewItem()
     item['title'] = response.xpath(
         '/html/body/div[3]/h1/text()').extract_first()
     item['time'] = response.xpath(
         '//*[@id="top_bar"]/div/div[2]/span[1]/text()').extract_first()
     item['content'] = response.xpath(
         '//*[@id="artibody"]/p/text()').extract()
     try:
         item['come_from'] = response.xpath(
             '//*[@id="top_bar"]/div/div[2]/a/text()').extract()[0]
     except IndexError:
         # Source link missing on some pages; leave 'come_from' unset.
         pass
     if item['title'] and item['content']:
         item['url'] = response.url
         # Join paragraphs and strip full-width / layout whitespace.
         item['content'] = ''.join(item['content']).replace(
             '\u3000',
             u' ').replace('\xa0', u' ').replace('\n', '').replace(
                 '\u2002', '').replace('\t', '').replace('\r', '').strip()
         # article_id is the MD5 of the article URL (dedup key).
         m = hashlib.md5()
         m.update(str(item['url']).encode('utf8'))
         item['article_id'] = m.hexdigest()
         item['media'] = '邯郸新闻网'
         item['create_time'] = datetime.datetime.now().strftime(
             '%Y/%m/%d %H:%M:%S')
         item['comm_num'] = "0"
         item['fav_num'] = '0'
         item['read_num'] = '0'
         item['env_num'] = '0'
         item['media_type'] = '网媒'
         item['addr_province'] = '河北省'
         print('邯郸新闻网' * 100)
         yield item
Esempio n. 4
0
 def get_detail(self, response):
     """Parse a 环渤海新闻网 (Huanbohai News) article page into a SomenewItem.

     Emits the item only when both a title and body paragraphs were found.
     """
     print(response.url, '我是响应的rul')
     item = SomenewItem()
     item['title'] = response.xpath(
         '/html/body/table[3]/tr[2]/td[1]/table[2]/tr/td/table[1]/tr/td/div/h1/text()'
     ).extract()[0]
     item['time'] = response.xpath(
         '//td[@class="STYLE2 zi12"]/text()').extract()[1]
     item['come_from'] = response.xpath(
         '//td[5]/a/span/text()').extract_first()
     item['content'] = response.xpath('//td/p/text()').extract()
     if item['title'] and item['content']:
         # Keep only the text after the '来源:' label / after the first CRLF.
         item['come_from'] = item['come_from'].split('来源:')[1]
         item['time'] = item['time'].split('\r\n')[1]
         item['url'] = response.url
         # Join paragraphs, then strip full-width and layout whitespace.
         cleaned = ''.join(item['content'])
         for bad, good in (('\u3000', u' '), ('\xa0', u' '), ('\n', ''),
                           ('\u2002', ''), ('\t', ''), ('\r', '')):
             cleaned = cleaned.replace(bad, good)
         item['content'] = cleaned.strip()
         # article_id is the MD5 of the article URL (dedup key).
         digest = hashlib.md5()
         digest.update(str(item['url']).encode('utf8'))
         item['article_id'] = digest.hexdigest()
         item['media'] = '环渤海新闻网'
         item['create_time'] = datetime.datetime.now().strftime(
             '%Y/%m/%d %H:%M:%S')
         item['comm_num'] = "0"
         item['fav_num'] = '0'
         item['read_num'] = '0'
         item['env_num'] = '0'
         item['media_type'] = '网媒'
         item['addr_province'] = '河北省'
         item['addr_city'] = '唐山'
         print('环渤海新闻网' * 100)
         yield item
Esempio n. 5
0
    def get_detail(self, response):
        """Parse a 邢台网 (Xingtai Net) article page into a SomenewItem.

        Yields the item only when both title and content were found.
        """
        print(response.url, '我是响应的rul')
        item = SomenewItem()
        item['title'] = response.xpath('/html/body/div[4]/h1/text()').extract_first()
        # Time and source share the same header line under the headline.
        header_line = response.xpath('/html/body/div[4]/div/div/text()').extract()[0]
        item['time'] = header_line
        item['come_from'] = header_line
        item['content'] = response.xpath('//*[@id="rwb_zw"]/p/text()').extract()

        if item['title'] and item['content']:
            # Left half of the header, first 19 chars: 'YYYY-MM-DD HH:MM:SS'.
            item['time'] = item['time'].split('\xa0\xa0')[0].strip().split('\r\n')[0][0:19]
            print(len(item['time']))
            # Right half of the header carries the '来源:<source>' label.
            item['come_from'] = item['come_from'].split('\xa0\xa0')[1].strip().split('来源:')[1]
            item['url'] = response.url
            # Join paragraphs, then strip full-width and layout whitespace.
            body_text = ''.join(item['content'])
            for bad, good in (('\u3000', u' '), ('\xa0', u' '), ('\n', ''),
                              ('\u2002', ''), ('\t', ''), ('\r', '')):
                body_text = body_text.replace(bad, good)
            item['content'] = body_text.strip()
            # article_id is the MD5 of the article URL (dedup key).
            digest = hashlib.md5()
            digest.update(str(item['url']).encode('utf8'))
            item['article_id'] = digest.hexdigest()
            item['media'] = '邢台网'
            item['create_time'] = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
            item['comm_num'] = "0"
            item['fav_num'] = '0'
            item['read_num'] = '0'
            item['env_num'] = '0'
            item['media_type'] = '网媒'
            item['addr_province'] = '河北省'
            item['addr_city'] = '邢台'
            print('邢台网'*100)
            print(item)
            yield item
Esempio n. 6
0
 def get_detail(self, response):
     """Parse a 河北青年报 article page into a SomenewItem and yield it.

     Fix: the fully populated item was built but never yielded, so this
     callback silently produced nothing — every sibling spider yields the
     item after populating it; this one now does too.
     """
     print(response.url, '我是响应的rul')
     item = SomenewItem()
     item['title'] = response.xpath('//div[1]/h1/text()').extract_first()
     item['time'] = response.xpath(
         '//div[2]/div[1]/div/span[2]/text()').extract_first()
     item['come_from'] = response.xpath(
         '//div[2]/div[1]/div/span[1]/text()').extract()
     item['content'] = response.xpath(
         '//div[1]/div[2]/div[2]/p/text()').extract()
     if item['title'] and item['content']:
         # Keep the text after the '来源:' (source) label.
         item['come_from'] = item['come_from'][0].split('来源:')[1]
         item['url'] = response.url
         # Join paragraphs and strip full-width / layout whitespace.
         item['content'] = ''.join(item['content']).replace(
             '\u3000',
             u' ').replace('\xa0', u' ').replace('\n', '').replace(
                 '\u2002', '').replace('\t', '').replace('\r', '').strip()
         # article_id is the MD5 of the article URL (dedup key).
         m = hashlib.md5()
         m.update(str(item['url']).encode('utf8'))
         item['article_id'] = m.hexdigest()
         item['media'] = '河北青年报'
         item['create_time'] = datetime.datetime.now().strftime(
             '%Y/%m/%d %H:%M:%S')
         item['comm_num'] = "0"
         item['fav_num'] = '0'
         item['read_num'] = '0'
         item['env_num'] = '0'
         item['media_type'] = '网媒'
         item['addr_province'] = '河北省'
         print('河北青年报' * 100)
         yield item
Esempio n. 7
0
 def get_detail(self, response):
     """Parse a 石家庄新闻网 article page into a SomenewItem.

     Fix: the title extraction used a bare ``except: pass``; when the h2
     node was missing, ``item['title']`` stayed unset and the later
     ``if item['title']`` guard raised KeyError (scrapy Items raise on
     unset fields). We now catch only IndexError and default the title to
     None so the guard skips the page cleanly.
     """
     print(response.url, '我是响应的rul')
     item = SomenewItem()
     try:
         item['title'] = response.xpath('//div[1]/h2/text()').extract()[0].strip()
     except IndexError:
         # No headline node on this layout; guard below skips the item.
         item['title'] = None
     item['time'] = response.xpath('//div[@class="info"]/text()').extract()
     item['come_from'] = '石家庄新闻网'
     item['content'] = response.xpath('/html/body/div/div[5]/div[1]/div[1]/div[5]/p/text()|//div[1]/div/div/p/text()').extract()
     if item['title'] and item['content']:
         # Scan the info strings for a 'YYYY-MM-DD HH:MM' timestamp.
         for i in item['time']:
             data = re.findall(r'(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})', i)
             if data:
                 item['time'] = data[0]
         item['url'] = response.url
         # Join paragraphs and strip full-width / layout whitespace.
         item['content'] = ''.join(item['content']).replace(
             '\u3000',
             u' ').replace('\xa0', u' ').replace('\n', '').replace(
                 '\u2002', '').replace('\t', '').replace('\r', '').strip()
         # article_id is the MD5 of the article URL (dedup key).
         m = hashlib.md5()
         m.update(str(item['url']).encode('utf8'))
         item['article_id'] = m.hexdigest()
         item['media'] = '石家庄新闻网'
         item['create_time'] = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
         item['comm_num'] = "0"
         item['fav_num'] = '0'
         item['read_num'] = '0'
         item['env_num'] = '0'
         item['media_type'] = '网媒'
         item['addr_province'] = '河北省'
         item['addr_city'] = '河北省'
         print('石家庄新闻网' * 100)
         yield item
Esempio n. 8
0
 def parse(self, response):
     """Look up the page's data-id and request the up-vote count endpoint.

     Fix: the local variable ``id`` shadowed the builtin; renamed.
     """
     data_id = response.xpath(
         "//div[@class='attitude']/span/@data-id").extract_first()
     # NOTE(review): when the attribute is missing this builds '...?id=None';
     # presumably the endpoint tolerates it — confirm against the API.
     upnum_url = 'http://www.bjnews.com.cn/webapi/getupnum?id=' + str(data_id)
     item = SomenewItem()
     item['article_id'] = get_md5(response.url)
     yield scrapy.Request(upnum_url,
                          callback=self.get_comment_num,
                          meta={'item': item})
Esempio n. 9
0
 def parse(self, response):
     """Find the comment-feed URL embedded in the page and request it.

     Fix: ``content.group(1)`` raised AttributeError whenever the
     ``getcomments:'...'`` fragment was absent; we now return early so a
     page without a comment feed is skipped instead of crashing.
     """
     content = re.search("getcomments:'(.*?)'", response.text, re.S)
     if content is None:
         # No comment feed on this page; nothing to follow up.
         return
     url = content.group(1)
     # The page embeds a protocol-relative URL; prefix the scheme.
     comment_url = 'http:' + url
     item = SomenewItem()
     item['article_id'] = get_md5(response.url)
     yield scrapy.Request(comment_url,
                          callback=self.get_comment_num,
                          meta={'item': item})
Esempio n. 10
0
 def parse(self, response):
     """Request the Huanqiu comment API for this article.

     Builds the API query from the page URL, the headline and the
     'contentid' meta tag, then hands the counting to get_com_num.
     """
     item = SomenewItem()
     page_url = str(response.url)
     title = response.xpath("//div[@class='l_a']/h1/text()").extract_first()
     item['article_id'] = get_md5(page_url)
     sourceid = response.xpath(
         "//meta[@name='contentid']/@content").extract_first()
     com_url = ('https://commentn.huanqiu.com/api/v2/async?a=comment'
                '&m=source_info&appid=e8fcff106c8f&sourceid=' + sourceid
                + '&url=' + page_url + '&title=' + title)
     yield scrapy.Request(com_url,
                          callback=self.get_com_num,
                          meta={'item': item})
Esempio n. 11
0
 def parse(self, response):
     """Extract NetEase's productKey/docId pair and request the comment API.

     Security fix: the captured fragment came straight from the fetched
     page, and ``eval()`` on page-controlled text can execute arbitrary
     code. ``ast.literal_eval`` parses the same dict literal safely.
     """
     import ast
     content = re.findall(r'("productKey".*)?"target"', response.text, re.S)
     content = ''.join(content).replace('\n', '').replace(' ', '')
     con = "{" + content + "}"
     # Fragment is a dict body like
     # {'productKey': 'a2869674571f77b5a0867c3d71db5856', 'docId': 'E9UT79BB0001875P'}
     con = ast.literal_eval(con)
     item = SomenewItem()
     comment_url = 'https://comment.api.163.com/api/v1/products/{productKey}/threads/{docId}?ibc=jssdk'.format(
         productKey=con['productKey'], docId=con['docId'])
     item['article_id'] = get_md5(response.url)
     yield scrapy.Request(url=comment_url,
                          callback=self.get_comment_num,
                          meta={'item': item})
Esempio n. 12
0
 def parse(self, response):
     """Scrape the embedded comments_count from the page source.

     Fix: the regex is now a raw string — ``'\\:'`` in a plain literal is
     an invalid escape sequence (SyntaxWarning on Python 3.12+); the
     pattern itself is unchanged.
     """
     fullText = response.body.decode()
     # print(fullText)
     item = SomenewItem()
     comm_re = re.search(r'comments_count\: (\d+)', fullText, re.S)
     if comm_re:
         item['comm_num'] = comm_re.group(1)
     else:
         item['comm_num'] = 0
     item['read_num'] = 0
     item['fav_num'] = 0
     item['env_num'] = 0
     # NOTE(review): comm_num is a str when matched, so hot_value can be a
     # string here; the sibling spider uses int() — confirm downstream.
     item['hot_value'] = item['comm_num']
     item['article_id'] = get_md5(response.url)
     # print(item)
     yield item
Esempio n. 13
0
 def get_detail(self, response):
     """Parse a 衡水新闻网 article page into a SomenewItem.

     Yields the item only when both title and content were found.
     """
     print(response.url, '我是响应的rul')
     item = SomenewItem()
     item['title'] = response.xpath(
         '/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]/tr[1]/td/div/h2/text()'
     ).extract_first()
     # 'time' and 'come_from' read the same header row; they are split
     # apart below (timestamp via regex, source via positional index).
     item['time'] = response.xpath(
         '/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]/tr[2]/td/div/text()'
     ).extract()
     item['come_from'] = response.xpath(
         '/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]/tr[2]/td/div/text()'
     ).extract()
     item['content'] = response.xpath(
         '/html/body/div[1]/table/tr/td/table/tr/td/table/tr/td/p/span/text()\
     |/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]/tr[3]/td/text()|/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]/tr[3]/td/div/text()'
     ).extract()
     if item['title'] and item['content']:
         # NOTE(review): positional index [2] assumes a fixed header layout
         # and will raise IndexError on pages with fewer text nodes — confirm.
         item['come_from'] = item['come_from'][2]
         # Scan the header text nodes for a 'YYYY-MM-DD HH:MM' timestamp.
         for node in item['time']:
             data = re.findall(r'(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})',
                               node)
             if data:
                 item['time'] = data[0]
         item['url'] = response.url
         # Join paragraphs and strip full-width / layout whitespace.
         item['content'] = ''.join(item['content']).replace(
             '\u3000',
             u' ').replace('\xa0', u' ').replace('\n', '').replace(
                 '\u2002', '').replace('\t', '').replace('\r', '').strip()
         # article_id is the MD5 of the article URL (dedup key).
         m = hashlib.md5()
         m.update(str(item['url']).encode('utf8'))
         item['article_id'] = m.hexdigest()
         item['media'] = '衡水新闻网'
         item['create_time'] = datetime.datetime.now().strftime(
             '%Y/%m/%d %H:%M:%S')
         item['comm_num'] = "0"
         item['fav_num'] = '0'
         item['read_num'] = '0'
         item['env_num'] = '0'
         item['media_type'] = '网媒'
         item['addr_province'] = '河北省'
         item['addr_city'] = '衡水'
         print('衡水新闻网' * 100)
         yield item
         print(item)
Esempio n. 14
0
 def get_detail(self, response):
     """Parse a 长城网 (Great Wall Net) article page into a SomenewItem.

     Yields the item only when both title and content were found.
     """
     print(response.url, '我是响应的rul')
     item = SomenewItem()
     item['title'] = response.xpath('//h1/text()').extract_first()
     item['time'] = response.xpath(
         '//div[@class="post_source"]/text()').extract()
     item['come_from'] = response.xpath(
         '//div[@class="post_source"]/text()[2]|//div[@class="post_source"]/a/text()'
     ).extract()
     item['content'] = response.xpath('//p/text()').extract()
     if item['title'] and item['content']:
         # The source appears either as a bare text node ('\r\n<source>\n...')
         # or as a link text; disambiguate by how many nodes matched.
         if len(item['come_from']) == 1:
             item['come_from'] = item['come_from'][0].split(
                 '\r\n')[1].split('\n')[0]
         elif len(item['come_from']) == 2:
             item['come_from'] = item['come_from'][1]
         else:
             # Unexpected layout: leave the raw node list untouched.
             pass
         # Scan the source-line text nodes for a 'YYYY-MM-DD HH:MM' timestamp.
         for node in item['time']:
             data = re.findall(r'(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})',
                               node)
             if data:
                 item['time'] = data[0]
         item['url'] = response.url
         # Join paragraphs and strip full-width / layout whitespace.
         item['content'] = ''.join(item['content']).replace(
             '\u3000',
             u' ').replace('\xa0', u' ').replace('\n', '').replace(
                 '\u2002', '').replace('\t', '').replace('\r', '').strip()
         # article_id is the MD5 of the article URL (dedup key).
         m = hashlib.md5()
         m.update(str(item['url']).encode('utf8'))
         item['article_id'] = m.hexdigest()
         item['media'] = '长城网'
         item['create_time'] = datetime.datetime.now().strftime(
             '%Y/%m/%d %H:%M:%S')
         item['comm_num'] = "0"
         item['fav_num'] = '0'
         item['read_num'] = '0'
         item['env_num'] = '0'
         item['media_type'] = '网媒'
         item['addr_province'] = '河北省'
         # NOTE(review): media is '长城网' but this debug line prints
         # '河北新闻网' — one of the two names is likely a copy-paste slip.
         print('河北新闻网' * 100)
         yield item
Esempio n. 15
0
    def parse(self, response):
        """Read Sina's 'sudameta' comment parameters from the page and
        request the comment-count API for this article.
        """
        item = SomenewItem()
        item['article_id'] = get_md5(response.url)
        # The second sudameta tag normally holds
        # 'comment_channel:<ch>;comment_id:<id>'.
        meta_text = response.xpath(
            "//meta[@name='sudameta'][2]/@content").extract_first()
        if meta_text == 'sinaog:0':
            # Some layouts swap the two tags; fall back to the first one.
            meta_text = response.xpath(
                "//meta[@name='sudameta'][1]/@content").extract_first()
        params = {}
        for pair in meta_text.split(';'):
            params[pair.split(':')[0]] = pair.split(':')[1]
        com_url = ('http://comment5.news.sina.com.cn/page/info?version=1'
                   '&format=json&channel=' + params['comment_channel'] +
                   '&newsid=' + params['comment_id'] +
                   '&group=undefined&compress=0&ie=utf-8')

        yield scrapy.Request(com_url,
                             callback=self.get_com_num,
                             meta={"item": item})
Esempio n. 16
0
 def parse(self, response):
     """Extract the embedded comment / repost / attitude counters from the
     page source and combine them into a hot value.

     Fixes: the three regexes are now raw strings (``'\\"'`` / ``'\\:'`` in
     plain literals are invalid escapes — SyntaxWarning on Python 3.12+),
     and the triplicated search-or-zero logic is factored into a helper.
     """
     item = SomenewItem()
     html = response.body.decode()

     def count_of(pattern):
         # First captured count for `pattern`, or 0 when absent.
         found = re.search(pattern, html, re.S)
         return found.group(1) if found else 0

     item['comm_num'] = count_of(r'\"comments_count\"\: (\d+)')
     item['env_num'] = count_of(r'\"reposts_count\"\: (\d+)')
     item['fav_num'] = count_of(r'\"attitudes_count\"\: (\d+)')
     item['read_num'] = 0
     item['hot_value'] = int(item['comm_num']) + int(item['env_num']) + int(
         item['fav_num'])
     item['article_id'] = get_md5(response.url)
     yield item
Esempio n. 17
0
    def get_detail(self, response):
        """Parse a 河北新闻网 article page into a SomenewItem.

        Fixes: the bare ``except:`` clauses are narrowed to the exceptions
        the guarded code can actually raise, and the try/except around an
        ``extract_first()`` call (which returns None instead of raising) is
        removed, so genuine bugs are no longer silently swallowed.
        """
        print(response.url, '我是响应的rul')
        item = SomenewItem()
        item['title'] = response.xpath('//h1/text()').extract_first()
        try:
            item['time'] = response.xpath('//div[@class="post_source"]/text()').extract()[0]
        except IndexError:
            # Source line missing on this layout; 'time' stays unset and is
            # handled by the guarded split below.
            pass
        item['content'] = response.xpath('//div[@class="text"]/p/text()|//*[@id="content"]/p/span/text()|//*[@id="content"]/span/span/p/text()').extract()
        # extract_first() never raises — it returns None when nothing matches.
        item['come_from'] = response.xpath('//div[@class="post_source"]/a/text()|//div[@class="g_width content"]/div[1]/text()').extract_first()
        if item['title'] and item['content']:

            # Keep the text after the '来源:' label, before any newline.
            item['come_from'] = item['come_from'].split('来源:')[1].split('\n')[0]
            try:
                item['time'] = item['time'].split('\u3000')[0].split('\n')[1]
            except (KeyError, IndexError):
                # KeyError: 'time' never set above; IndexError: no '\n' part.
                pass
            item['url'] = response.url
            # Join paragraphs and strip full-width / layout whitespace.
            item['content'] = ''.join(item['content']).replace(
                '\u3000',
                u' ').replace('\xa0', u' ').replace('\n', '').replace(
                    '\u2002', '').replace('\t', '').replace('\r', '').strip()
            # article_id is the MD5 of the article URL (dedup key).
            m = hashlib.md5()
            m.update(str(item['url']).encode('utf8'))
            item['article_id'] = m.hexdigest()
            item['media'] = '河北新闻网'
            item['create_time'] = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
            item['comm_num'] = "0"
            item['fav_num'] = '0'
            item['read_num'] = '0'
            item['env_num'] = '0'
            item['media_type'] = '网媒'
            item['addr_province'] = '河北省'
            print('河北新闻网'*100)
            yield item
Esempio n. 18
0
 def parse(self, response):
     """QQ news: follow the comment-count API when a cmt_id is embedded in
     the page, otherwise emit an item with all counters zeroed.

     Fixes: ``re.match(...).group(1)`` crashed with AttributeError when
     the script block existed but carried no cmt_id — per the original
     comment, a missing cmt_id should mean comm_num = 0, which is now
     what happens. The pattern is also a raw string ('\\d' is an invalid
     escape in a plain literal — SyntaxWarning on Python 3.12+).
     """
     item = SomenewItem()
     url = response.url
     item['article_id'] = get_md5(url)
     html = response.xpath(
         "//*[@id='Main-Article-QQ']/div/div[1]/div[2]/script/text()"
     ).extract_first()
     cmt_id = None
     if html:
         html = html.replace("\n", '').replace(' ', '')
         matched = re.match(r'.*?cmt_id=(\d+).*', html)
         if matched:
             cmt_id = matched.group(1)
     if cmt_id:
         com_url = 'https://coral.qq.com/article/' + cmt_id + '/commentnum'
         yield scrapy.Request(com_url,
                              callback=self.get_comm_num,
                              dont_filter=True,
                              meta={'item': item})
     else:
         # No cmt_id found: the article has no comment thread.
         item['comm_num'] = 0
         item['fav_num'] = '0'
         item['read_num'] = '0'
         item['env_num'] = '0'
         item['hot_value'] = 0
         yield item