# Shared imports for the spider methods below (each method is excerpted from
# a different spider in the same project). SomenewItem and get_md5 are
# project-local; their import paths here are assumed, not confirmed.
import ast
import datetime
import hashlib
import json
import re

import scrapy

from ..items import SomenewItem  # assumed path to the project's item class
from ..utils import get_md5      # assumed path to the project's md5 helper


def parse(self, response):
    # thepaper.cn: read the on-page comment count, then fetch the vote count.
    item = SomenewItem()
    url = response.url
    item['article_id'] = get_md5(url)
    com_num = response.xpath(
        "//h2[@id='comm_span']/span/text()").extract_first()
    if com_num is None:
        com_num = 0
    else:
        com_num = com_num.replace("(", '').replace(")", '').replace(
            "\n", '').replace("\t", '')
    # Counts abbreviated like "1.2k" are expanded to plain integers.
    com_int = re.match(r"(.*)k$", str(com_num))
    if com_int is not None:
        item['comm_num'] = int(float(com_int.group(1)) * 1000)
    else:
        item['comm_num'] = com_num
    item['read_num'] = '0'
    item['env_num'] = '0'
    conid = re.match(r'.*?(\d+)', url)
    if conid:
        conid = conid.group(1)
        fav_url = ('https://www.thepaper.cn/cont_vote_json.jsp?contid='
                   + str(conid))
        yield scrapy.Request(fav_url,
                             callback=self.get_fav_num,
                             meta={"item": item})
    else:
        item['fav_num'] = 0
        item['hot_value'] = int(item['fav_num']) + int(item['comm_num'])
        yield item
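# The get_fav_num callback used above is not shown in this section. A minimal
# sketch, assuming cont_vote_json.jsp returns JSON with a numeric vote count;
# the 'praiseTimes' field name is a guess, not confirmed by the source.
def get_fav_num(self, response):
    item = response.meta['item']
    try:
        data = json.loads(response.text)
        item['fav_num'] = int(data.get('praiseTimes', 0))  # hypothetical field
    except (ValueError, TypeError):
        item['fav_num'] = 0
    item['hot_value'] = int(item['fav_num']) + int(item['comm_num'])
    yield item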
def get_detail(self, response): print(response.url, '我是响应的rul') item = SomenewItem() item['title'] = response.xpath( '//*[@id="Title"]/text()').extract_first() item['time'] = response.url.split('/paper/c/')[1][:9].replace( '8', '8/') item['content'] = response.xpath( '//*[@id="ozoom"]/founder-content/p/text()').extract() item['come_from'] = '河北日报' if item['title'] and item['content']: item['url'] = response.url item['content'] = ''.join(item['content']).replace( '\u3000', u' ').replace('\xa0', u' ').replace('\n', '').replace( '\u2002', '').replace('\t', '').replace('\r', '').strip() m = hashlib.md5() m.update(str(item['url']).encode('utf8')) item['article_id'] = m.hexdigest() item['media'] = '河北日报' item['create_time'] = datetime.datetime.now().strftime( '%Y/%m/%d %H:%M:%S') item['comm_num'] = "0" item['fav_num'] = '0' item['read_num'] = '0' item['env_num'] = '0' item['media_type'] = '网媒' item['addr_province'] = '河北省' # print('燕赵都市报' * 100) yield item
def get_detail(self, response):
    print(response.url, 'response url')
    item = SomenewItem()
    item['title'] = response.xpath(
        '/html/body/div[3]/h1/text()').extract_first()
    item['time'] = response.xpath(
        '//*[@id="top_bar"]/div/div[2]/span[1]/text()').extract_first()
    item['content'] = response.xpath(
        '//*[@id="artibody"]/p/text()').extract()
    # extract_first() replaces the original bare try/except around extract()[0].
    item['come_from'] = response.xpath(
        '//*[@id="top_bar"]/div/div[2]/a/text()').extract_first()
    if item['title'] and item['content']:
        item['url'] = response.url
        item['content'] = ''.join(item['content']).replace(
            '\u3000', u' ').replace('\xa0', u' ').replace('\n', '').replace(
                '\u2002', '').replace('\t', '').replace('\r', '').strip()
        m = hashlib.md5()
        m.update(str(item['url']).encode('utf8'))
        item['article_id'] = m.hexdigest()
        item['media'] = '邯郸新闻网'
        item['create_time'] = datetime.datetime.now().strftime(
            '%Y/%m/%d %H:%M:%S')
        item['comm_num'] = "0"
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['media_type'] = '网媒'
        item['addr_province'] = '河北省'
        print('邯郸新闻网' * 100)
        yield item
def get_detail(self, response):
    print(response.url, 'response url')
    item = SomenewItem()
    # extract_first() replaces the original extract()[0], which raised
    # IndexError on pages without a matching node.
    item['title'] = response.xpath(
        '/html/body/table[3]/tr[2]/td[1]/table[2]/tr/td/table[1]'
        '/tr/td/div/h1/text()').extract_first()
    item['time'] = response.xpath(
        '//td[@class="STYLE2 zi12"]/text()').extract()[1]
    item['come_from'] = response.xpath(
        '//td[5]/a/span/text()').extract_first()
    item['content'] = response.xpath('//td/p/text()').extract()
    if item['title'] and item['content']:
        item['come_from'] = item['come_from'].split('来源:')[1]
        item['time'] = item['time'].split('\r\n')[1]
        item['url'] = response.url
        item['content'] = ''.join(item['content']).replace(
            '\u3000', u' ').replace('\xa0', u' ').replace('\n', '').replace(
                '\u2002', '').replace('\t', '').replace('\r', '').strip()
        m = hashlib.md5()
        m.update(str(item['url']).encode('utf8'))
        item['article_id'] = m.hexdigest()
        item['media'] = '环渤海新闻网'
        item['create_time'] = datetime.datetime.now().strftime(
            '%Y/%m/%d %H:%M:%S')
        item['comm_num'] = "0"
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['media_type'] = '网媒'
        item['addr_province'] = '河北省'
        item['addr_city'] = '唐山'
        print('环渤海新闻网' * 100)
        yield item
def get_detail(self, response):
    print(response.url, 'response url')
    item = SomenewItem()
    item['title'] = response.xpath(
        '/html/body/div[4]/h1/text()').extract_first()
    # time and come_from read the same header line; it is split apart below.
    item['time'] = response.xpath(
        '/html/body/div[4]/div/div/text()').extract()[0]
    item['content'] = response.xpath('//*[@id="rwb_zw"]/p/text()').extract()
    item['come_from'] = response.xpath(
        '/html/body/div[4]/div/div/text()').extract()[0]
    if item['title'] and item['content']:
        item['time'] = item['time'].split('\xa0\xa0')[0].strip().split(
            '\r\n')[0][0:19]
        print(len(item['time']))
        item['come_from'] = item['come_from'].split(
            '\xa0\xa0')[1].strip().split('来源:')[1]
        item['url'] = response.url
        item['content'] = ''.join(item['content']).replace(
            '\u3000', u' ').replace('\xa0', u' ').replace('\n', '').replace(
                '\u2002', '').replace('\t', '').replace('\r', '').strip()
        m = hashlib.md5()
        m.update(str(item['url']).encode('utf8'))
        item['article_id'] = m.hexdigest()
        item['media'] = '邢台网'
        item['create_time'] = datetime.datetime.now().strftime(
            '%Y/%m/%d %H:%M:%S')
        item['comm_num'] = "0"
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['media_type'] = '网媒'
        item['addr_province'] = '河北省'
        item['addr_city'] = '邢台'
        print('邢台网' * 100)
        print(item)
        yield item
def get_detail(self, response):
    print(response.url, 'response url')
    item = SomenewItem()
    item['title'] = response.xpath('//div[1]/h1/text()').extract_first()
    item['time'] = response.xpath(
        '//div[2]/div[1]/div/span[2]/text()').extract_first()
    item['come_from'] = response.xpath(
        '//div[2]/div[1]/div/span[1]/text()').extract()
    item['content'] = response.xpath(
        '//div[1]/div[2]/div[2]/p/text()').extract()
    if item['title'] and item['content']:
        item['come_from'] = item['come_from'][0].split('来源:')[1]
        item['url'] = response.url
        item['content'] = ''.join(item['content']).replace(
            '\u3000', u' ').replace('\xa0', u' ').replace('\n', '').replace(
                '\u2002', '').replace('\t', '').replace('\r', '').strip()
        m = hashlib.md5()
        m.update(str(item['url']).encode('utf8'))
        item['article_id'] = m.hexdigest()
        item['media'] = '河北青年报'
        item['create_time'] = datetime.datetime.now().strftime(
            '%Y/%m/%d %H:%M:%S')
        item['comm_num'] = "0"
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['media_type'] = '网媒'
        item['addr_province'] = '河北省'
        print('河北青年报' * 100)
        yield item  # the original dropped this line, so the item was never emitted
def get_detail(self, response):
    print(response.url, 'response url')
    item = SomenewItem()
    # extract_first() replaces a bare try/except around extract()[0]; the old
    # version could leave item['title'] unset and raise KeyError below.
    title = response.xpath('//div[1]/h2/text()').extract_first()
    item['title'] = title.strip() if title else None
    item['time'] = response.xpath('//div[@class="info"]/text()').extract()
    item['come_from'] = '石家庄新闻网'
    item['content'] = response.xpath(
        '/html/body/div/div[5]/div[1]/div[1]/div[5]/p/text()'
        '|//div[1]/div/div/p/text()').extract()
    if item['title'] and item['content']:
        # Keep the first fragment that carries a full date-time stamp.
        for i in item['time']:
            data = re.findall(r'(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})', i)
            if data:
                item['time'] = data[0]
        item['url'] = response.url
        item['content'] = ''.join(item['content']).replace(
            '\u3000', u' ').replace('\xa0', u' ').replace('\n', '').replace(
                '\u2002', '').replace('\t', '').replace('\r', '').strip()
        m = hashlib.md5()
        m.update(str(item['url']).encode('utf8'))
        item['article_id'] = m.hexdigest()
        item['media'] = '石家庄新闻网'
        item['create_time'] = datetime.datetime.now().strftime(
            '%Y/%m/%d %H:%M:%S')
        item['comm_num'] = "0"
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['media_type'] = '网媒'
        item['addr_province'] = '河北省'
        item['addr_city'] = '石家庄'  # the original set the province name here by mistake
        print('石家庄新闻网' * 100)
        yield item
def parse(self, response):
    # bjnews.com.cn: the upvote count sits behind a small web API.
    data_id = response.xpath(
        "//div[@class='attitude']/span/@data-id").extract_first()
    upnum_url = 'http://www.bjnews.com.cn/webapi/getupnum?id=' + str(data_id)
    item = SomenewItem()
    item['article_id'] = get_md5(response.url)
    yield scrapy.Request(upnum_url,
                         callback=self.get_comment_num,
                         meta={'item': item})
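# The get_comment_num callback is not shown in this section; the same pattern
# also serves the similar count-fetching spiders below. A minimal sketch,
# assuming getupnum returns JSON; the 'data'/'num' shape is a guess.
def get_comment_num(self, response):
    item = response.meta['item']
    try:
        data = json.loads(response.text)
        item['fav_num'] = int(data.get('data', {}).get('num', 0))  # assumed shape
    except ValueError:
        item['fav_num'] = 0
    item['comm_num'] = 0
    item['read_num'] = 0
    item['env_num'] = 0
    item['hot_value'] = int(item['fav_num']) + int(item['comm_num'])
    yield item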
def parse(self, response): content = re.search("getcomments:'(.*?)'", response.text, re.S) url = content.group(1) comment_url = 'http:' + url item = SomenewItem() item['article_id'] = get_md5(response.url) yield scrapy.Request(comment_url, callback=self.get_comment_num, meta={'item': item})
def parse(self, response):
    item = SomenewItem()
    url = str(response.url)
    title = response.xpath("//div[@class='l_a']/h1/text()").extract_first()
    item['article_id'] = get_md5(url)
    sourceid = response.xpath(
        "//meta[@name='contentid']/@content").extract_first()
    com_url = ('https://commentn.huanqiu.com/api/v2/async?a=comment'
               '&m=source_info&appid=e8fcff106c8f'
               '&sourceid=' + sourceid + '&url=' + url + '&title=' + title)
    yield scrapy.Request(com_url,
                         callback=self.get_com_num,
                         meta={'item': item})
def parse(self, response):
    # The 163.com page embeds productKey/docId in an inline script; cut that
    # span out and parse it as a dict literal.
    content = re.findall(r'("productKey".*)?"target"', response.text, re.S)
    content = ''.join(content).replace('\n', '').replace(' ', '')
    con = "{" + content + "}"
    # ast.literal_eval replaces the original eval(): it parses the same
    # literal, e.g. {'productKey': '...', 'docId': '...'}, without executing
    # arbitrary code.
    con = ast.literal_eval(con)
    item = SomenewItem()
    comment_url = ('https://comment.api.163.com/api/v1/products/{productKey}'
                   '/threads/{docId}?ibc=jssdk').format(
                       productKey=con['productKey'], docId=con['docId'])
    item['article_id'] = get_md5(response.url)
    yield scrapy.Request(url=comment_url,
                         callback=self.get_comment_num,
                         meta={'item': item})
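# Sketch of get_comment_num for the NetEase threads API (the callback is not
# shown in this section). The 'tcount' field name is an assumption about the
# JSON this endpoint returns, not confirmed by the source.
def get_comment_num(self, response):
    item = response.meta['item']
    try:
        data = json.loads(response.text)
        item['comm_num'] = int(data.get('tcount', 0))  # assumed field name
    except ValueError:
        item['comm_num'] = 0
    item['read_num'] = 0
    item['fav_num'] = 0
    item['env_num'] = 0
    item['hot_value'] = int(item['comm_num'])
    yield item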
def parse(self, response):
    full_text = response.body.decode()
    # print(full_text)
    item = SomenewItem()
    comm_re = re.search(r'comments_count: (\d+)', full_text, re.S)
    # int() keeps comm_num and hot_value numerically consistent; the original
    # mixed a string group with the integer fallback.
    item['comm_num'] = int(comm_re.group(1)) if comm_re else 0
    item['read_num'] = 0
    item['fav_num'] = 0
    item['env_num'] = 0
    item['hot_value'] = item['comm_num']
    item['article_id'] = get_md5(response.url)
    # print(item)
    yield item
def get_detail(self, response):
    print(response.url, 'response url')
    item = SomenewItem()
    item['title'] = response.xpath(
        '/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]'
        '/tr[1]/td/div/h2/text()').extract_first()
    # time and come_from read the same header row; they are split apart below.
    item['time'] = response.xpath(
        '/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]'
        '/tr[2]/td/div/text()').extract()
    item['come_from'] = response.xpath(
        '/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]'
        '/tr[2]/td/div/text()').extract()
    item['content'] = response.xpath(
        '/html/body/div[1]/table/tr/td/table/tr/td/table/tr/td/p/span/text()'
        '|/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]/tr[3]/td/text()'
        '|/html/body/div[1]/table/tr/td[1]/table/tr/td/table[2]/tr[3]/td/div/text()'
    ).extract()
    if item['title'] and item['content']:
        item['come_from'] = item['come_from'][2]
        for node in item['time']:
            data = re.findall(r'(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})',
                              node)
            if data:
                item['time'] = data[0]
        item['url'] = response.url
        item['content'] = ''.join(item['content']).replace(
            '\u3000', u' ').replace('\xa0', u' ').replace('\n', '').replace(
                '\u2002', '').replace('\t', '').replace('\r', '').strip()
        m = hashlib.md5()
        m.update(str(item['url']).encode('utf8'))
        item['article_id'] = m.hexdigest()
        item['media'] = '衡水新闻网'
        item['create_time'] = datetime.datetime.now().strftime(
            '%Y/%m/%d %H:%M:%S')
        item['comm_num'] = "0"
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['media_type'] = '网媒'
        item['addr_province'] = '河北省'
        item['addr_city'] = '衡水'
        print('衡水新闻网' * 100)
        yield item
        print(item)
def get_detail(self, response):
    print(response.url, 'response url')
    item = SomenewItem()
    item['title'] = response.xpath('//h1/text()').extract_first()
    item['time'] = response.xpath(
        '//div[@class="post_source"]/text()').extract()
    item['come_from'] = response.xpath(
        '//div[@class="post_source"]/text()[2]'
        '|//div[@class="post_source"]/a/text()').extract()
    item['content'] = response.xpath('//p/text()').extract()
    if item['title'] and item['content']:
        # The source name comes back in either one or two text fragments.
        if len(item['come_from']) == 1:
            item['come_from'] = item['come_from'][0].split(
                '\r\n')[1].split('\n')[0]
        elif len(item['come_from']) == 2:
            item['come_from'] = item['come_from'][1]
        for node in item['time']:
            data = re.findall(r'(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})',
                              node)
            if data:
                item['time'] = data[0]
        item['url'] = response.url
        item['content'] = ''.join(item['content']).replace(
            '\u3000', u' ').replace('\xa0', u' ').replace('\n', '').replace(
                '\u2002', '').replace('\t', '').replace('\r', '').strip()
        m = hashlib.md5()
        m.update(str(item['url']).encode('utf8'))
        item['article_id'] = m.hexdigest()
        item['media'] = '长城网'
        item['create_time'] = datetime.datetime.now().strftime(
            '%Y/%m/%d %H:%M:%S')
        item['comm_num'] = "0"
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['media_type'] = '网媒'
        item['addr_province'] = '河北省'
        print('长城网' * 100)  # the original printed '河北新闻网' here; media is 长城网
        yield item
def parse(self, response):
    item = SomenewItem()
    url = response.url
    item['article_id'] = get_md5(url)
    com_parm = response.xpath(
        "//meta[@name='sudameta'][2]/@content").extract_first()
    if com_parm == 'sinaog:0':
        com_parm = response.xpath(
            "//meta[@name='sudameta'][1]/@content").extract_first()
    # The meta content is a ';'-separated list of 'key:value' pairs;
    # split(':', 1) keeps values that themselves contain a colon intact.
    com_parm_dic = {
        i.split(':', 1)[0]: i.split(':', 1)[1]
        for i in com_parm.split(';')
    }
    com_url = ('http://comment5.news.sina.com.cn/page/info?version=1'
               '&format=json&channel=' + com_parm_dic['comment_channel'] +
               '&newsid=' + com_parm_dic['comment_id'] +
               '&group=undefined&compress=0&ie=utf-8')
    yield scrapy.Request(com_url,
                         callback=self.get_com_num,
                         meta={"item": item})
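# Sketch of get_com_num for the Sina comment API (the callback is not shown
# in this section). The result -> count -> total path is an assumption about
# the JSON shape of this endpoint.
def get_com_num(self, response):
    item = response.meta['item']
    try:
        data = json.loads(response.text)
        item['comm_num'] = int(data['result']['count']['total'])  # assumed path
    except (ValueError, KeyError):
        item['comm_num'] = 0
    item['read_num'] = 0
    item['fav_num'] = 0
    item['env_num'] = 0
    item['hot_value'] = int(item['comm_num'])
    yield item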
def parse(self, response):
    item = SomenewItem()
    html = response.body.decode()
    # Raw strings replace the original escaped patterns; the regexes are
    # unchanged and still match '"<field>": <digits>'.
    comm_num_re = re.search(r'"comments_count": (\d+)', html, re.S)
    item['comm_num'] = comm_num_re.group(1) if comm_num_re else 0
    env_num_re = re.search(r'"reposts_count": (\d+)', html, re.S)
    item['env_num'] = env_num_re.group(1) if env_num_re else 0
    fav_num_re = re.search(r'"attitudes_count": (\d+)', html, re.S)
    item['fav_num'] = fav_num_re.group(1) if fav_num_re else 0
    item['read_num'] = 0
    item['hot_value'] = int(item['comm_num']) + int(item['env_num']) + int(
        item['fav_num'])
    item['article_id'] = get_md5(response.url)
    yield item
def get_detail(self, response):
    print(response.url, 'response url')
    item = SomenewItem()
    item['title'] = response.xpath('//h1/text()').extract_first()
    # The original bare excepts are narrowed to the errors they actually guard.
    try:
        item['time'] = response.xpath(
            '//div[@class="post_source"]/text()').extract()[0]
    except IndexError:
        pass
    item['content'] = response.xpath(
        '//div[@class="text"]/p/text()|//*[@id="content"]/p/span/text()'
        '|//*[@id="content"]/span/span/p/text()').extract()
    item['come_from'] = response.xpath(
        '//div[@class="post_source"]/a/text()'
        '|//div[@class="g_width content"]/div[1]/text()').extract_first()
    if item['title'] and item['content']:
        item['come_from'] = item['come_from'].split('来源:')[1].split('\n')[0]
        try:
            item['time'] = item['time'].split('\u3000')[0].split('\n')[1]
        except (IndexError, KeyError):
            pass
        item['url'] = response.url
        item['content'] = ''.join(item['content']).replace(
            '\u3000', u' ').replace('\xa0', u' ').replace('\n', '').replace(
                '\u2002', '').replace('\t', '').replace('\r', '').strip()
        m = hashlib.md5()
        m.update(str(item['url']).encode('utf8'))
        item['article_id'] = m.hexdigest()
        item['media'] = '河北新闻网'
        item['create_time'] = datetime.datetime.now().strftime(
            '%Y/%m/%d %H:%M:%S')
        item['comm_num'] = "0"
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['media_type'] = '网媒'
        item['addr_province'] = '河北省'
        print('河北新闻网' * 100)
        yield item
def parse(self, response):
    item = SomenewItem()
    url = response.url
    item['article_id'] = get_md5(url)
    html = response.xpath(
        "//*[@id='Main-Article-QQ']/div/div[1]/div[2]/script/text()"
    ).extract_first()
    # If no cmt_id can be found, the comment count is 0.
    cmt_id = None
    if html:
        html = html.replace("\n", '').replace(' ', '')
        match = re.match(r'.*?cmt_id=(\d+).*', html)
        if match:  # guard added: the original called .group(1) on a possible None
            cmt_id = match.group(1)
    if cmt_id:
        com_url = 'https://coral.qq.com/article/' + cmt_id + '/commentnum'
        yield scrapy.Request(com_url,
                             callback=self.get_comm_num,
                             dont_filter=True,
                             meta={'item': item})
    else:
        item['comm_num'] = 0
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['hot_value'] = 0
        yield item
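# Sketch of get_comm_num for the Tencent coral commentnum endpoint (the
# callback is not shown in this section). The data -> commentnum path is an
# assumption about the JSON it returns.
def get_comm_num(self, response):
    item = response.meta['item']
    try:
        data = json.loads(response.text)
        item['comm_num'] = int(data['data']['commentnum'])  # assumed path
    except (ValueError, KeyError):
        item['comm_num'] = 0
    item['fav_num'] = '0'
    item['read_num'] = '0'
    item['env_num'] = '0'
    item['hot_value'] = int(item['comm_num'])
    yield item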