class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, redis_example): self.headers = { 'Content-Type': 'text/html; charset=utf-8', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'no-cache', # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; user_class=a; VipUINFO=luc%3Aa%7Csuc%3Aa%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495; visit_id=4C5B033907F8247A18F2811FF8D147F0; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564740333894.1564740386032.1564740333894.1564740386032.0.0.0.24.24; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af', 'Host': 'category.vip.com', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } # 时间部分,按小时抓取 date_time = str(datetime.now() - timedelta(days=1)).split('.')[0] start_time_test = time.strftime('%Y-%m-%d 00:00:00') end_time = time.strftime('%Y-%m-%d %H:%M:%S') a = end_time.split(' ')[1].split(':')[0] if a == '00': start_time_data = date_time hours_name = '22_24' wen_jian_jia_date = str(datetime.now() - timedelta( days=1)).split('.')[0].split(' ')[0].replace('-', '') else: two_hours_ago = int(a) - 2 if len(str(two_hours_ago)) == 1: two_hour_ago = '0' + str(two_hours_ago) else: two_hour_ago = str(two_hours_ago) hours_name = str(two_hour_ago) + '_' + str(a) start_time_data = start_time_test wen_jian_jia_date = time.strftime('%Y%m%d') print('爬取时间段:{}到{}'.format(start_time_data, end_time)) logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = start_time_data # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = end_time # 标记爬虫工作 self.is_break = False self.redis_example = redis_example self.pid = os.getpid() self.h2_name = hours_name self.date_time = wen_jian_jia_date # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format( wen_jian_jia_date, hours_name)) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 时间戳转换时间 def time_change(self, data): # 替换抓取数据中的html标签 try: timeStamp = float(int(data) / 1000) timeArray = time.localtime(timeStamp) otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray) return otherStyleTime except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 获取评论量 def parse_comments_num(self, goods_dict): try: headers = { # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; 
vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; VipDFT=1; visit_id=2221152ECC2AD948DF7AB8D56322CE59; vipAc=cf3c0da6d5b52c0f6088b0148efbdb22; vipshop_passport_src=https%3A%2F%2Fdetail.vip.com%2Fdetail-1710618487-6918048587083491095.html; PASSPORT_ACCESS_TOKEN=1FDEBDAAF470FFB2C3C6A9EEAF7256FBA60D1F08; VipRUID=298018734; VipUID=0f94f94cc1ea26b39e78438380499d64; VipRNAME=152*****067; VipLID=0%7C1564973676%7C4b447f; VipDegree=D1; user_class=c; VipUINFO=luc%3Ac%7Csuc%3Ac%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; PHPSESSID=b9bnc95dlt7r4eg2r196td02i4; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495%2C6918048587083491095%2C6917922115290256471; VipCI_te=0%7C%7C1564974326; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564974076993.1564974326073.1564974076993.1564974326073.0.0.0.39.39; waitlist=%7B%22pollingId%22%3A%22F90BE7CF-3F21-4012-800F-E1F26000E5BF%22%2C%22pollingStamp%22%3A1564974516121%7D; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af', 'Host': 'detail.vip.com', 'Pragma': 'no-cache', 'Referer': goods_dict['url'], 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } url = 'https://detail.vip.com/v2/mapi?_path=rest/content/reputation/getCountBySpuId&spuId={}&brandId={}&app_name=shop_pc'.format( goods_dict['spuId'], goods_dict['brandId']) try: time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) achieve_num_data = json.loads(response.text)['data'] goods_dict['achieve_num'] = achieve_num_data if int(achieve_num_data) == 0: page_num = int( math.ceil(float((int(achieve_num_data) + 1) / 10))) # logger.log(31, '评论数是: %s , 评论页数是: %s ' % (goods_dict['achieve_num'], str(page_num))) # print(goods_dict) self.parse_comments(goods_dict, page_num) else: page_num = int(math.ceil(float(int(achieve_num_data) / 10))) # logger.log(31, '评论数是: %s , 评论页数是: %s ' % (goods_dict['achieve_num'], str(page_num))) # print(goods_dict) self.parse_comments(goods_dict, page_num) except: print(222222222222222222222222, traceback.format_exc()) # 抓取商品评论 def parse_comments(self, goods_dict, page_num): try: if page_num == 0: pass # logger.log(31, '0000000000000000没有商品评论信息000000000000000000') else: is_break = self.is_break headers = { # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; VipDFT=1; visit_id=2221152ECC2AD948DF7AB8D56322CE59; vipAc=cf3c0da6d5b52c0f6088b0148efbdb22; vipshop_passport_src=https%3A%2F%2Fdetail.vip.com%2Fdetail-1710618487-6918048587083491095.html; 
PASSPORT_ACCESS_TOKEN=1FDEBDAAF470FFB2C3C6A9EEAF7256FBA60D1F08; VipRUID=298018734; VipUID=0f94f94cc1ea26b39e78438380499d64; VipRNAME=152*****067; VipLID=0%7C1564973676%7C4b447f; VipDegree=D1; user_class=c; VipUINFO=luc%3Ac%7Csuc%3Ac%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; PHPSESSID=b9bnc95dlt7r4eg2r196td02i4; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495%2C6918048587083491095%2C6917922115290256471; VipCI_te=0%7C%7C1564974326; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564974076993.1564974326073.1564974076993.1564974326073.0.0.0.39.39; waitlist=%7B%22pollingId%22%3A%22F90BE7CF-3F21-4012-800F-E1F26000E5BF%22%2C%22pollingStamp%22%3A1564974516121%7D; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af', 'Host': 'detail.vip.com', 'Pragma': 'no-cache', 'Referer': goods_dict['url'], 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } for i in range(1, int(page_num) + 1): # logger.log(31, '*************************抓取评论第:%s 页' % i) url = 'https://detail.vip.com/v2/mapi?_path=rest/content/reputation/queryBySpuId&spuId={}&brandId={}&page={}&pageSize=10&app_name=shop_pc&keyWordNlp=%E6%9C%80%E6%96%B0'.format( goods_dict['spuId'], goods_dict['brandId'], i) try: time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # 商品评价列表 comments_list = json.loads(response.text)['data'] if int(len(comments_list)) == 0: break else: comment_dict = dict() for item in comments_list: date_data = self.time_change( item['reputation']['postTime']) # print(date_data) if self.start_time <= date_data: comment_dict['platform'] = goods_dict[ 'platform'] comment_dict['date'] = date_data.split(' ')[0] comment_dict['time'] = date_data.split(' ')[1] comment_dict['keyword'] = goods_dict['keyword'] comment_dict['name'] = goods_dict['name'] comment_dict['imageurl'] = goods_dict['商品图片'] comment_dict['audiourl'] = '' comment_dict['url'] = goods_dict['url'] comment_dict['shop_name'] = '' comment_dict['user_name'] = item[ 'reputationUser']['authorName'] comment_dict['author_id'] = str( item['reputationUser']['userIdentity']) comment_dict['content'] = item['reputation'][ 'content'] comment_dict['content_id'] = str( item['reputation']['reputationId']) comment_dict['brand'] = goods_dict['brand'] comment_dict['price'] = goods_dict['price'] comment_dict['sales'] = goods_dict['sales'] comment_dict['focus_count'] = '' comment_dict['comment_num'] = goods_dict[ 'achieve_num'] comment_dict['views'] = '' comment_dict['likes'] = '' comment_dict['comments_count'] = '' comment_dict['reposts_count'] = '' comment_dict['topic_id'] = str( goods_dict['url'].split('-')[2].replace( '.html', '')) try: comment_dict['type'] = item[ 'reputationProduct']['colorInfo'] except: comment_dict['type'] = '' try: comment_dict['size'] = item[ 'reputationProduct']['size'] except: comment_dict['size'] = '' comment_dict['file_code'] = '179' # logger.log(31, '---------------正在写入符合时间的商品评论---------------------') # print(comment_dict) # self.write_Nike_jsonfile(comment_dict) item = json.dumps(dict(comment_dict), ensure_ascii=False) + '\n' self.hdfsclient.new_write( 
'/user/cspider_daily/nike_2h/ecommerce/{}/{}/179_{}_WPH_nike{}.json' .format(self.date_time, self.h2_name, time.strftime('%Y%m%d'), self.pid), item, encoding='utf-8') if self.start_time > date_data.split( ' ')[0].strip(): is_break = True if is_break: break except: print(33333333333333333333, traceback.format_exc()) # def parse_xlsx(self): # # 设置路径 # path = './快消采集关键词_0916_v3-1.xlsx' # # 打开execl # workbook = xlrd.open_workbook(path) # # # 根据sheet索引或者名称获取sheet内容 # Data_sheet = workbook.sheets()[0] # 通过索引获取 # # # print(Data_sheet.name) # 获取sheet名称 # rowNum = Data_sheet.nrows # sheet行数 # colNum = Data_sheet.ncols # sheet列数 # # # 获取所有单元格的内容 # list = [] # for i in range(rowNum): # rowlist = [] # for j in range(colNum): # rowlist.append(Data_sheet.cell_value(i, j)) # list.append(rowlist) # # for data in list[1::]: # brand = data[0] # # print(brand) # yield { # '关键词': brand, # } def run(self, lock): for num in range(1000000): lock.acquire() redis_url_num = self.redis_example.llen('WPH_nike_url') if str(redis_url_num) == '0': print( '**********Redis消息队列中url为空.....进程 {} 抓取结束.....***********'. format(str(os.getpid()))) item = self.redis_example.brpop('WPH_nike_url', timeout=3600)[1] lock.release() item1 = json.loads(item.decode()) # print(item) self.parse_comments_num(item1)
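    # --- Illustrative sketch (not part of the original spider) ---
    # The __init__ above derives a two-hour window name such as "22_24" or
    # "08_10" plus a folder date for the HDFS path. A minimal, self-contained
    # version of that logic, assuming the same conventions; the function name
    # `two_hour_window` is hypothetical.
    # from datetime import datetime, timedelta
    #
    # def two_hour_window(now=None):
    #     """Return (start_time, end_time, window_name, folder_date) for the current slot."""
    #     now = now or datetime.now()
    #     end_time = now.strftime('%Y-%m-%d %H:%M:%S')
    #     hour = now.strftime('%H')
    #     if hour == '00':
    #         # just after midnight: the slot belongs to yesterday's 22_24 window
    #         start_time = str(now - timedelta(days=1)).split('.')[0]
    #         window_name = '22_24'
    #         folder_date = (now - timedelta(days=1)).strftime('%Y%m%d')
    #     else:
    #         start_time = now.strftime('%Y-%m-%d 00:00:00')
    #         window_name = '{:02d}_{}'.format(int(hour) - 2, hour)
    #         folder_date = now.strftime('%Y%m%d')
    #     return start_time, end_time, window_name, folder_date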
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self): # 时间部分 # 爬虫开始抓取的日期 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] # 爬虫结束的抓取日期 current_time = datetime.now() # 当前日期 current_day = str(current_time).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, current_day)) logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = current_day # 标记爬虫工作 self.is_break = False # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/ecommerce/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\n', '', ret1) ret3 = re.sub(r'\u3000', '', ret2) ret4 = re.sub(r'品牌:', '', ret3) ret5 = re.sub(r'\xa0', '', ret4) ret6 = re.sub(r'→_→', '', ret5) ret7 = re.sub(r'…', '', ret6) ret8 = re.sub(r'https:', '', ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 过滤商品价格 def re_price(self, data): try: message = str(data) ret1 = re.sub(r'pcData\(', '', message) ret2 = re.sub(r'\)', '', ret1) return ret2 except: pass # 过滤商品品牌信息 def re_brand(self, data): try: message = str(data) ret1 = re.sub(r'"brandName":', '', message) ret2 = re.sub(r'&', '', ret1) ret3 = re.sub(r'"', '', ret2) return ret3 except: pass # 根据关键词搜索请求得到商品信息 def parse_url(self, data): # 创建一个字典接收数据 goods_dict = dict() goods_dict['平台'] = data['平台'] goods_dict['关键词'] = data['关键词'] goods_dict['URL'] = data['URL'] goods_dict['商品图片'] = data['商品图片'] goods_dict['商品名'] = data['商品名'] goods_dict['shop_name'] = data['shop_name'] goods_dict['品牌'] = data['品牌'] goods_dict['月销量'] = data['月销量'] goods_dict['价格'] = data['价格'] goods_dict['评论人数'] = data['评论人数'] # logger.log(31, '--------********正在抓取的商品是:%s********--------' % goods_dict) self.parse_goods_details(goods_dict) # 解析商品品牌信息 def parse_goods_details(self, goods_dict): try: headers = { 'Content-Type': 'text/html;charset=utf-8', # 'Connection': 'keep-alive', # 'Cookie': 'SN_SESSION_ID=c55ac35a-f7d1-4b0c-b48a-f88e8bb896f4; useMp4=1.701108; _snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; _snsr=direct%7Cdirect%7C%7C%7C; _snzwt=THiw3Z16a429d6f24nzVa227f; _snmc=1; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; authId=si0BE64747CDCB0EC1B819BB87E6D52FC1; secureToken=E180078268FCC770B6CFC47BFC919E55; _snms=155592217017833779; smhst=651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000a861276981|0000000000a11028136288|0070705161a11002911104|0070756234a101822780|0000000000; _snma=1%7C15553832315961909%7C1555383231596%7C1555923318059%7C1555923324804%7C140%7C9; _snmp=155592332389716467; _snmb=155591411681863515%7C1555923324825%7C1555923324807%7C37', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } # print(goods_dict) goods_url = goods_dict['URL'] # response = requests.get(url=goods_url, headers=headers, proxies=random.choice(proxies), timeout=10) try: time.sleep(0.2) response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except (requests.exceptions.ConnectionError, ConnectionResetError): try: time.sleep(0.2) response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except (requests.exceptions.ConnectionError, ConnectionResetError): time.sleep(0.2) response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # print(response.text) # print('正在抓取的页面是: %s' % goods_url) data = response.text # print(html) # 用正则截取价格和评论链接里需要的两串ID partNumber = re.search(r'"partNumber".*?,', data) vendorCode = re.search(r'"vendorCode".*?,', data) # print(partNumber.group(), vendorCode.group()) goods_dict['partNumber'] = self.re_not_number(partNumber.group()) goods_dict['vendorCode'] = self.re_not_number(vendorCode.group()) # print(goods_dict) if int(self.re_not_number(goods_dict['评论人数'])) == 0: logger.log(31, '-------------没有商品评论信息------------') else: # 获取评论页数 page_num = int( math.ceil( float( int(self.re_not_number(goods_dict['评论人数'])) / 10))) self.goods_comments(goods_dict, page_num) except: print(2222222222222222222, traceback.format_exc()) # 解析商品评论 def goods_comments(self, goods_dict, page_num): try: is_break = self.is_break partNumber = goods_dict['partNumber'] vendorCode = goods_dict['vendorCode'] headers = { 'Content-Type': 'application/javascript;charset=UTF-8', # 'Connection': 'keep-alive', # 'Cookie': '_snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; tradeMA=127; route=3798b42173574ff4536b1645bfa56286; _snzwt=THusFg16a66e65b60nBjXc7ab; _snsr=direct%7Cdirect%7C%7C%7C; _snmc=1; _snms=155652264991095847; authId=si07DE872B7B580CBB2CB11C7105B450A8; secureToken=5C8868551C3103287B59ADEDD6B90567; smhst=192279908|0000000000a600733096|0000000000a600479244|0000000000a10700388709|0070547159a651484540|0000000000a826233089|0000000000a10243606506|0000000000a101822738|0000000000a101822744|0000000000a160764310|0000000000a122819279|0000000000a651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000; _snma=1%7C15553832315961909%7C1555383231596%7C1556524706411%7C1556524786984%7C224%7C15; _snmp=155652478697968344; _snmb=155652102706620667%7C1556524786995%7C1556524786988%7C28', 'Host': 'review.suning.com', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } if int(page_num) >= 50: pages = 50 else: pages = page_num # 抓取商品评论链接(总共50页,第一页从1开始) for i in range(1, int(pages) + 1): comment_url = 'https://review.suning.com/ajax/cluster_review_lists/style--{}-{}-newest-{}-default-10-----reviewList.htm?callback=reviewList'.format( partNumber, vendorCode, 
i) # print(comment_url) try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except (requests.exceptions.ConnectionError, ConnectionResetError): try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except (requests.exceptions.ConnectionError, ConnectionResetError): time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) comment_data = response.text # print(comment_data) comment = re.search(r'{"commodityReviews":.*"reCloudDrill":0}', comment_data) # print(comment.group()) items = json.loads(comment.group())['commodityReviews'] if len(items) == 0: break else: goods_comment = dict() for item in items: # print(item) date_data = item['publishTime'].split(' ')[0] time_data = item['publishTime'].split(' ')[1] # print(date_data, time_data) try: content = self.re_html(item['content']) except: content = '' # 追加评论 try: content_add = item['againReview']['againContent'] except: content_add = '' # 判断评论时间是否在规定的抓取时间内 if self.start_time <= date_data.strip(): goods_comment['platform'] = goods_dict['平台'] goods_comment['date'] = date_data.strip() goods_comment['time'] = time_data.strip() goods_comment['keyword'] = goods_dict['关键词'] goods_comment['name'] = goods_dict['商品名'] goods_comment['imageurl'] = goods_dict['商品图片'] goods_comment['audiourl'] = '' goods_comment['url'] = goods_dict['URL'] goods_comment['shop_name'] = goods_dict[ 'shop_name'] goods_comment['user_name'] = item['userInfo'][ 'nickName'] goods_comment[ 'content'] = content + ';' + content_add goods_comment['content_id'] = str( item['commodityReviewId']) goods_comment['brand'] = goods_dict['品牌'] goods_comment['price'] = goods_dict['价格'] goods_comment['sales'] = goods_dict['月销量'] goods_comment['focus_count'] = '' goods_comment['comment_num'] = goods_dict['评论人数'] goods_comment['views'] = '' goods_comment['author_id'] = '' goods_comment['reposts_count'] = '' goods_comment['topic_id'] = str( goods_dict['URL'].split('/')[4].replace( '.html', '')) test_data = item['commodityInfo']['charaterId1'] if test_data == '尺码': goods_comment['type'] = item['commodityInfo'][ 'charaterDesc2'] goods_comment['size'] = item['commodityInfo'][ 'charaterDesc1'] else: goods_comment['type'] = item['commodityInfo'][ 'charaterDesc1'] goods_comment['size'] = item['commodityInfo'][ 'charaterDesc2'] self.likes_comments(goods_comment) if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(3333333333333333333, traceback.format_exc()) # 解析商品评论的点赞数和回复数 def likes_comments(self, goods_comment): try: comment_id = goods_comment['content_id'] url = 'https://review.suning.com/ajax/useful_count/635960739-usefulCnt.htm'.format( comment_id) headers = { 'Content-Type': 'application/javascript;charset=UTF-8', # 'Cookie': 'tradeMA=55; _snvd=1565067528273QvL8ia7lwZC; SN_CITY=20_021_1000267_9264_01_12113_2_0; cityCode=021; districtId=12113; cityId=9264; hm_guid=ca34f536-186e-4619-aa8f-6c8808ee39a6; _df_ud=e64b917e-c77c-46e0-9d10-d84c86c93f3a; _device_session_id=p_806c72c6-6fa6-462d-bf88-f660c7094c1a; _cp_dt=21f7906b-c341-404f-996b-4d4f2e32e4af-70039; route=e46977517568f7cad53fbfe19eaf4774; _snmc=1; _snsr=direct%7Cdirect%7C%7C%7C; authId=siC700F4CB8ABB1C2E87F1FA1E9650CF7A; secureToken=F9331FD98F503CE8898949382003910A; _snzwt=THs64g16ce02abb69OAUS9a89; _snms=156712934067680848; 
smhst=690105206|0000000000a10118749983|0000000000a10689501376|0070222946a10949954840|0000000000a10966209249|0000000000a10757523126|0000000000a10620476914|0000000000a11180422688|0000000000a10966225829|0000000000a769909849|0070230352a10580507394|0070222946a826193435|0000000000a10163182478|0000000000a10964625880|0000000000a10571100966|0070074453; _snma=1%7C156506752678869586%7C1565067526788%7C1567129356201%7C1567129676548%7C137%7C12; _snmp=156712967506243164; _snmb=156712899210934272%7C1567129676573%7C1567129676552%7C8', 'Host': 'review.suning.com', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # print(response.text) likes_comments_data = json.loads( response.text.replace('usefulCnt(', '').replace(')', '')) goods_comment['likes'] = likes_comments_data[ 'reviewUsefuAndReplylList'][0]['usefulCount'] goods_comment['comments_count'] = likes_comments_data[ 'reviewUsefuAndReplylList'][0]['replyCount'] goods_comment['file_code'] = '53' # logger.log(31, '****-------正在写入符合时间的商品评论信息-------****') item = json.dumps(dict(goods_comment), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/ecommerce/{}/53_{}_{}_Suning_nike_1.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.time_data), item, encoding='utf-8') except: print(4444444444444444444444, traceback.format_exc()) def run(self): f = open('./{}_suning_shop_img_url.json'.format('20200407'), 'r', encoding='utf-8') goods_data_list = [] for line in f.readlines(): dic_data = json.loads(line) goods_data_list.append(dic_data) for data in goods_data_list: self.parse_url(data)
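    # --- Illustrative sketch (not part of the original spider) ---
    # The spiders above repeat the same nested try/except around requests.get
    # three levels deep. A small helper with a retry loop expresses the same
    # behaviour once; the name `get_with_retry` is hypothetical, and `proxies`
    # is assumed to be the module-level proxy dict the original code already uses.
    # import time
    # import requests
    #
    # def get_with_retry(url, headers, proxies, retries=3, delay=0.2, **kwargs):
    #     """GET `url`, retrying up to `retries` times on connection errors."""
    #     last_exc = None
    #     for _ in range(retries):
    #         try:
    #             time.sleep(delay)
    #             return requests.get(url=url, headers=headers, proxies=proxies,
    #                                 allow_redirects=False, timeout=30, **kwargs)
    #         except (requests.exceptions.RequestException, ConnectionResetError) as exc:
    #             last_exc = exc
    #     raise last_exc
    #
    # # usage (hypothetical): response = get_with_retry(comment_url, headers, proxies)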
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, file_path, comment_path): self.headers_one = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } self.start_url = '' # 评论接口模板 self.commnet_port_url = '' # # 时间判断部分 date = datetime.now() - timedelta(days=2) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=0) # 昨天时间 yesterday = str(yesterday).split(' ')[0] # print('爬取时间段:{}到{}'.format(news_start_time, yesterday)) # # logging.info('爬取时间段:{}到{}'.format(news_start_time, yesterday)) # # # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # self.start_time = '2010-03-20' # # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = yesterday # self.end_time = '2019-12-09 14:08' # # 标记爬虫工作 # get_now_time = time.time() - 86400 # 一天或者三小时 的秒数 # # get_now_time = time.time() - 8640000 # 一百天 # print(get_now_time) # time_local = time.localtime(float(get_now_time)) # # 转换成新的时间格式(2016-05-05 20:28:54) # dt = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" # print(dt) # end_t = time.time() # print(end_t) # time_local = time.localtime(float(end_t)) # # 转换成新的时间格式(2016-05-05 20:28:54) # end_dt = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" # print(end_dt) # # end_time = str(end_time).split(' ')[0] # print('爬取时间段:{}到{}'.format(dt, end_dt)) # logging.info('爬取时间段:{}到{}'.format(dt, str(datetime.now()))) # 定义开始时间 y-m-d 离现在时间远 # self.start_time = dt # self.start_time = '2019-09-09 00:22' # 定义结束时间 y-m-d 离现在时间近 # self.end_time = end_dt # self.end_time = '2019-09-16 10:22' self.is_work = True self.xhsapi = XhsApi('8ac1d719cd0a2d16') # 代理服务器 proxyHost = "http-cla.abuyun.com" proxyPort = "9030" # 代理隧道验证信息 proxyUser = "******" proxyPass = "******" proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": proxyHost, "port": proxyPort, "user": proxyUser, "pass": proxyPass, } self.proxies = {"http": proxyMeta, "https": proxyMeta} self.set_list = [] self.info = seeeion_id_list # try: # os.mkdir('./json_file/{}'.format(str(datetime.now()).split(' ')[0])) # except: # pass with open('./session_id_list_topic.json') as f: session_id = f.read() self.session_id_list = eval(session_id) self.session_id_error = [] self.error_count = 0 self.file_path = file_path self.comment_path = comment_path self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs('{}/{}'.format( self.file_path, str(datetime.now()).split(' ')[0].replace('-', ''))) # 创建每日文件夹 self.hdfsclient.makedirs('{}/{}'.format( self.comment_path, str(datetime.now()).split(' ')[0].replace('-', ''))) # 创建每日文件夹 self.time_time = str(time.time()).split('.')[0] # def get_session_id(self): # register_smid_ret = self.xhsapi.register_smid_proxy(self.ip) # print('register_smid_ret:' + register_smid_ret) # smid = json.loads(register_smid_ret)['detail']['deviceId'] # print('smid:' + smid) # self.xhsapi.set_smid(smid) # # 激活用户 # active_user_ret = self.xhsapi.active_user_proxy(self.ip) # print('active_user_ret:' + active_user_ret) # # 设置session id # session_id = json.loads(active_user_ret)['data']['session'] # print('session_id:' + session_id) # item = { # 'deviceId': "abbd5bf5-3a82-3fcd-b8b8-4e4c48f68950", # 'device_fingerprint': "201908191457046c8b8bd154ae84d8f7c9f8e912c573870183341147f781ee", # 'device_fingerprint1': "201908191457046c8b8bd154ae84d8f7c9f8e912c573870183341147f781ee", # 'sid': "session.1566198308579055731492", # 'search_id': 
"A9F65F9019EF946464D38BF16C0E250A", # } # item['device_fingerprint'] = smid # item['device_fingerprint1'] = smid # item['sid'] = "session." + session_id # print(item) def get_sid(self): register_smid_ret = self.xhsapi.register_smid() print('register_smid_ret:' + register_smid_ret) smid = json.loads(register_smid_ret)['detail']['deviceId'] print('smid:' + smid) self.xhsapi.set_smid(smid) # 激活用户 active_user_ret = self.xhsapi.active_user() print('active_user_ret:' + active_user_ret) # 设置session id session_id = json.loads(active_user_ret)['data']['session'] print('session_id:' + session_id) return smid, session_id def change_ip(self): logger.log(31, '开始切换ip') url = 'http://proxy.abuyun.com/switch-ip' time.sleep(random.randint(1, 15)) response = requests.get(url, proxies=self.proxies) logger.log(31, '现使用ip:' + response.text) def res_ip(self): headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': 'webapi.http.zhimacangku.com', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', } # 5-25分 500个ip import time time.sleep(3) url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=310000&city=0&yys=0&port=1&time=2&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1®ions=' ip_pro = requests.get(url, headers=headers) # print(ip_pro.text) # ip_data = json.loads(ip_pro.text) ip = ip_pro.text.strip() # ip = str(ip_data['data'][0]['ip']) + ':' + str(ip_data['data'][0]['port']) return ip def get_serach_list(self, page, keyword): info = random.choice(self.session_id_list) # info = self.sid_info print(self.session_id_list.index(info)) parms = { 'keyword': keyword, 'platform': 'android', 'filters': '', # 'sort': '', # 综合 排序 # 'sort': 'popularity_descending', # 最热 排序 'sort': 'time_descending', # 最新 排序 'page': page, 'page_size': '20', 'source': 'explore_feed', # 'search_id': info['search_id'], 'api_extra': '', 'deviceId': info['deviceId'], 'device_fingerprint': info['device_fingerprint'], 'device_fingerprint1': info['device_fingerprint1'], 'versionName': '5.35.1', 'channel': 'YingYongBao', 'sid': info['sid'], 'lang': 'zh', 't': str(round(time.time())), } url = 'https://www.xiaohongshu.com/api/sns/v9/search/notes' for i in range(10): res = self.xhsapi.get_sign(url, parms) print(1111, res) if len(res['shield']) == 32: break res = self.xhsapi.get_sign(url, parms) print(res['sign']) parms['sign'] = res['sign'] headers = { # 'authorization': info['sid'], # 'device_id': info['deviceId'], 'user-agent': 'Dalvik/2.1.0 (Linux; U; Android 6.0; DIG-AL00 Build/HUAWEIDIG-AL00) Resolution/720*1280 Version/6.8.0.3 Build/6080103 Device/(HUAWEI;DIG-AL00) NetType/WiFi', 'shield': res['shield'], 'Host': 'www.xiaohongshu.com', 'accept-encoding': 'gzip', 'Connection': 'Keep-Alive', } response = requests.get(url, params=parms, headers=headers) print(response.url) if '"result":0' in response.text and 'msg:' in response.text: del self.session_id_list[self.session_id_list.index(info)] return json_text = json.loads(response.text) print(json_text) note_list = json_text["data"]["notes"] for note in note_list: title = note["title"] if not title: title = note["desc"] id = note["id"] print(title) time.sleep(0.1) if id not in self.set_list: try: self.get_note(id, keyword) except: print(traceback.format_exc()) try: self.get_note(id, keyword) except: 
print(traceback.format_exc()) self.set_list.append(id) def get_note(self, note_id, keyword, index=0): info = random.choice(self.info) # info = self.sid_info logger.log(31, 'session_id下标: ' + str(self.info.index(info))) self.xhsapi.set_smid(info['device_fingerprint']) self.xhsapi.set_session_id(info['sid'].split('.')[-1]) note_ret = self.xhsapi.get_note(note_id) # print(333333, note_ret) if '参数错误' in note_ret: logger.log(31, '参数错误,重试.....') self.get_note(note_id, keyword, index) return # print(response.text) # if '"result":0' in response.text and 'msg:' in response.text: # logger.log(31, '无效id:', info) # del self.session_id_list[self.session_id_list.index(info)] # return if '{"msg":"","result":0,"success":true}' in note_ret: self.session_id_error.append(info) if self.session_id_error.count(info) > 5: logger.log(31, '无效id:' + str(info)) # del self.info[self.info.index(info)] if self.error_count > 5: self.change_ip() self.error_count = 0 self.error_count += 1 self.get_note(note_id, keyword, index) return json_text = json.loads(note_ret) # print(11111, json_text) data = json_text["data"][0]['note_list'][0] item = {} item['platform'] = '小红书' # print(222222, data) date_all = data['time'] time_local = time.localtime(float(date_all)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" logger.log(31, "时间: " + str(dt)) # # 做时间判断部分--------------- get_news_time = time.mktime( time.strptime(str(dt).split(' ')[0], "%Y-%m-%d")) end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d")) if self.start_time != '': start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d")) else: start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d")) if float(get_news_time) < float(start_time) and index > 1: logger.log(31, '不符合时间') # self.redis_example.sadd('xiaohongshu_out_day_url', note_id) self.is_work = False return elif float(start_time) <= float(get_news_time) <= float(end_time): logging.log(31, '符合时间') news_date = dt.split(' ')[0] news_time = dt.split(' ')[1] item['date'] = news_date item['time'] = news_time title = data['share_info']["title"] item['title'] = title item['content'] = data["desc"] + '#今日份AJ女生' note_id = data["id"] item['content_id'] = note_id item['article_author'] = data["user"]["nickname"] item['clicks'] = '' item['views'] = data['view_count'] comments = data["comments_count"] item['comments_count'] = comments item['likes'] = data["liked_count"] item['dislikes'] = '' item['keyword'] = keyword article_url = data['share_info']["link"] item['article_url'] = article_url item['series_url'] = '' item['list_url'] = '' item['article_type'] = '' item['article_source'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] item['topic_id'] = note_id item['author_id'] = data["user"]["id"] item['file_code'] = '28' item['reposts_count'] = data['shared_count'] if data['topics']: item['topic'] = data['topics'][0]['name'] item['get_topic_id'] = data['topics'][0]['id'] item['get_topic_url'] = data['topics'][0]['link'] else: item['topic'] = '' item['get_topic_id'] = '' item['get_topic_url'] = '' # if '韩束' not in item['title'] and '韩束' not in item['content']: # print('检索文章没有包含关键词,判定不符合数据......') # return # else: # print('符合检索关键词的文章......') # print(item) logging.log(31, item) self.write_news_jsonfile(item) # self.queue.put(item) if int(comments) > 0: try: self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title) except: if '503 Service Temporarily' in 
traceback.format_exc( ) or 'requests.exceptions.SSLError' in traceback.format_exc( ): self.change_ip() logging.error(traceback.format_exc()) try: self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title) except: if '503 Service Temporarily' in traceback.format_exc( ) or 'requests.exceptions.SSLError' in traceback.format_exc( ): self.change_ip() logging.error(traceback.format_exc()) # @retry(stop_max_attempt_number=2, retry_on_exception=retry_if_key_error) def get_note_comment(self, note_id, keyword, article_url, news_date, news_time, title, start='', now_page=1): if start: response = self.xhsapi.get_note_comments(note_id, 20, start) else: response = self.xhsapi.get_note_comments(note_id, 20) # if '"result":0' in response.text and 'msg:' in response.text: # del self.session_id_list[self.session_id_list.index(s)] # return data = json.loads(response) # print(data) try: comment_list = data['data']["comments"] except: logging.log(31, data) logging.error(traceback.format_exc()) return comment_count = data['data']["comment_count_l1"] last_comment_id = '' # total_item = '' for comment in comment_list: item = {} item['platform'] = '小红书' item['source_date'] = news_date item['source_time'] = news_time date_all = comment['time'] # #转换成localtime time_local = time.localtime(float(date_all)) # 转换成新的时间格式(2016-05-05 20:28:54) comment_date = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" # # 做时间判断部分--------------- # get_news_time = time.mktime(time.strptime(str(comment_date), "%Y-%m-%d %H:%M")) # # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d %H:%M")) # if self.start_time != '': # start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d %H:%M")) # else: # start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d %H:%M")) # if float(get_news_time) < float(start_time): # self.is_work = False # return # # if float(start_time) <= float(get_news_time): get_news_time = time.mktime( time.strptime(str(comment_date).split(' ')[0], "%Y-%m-%d")) end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d")) if self.start_time != '': start_time = time.mktime( time.strptime(self.start_time, "%Y-%m-%d")) else: start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d")) if float(get_news_time) < float(start_time): self.is_get_comment = False # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取, # break elif float(start_time) <= float(get_news_time) <= float(end_time): item['date'] = comment_date.split(' ')[0] item['time'] = comment_date.split(' ')[1] item['title'] = title item['author'] = comment['user']["nickname"] item['author_id'] = comment['user']["userid"] item['content'] = comment["content"] comment_id = comment["id"] last_comment_id = comment_id item['content_id'] = comment_id item['floor'] = '' item['keyword'] = keyword item['source_url'] = article_url item['comment_url'] = article_url item['views'] = '' item['comments_count'] = '' item['likes'] = comment["like_count"] item['dislikes'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] item['topic_id'] = note_id item['file_code'] = '42' item['reposts_count'] = '' # print(item) # print(11111111, item) # item = json.dumps(dict(item), ensure_ascii=False) + '\n' # total_item = total_item + item # self.comment_queue.put(item) self.write_comment_jsonfile(item) # self.comment_queue.put # print(last_comment_id) all_page_num = math.ceil(float(int(comment_count) / 20)) if int(all_page_num) > now_page and self.is_work: now_page += 1 time.sleep(0.1) 
try: self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title, last_comment_id, now_page) except: try: self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title, last_comment_id, now_page) except: pass def get_user(self, user_id, page, num): info = random.choice(self.info) # info = self.sid_info print(self.info.index(info)) self.xhsapi.set_smid(info['device_fingerprint']) self.xhsapi.set_session_id(info['sid'].split('.')[-1]) # response = requests.get(url, params=parms, headers=headers) note_ret = self.xhsapi.get_user_note(user_id, page, num) print(1111, note_ret) if '参数错误' in note_ret: logger.log(31, '参数错误,重试.....') self.get_user(user_id, page, num) return # if '"result":0' in response.text and 'msg:' in response.text: # logger.log(31, '无效id:', info) # del self.session_id_list[self.session_id_list.index(info)] # return if '{"msg":"","result":0,"success":true}' in note_ret: self.change_ip() # self.session_id_error.append(info) # if self.session_id_error.count(info) > 5: # logger.log(31, '无效id:' + str(info)) # del self.session_id_list[self.session_id_list.index(info)] # if self.error_count > 5: # self.change_ip() # self.error_count = 0 # self.error_count += 1 # self.get_user(user_id, page, num) # return data = json.loads(note_ret) notes = data['data']['notes'] if not notes: with open('uses_id', 'a') as f: f.write(user_id + '\n') else: for index, note in enumerate(notes): # item = {} # print(note) id = note['id'] if not self.is_work: return try: time.sleep(1) self.get_note(id, '', index) except: if '503 Service Temporarily' in traceback.format_exc( ) or 'requests.exceptions.SSLError' in traceback.format_exc( ): self.change_ip() logging.error(traceback.format_exc()) try: time.sleep(1) self.get_note(id, '', index) except: if '503 Service Temporarily' in traceback.format_exc( ) or 'requests.exceptions.SSLError' in traceback.format_exc( ): self.change_ip() logging.error(traceback.format_exc()) try: time.sleep(1) self.get_note(id, '', index) except: if '503 Service Temporarily' in traceback.format_exc( ) or 'requests.exceptions.SSLError' in traceback.format_exc( ): self.change_ip() logging.error(traceback.format_exc()) time.sleep(1) def get_topic(self, topic_id, page, num): """ get topice info :param user_id: :param page: :param num: :return: """ info = random.choice(self.info) # info = self.sid_info logging.log(31, self.info.index(info)) self.xhsapi.set_smid(info['device_fingerprint']) self.xhsapi.set_session_id(info['sid'].split('.')[-1]) # response = requests.get(url, params=parms, headers=headers) note_ret = self.xhsapi.get_topic_notes(topic_id, page, num) # logging.log(31, note_ret) if '参数错误' in note_ret: logger.log(31, '参数错误,重试.....') self.get_topic(topic_id, page, num) return # if '"result":0' in response.text and 'msg:' in response.text: # logger.log(31, '无效id:', info) # del self.session_id_list[self.session_id_list.index(info)] # return if '{"msg":"","result":0,"success":true}' in note_ret: self.change_ip() # self.session_id_error.append(info) # if self.session_id_error.count(info) > 5: # logger.log(31, '无效id:' + str(info)) # del self.session_id_list[self.session_id_list.index(info)] # if self.error_count > 5: # self.change_ip() # self.error_count = 0 # self.error_count += 1 self.get_topic(topic_id, page, num) return # return data = json.loads(note_ret) notes = data['data']['noteList'] if not notes: # with open('uses_id', 'a') as f: # f.write(topic_id + '\n') pass else: for index, note in enumerate(notes): # item = {} # print(note) id = 
note['id'] # if not self.is_work: # return for i in range(10): try: time.sleep(1) self.get_note(id, '', index) break except: time.sleep(3) logging.error(traceback.format_exc()) time.sleep(1) def get_topic_v2(self, topic_id, page): info = random.choice(self.info) # info = self.sid_info logging.log(31, self.info.index(info)) self.xhsapi.set_smid(info['device_fingerprint']) self.xhsapi.set_session_id(info['sid'].split('.')[-1]) parms = { # 'sort': 'hot', # 最热 排序 'sort': 'time', # 最新 排序 'page': page, # 最新 排序 'pageSize': '6', # 'sid': info['sid'], 'sid': 'session.1570584984409448341951', } url = 'https://www.xiaohongshu.com/fe_api/burdock/v1/page/{}/notes'.format( topic_id) for i in range(10): res = self.xhsapi.get_xsign(url, parms) # if len(res['shield']) == 32: break res = self.xhsapi.get_sign(url, parms) print(res) parms['sign'] = res['sign'] headers = { # 'authorization': info['sid'], # 'device_id': info['deviceId'], 'user-agent': 'Dalvik/2.1.0 (Linux; U; Android 6.0; DIG-AL00 Build/HUAWEIDIG-AL00) Resolution/720*1280 Version/6.8.0.3 Build/6080103 Device/(HUAWEI;DIG-AL00) NetType/WiFi', 'shield': res['shield'], 'Host': 'www.xiaohongshu.com', 'accept-encoding': 'gzip', 'Connection': 'Keep-Alive', } # 写入json文件 def write_news_jsonfile(self, item): # print(item) logging.log(31, '写入数据') item = json.dumps(dict(item), ensure_ascii=False) + '\n' # with open('./json_file/{}/28_{}_xiaohongshu_article_topic_time.json'.format(str(datetime.now()).split(' ')[0], str(datetime.now()).split(' ')[0]), 'ab') as f: # f.write(item.encode("utf-8")) try: self.hdfsclient.new_write( '{}/{}/28_{}_{}_xiaohongshu_article_topic_time.json'.format( self.file_path, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') except urllib3.exceptions.NewConnectionError: self.write_news_jsonfile(item) def write_comment_jsonfile(self, item): # print(item) logging.log(31, '写入评论') item = json.dumps(dict(item), ensure_ascii=False) + '\n' # with open('./json_file/{}/42_{}_xiaohongshu_comment_topic_time.json'.format(str(datetime.now()).split(' ')[0], str(datetime.now()).split(' ')[0]), 'ab') as f: # f.write(item.encode("utf-8")) try: self.hdfsclient.new_write( '{}/{}/42_{}_{}_xiaohongshu_comment_topic_time.json'.format( self.comment_path, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') except urllib3.exceptions.NewConnectionError: self.write_comment_jsonfile(item) def get_file_name_time(self): a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: num = 24 a = str(datetime.now() - timedelta(days=1)) # 昨天时间 num = a.split(' ')[0] + ' ' + str(num) return num def run(self): # excel_file = xlrd.open_workbook(r'./韩束小红书投放.xlsx') # excel_form = excel_file.sheet_by_name('KOC') # for i in range(2, 102): # rows = excel_form.row_values(i) # print(rows) # name = rows[2] # user_url = rows[3] # flows = rows[4] # likes = rows[5] # direction = rows[6] # date_time = rows[7] # print(user_url) # print(date_time) # user_id = user_url.split('/')[-1].split('?')[0] # self.is_work = True # self.tiezi_list = [] # print(user_id) # if len(str(date_time)) > 5: # date_time = str(date_time)[:4] # get_date = '2020-' + str(date_time).replace('.', '-') # print(get_date) # # str_time = time.mktime(time.strptime(get_date, "%Y-%m-%d")) # # print(str_time) # # self.start_time = get_date # # self.end_time = get_date # for i in 
range(1, 400): # if not self.is_work: # break # try: # time.sleep(1) # self.get_topic(user_id, i, '10') # except: # if '503 Service Temporarily' in traceback.format_exc() or 'requests.exceptions.SSLError' in traceback.format_exc(): # self.change_ip() # try: # time.sleep(1) # self.get_topic(user_id, i, '10') # except: # if '503 Service Temporarily' in traceback.format_exc() or 'requests.exceptions.SSLError' in traceback.format_exc(): # self.change_ip() # try: # time.sleep(1) # self.get_topic(user_id, i, '10') # except: # if '503 Service Temporarily' in traceback.format_exc() or 'requests.exceptions.SSLError' in traceback.format_exc(): # self.change_ip() # print(traceback.format_exc()) # time.sleep(1) # self.get_note('5ce2a1ea0000000006016cd9') # # self.get_comment('5ce2a1ea0000000006016cd9', 20) for i in range(1, 400): logging.log(31, '主贴翻页:' + str(i)) if not self.is_work and i > 3: break for j in range(10): try: self.get_topic('5e60bd92dd0a2a00013fe218', i, 6) break except: self.change_ip() logging.error(traceback.format_exc())
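    # --- Illustrative sketch (not part of the original spider) ---
    # get_note() and get_note_comment() above both convert an epoch timestamp
    # to "%Y-%m-%d %H:%M" and then compare the date part against
    # self.start_time / self.end_time via time.mktime. The same check can be
    # condensed as below; the function name `in_date_window` is hypothetical.
    # import time
    #
    # def in_date_window(epoch_seconds, start_date, end_date):
    #     """True if the timestamp's date falls inside [start_date, end_date] (Y-m-d strings)."""
    #     day = time.strftime('%Y-%m-%d', time.localtime(float(epoch_seconds)))
    #     start = time.mktime(time.strptime(start_date or '2010-1-1', '%Y-%m-%d'))
    #     end = time.mktime(time.strptime(end_date, '%Y-%m-%d'))
    #     return start <= time.mktime(time.strptime(day, '%Y-%m-%d')) <= end
    #
    # # usage (hypothetical):
    # # if in_date_window(note['time'], self.start_time, self.end_time): ...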
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, file_path): self.headers_one = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } # 时间判断部分 date = datetime.now() - timedelta(days=7) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # self.start_time = '2019-09-09' # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = yesterday # self.end_time = '2019-09-16' logging.log(31, '爬取时间段:{}到{}'.format(self.start_time, self.end_time)) # 定义评论的抓取时间范围 self.comment_start_time = yesterday # 一天回复 # self.comment_start_time = '2019-08-01' # 一天回复 # self.comment_start_time = '' # 不限定时间回复 self.comment_end_time = yesterday # self.comment_end_time = yesterday # 标记爬虫工作 self.is_work = True self.file_path = file_path self.hdfsclient = HdfsClient(url='http://*****:*****@class="ui-list"]/li') for li in li_list: title = li.xpath('.//h2/a/text()')[0] news_url = li.xpath('.//h2/a/@href')[0] topic_time = ''.join( li.xpath('.//div[@class="ui-topic-attr"]/span[2]/text()') ).strip().split(':')[1] last_rreplay_time = li.xpath( './/div[@class="ui-topic-attr"]/span[3]/text()')[0].split( ':')[1] views_replay = li.xpath( './/div[@class="ui-topic-attr"]/span[4]/text()')[0] # 做时间判断部分--------------- get_time = self.time_change(topic_time) get_news_time = time.mktime(time.strptime(get_time, "%Y-%m-%d")) end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d")) if self.start_time != '': start_time = time.mktime( time.strptime(self.start_time, "%Y-%m-%d")) else: start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d")) if float(get_news_time) < float(start_time): self.is_work = False if float(start_time) <= float(get_news_time) < float( end_time): # 符合时间段的内容 self.get_topic_page(news_url, title, views_replay, keyword) a_list = data.xpath('.//a[@class="ui-page-cell"]') # 翻页判断和操作 for a in a_list: get_text = a.xpath('.//parent::a/text()') get_text = ''.join(get_text) if '下一页' in get_text: next_url = 'https://www.libaclub.com/' + a.xpath( './/parent::a/@href')[0] self.get_search_page(next_url, keyword) def get_topic_page(self, url, title, views_replay, keyword): logger.log(31, '主贴url: ' + url) response = requests.get(url, headers=self.headers_one, timeout=60) data = etree.HTML(response.content.decode('utf-8', 'ignore')) div_list = data.xpath('.//div[@class="ui-topic"]') total_item = '' for div in div_list: content = div.xpath( './/div[@class="ui-topic-content fn-break"]/text()')[0] item = {} item['platform'] = '篱笆网' date_all = div.xpath( './/div[@class="ui-topic-operate"]/div[@class="fn-left"]/text()' )[0] item['date'] = date_all.split(' ')[0] item['time'] = date_all.split(' ')[1] try: item['author'] = div.xpath( './/div[@class="ui-topic-author"]/p[1]/a/text()')[0] item['author_id'] = div.xpath( './/div[@class="ui-topic-author"]/p[1]/a/@href')[0].split( '/')[-1] except: item['author'] = div.xpath( './/div[@class="ui-topic-author"]/p[@class="ui-topic-author-name ui-topic-author-anonymityName"]/text()' )[0] item['author_id'] = '' try: item['post_client'] = div.xpath( './/div[@class="from-iphone"]/a/text()')[0] except: item['post_client'] = '' item['title'] = title item['content'] = content.strip() item['content_id'] = div.xpath( './/div[@class="ui-topic-operate"]/div[@class="fn-right"]/a[1]/@href' )[0].split('/')[-1].split('.')[0].split('_')[-1] item['brand'] = '' item['carseries'] = '' 
try: item['from'] = div.xpath( './/div[@class="ui-topic-author"]/p[4]/text()')[0] except: item['from'] = '' item['series_url'] = '' item['url'] = url floor = div.xpath('.//span[@class="ui-dropdown-self"]/text()')[0] item['floor'] = floor item['identification'] = '' item['favorite'] = '' try: item['signin_time'] = div.xpath( './/div[@class="ui-topic-author"]/p[3]/text()')[0] except: item['signin_time'] = '' if floor == '楼主': item['views'] = views_replay.split('/')[0] item['reply_no'] = views_replay.split('/')[1] self.source_date = date_all.split(' ')[0] self.source_time = date_all.split(' ')[1] item['is_topics'] = '是' else: item['reply_no'] = '' item['views'] = '' item['is_topics'] = '否' item['source_date'] = self.source_date item['source_time'] = self.source_time item['likes'] = '' item['is_elite'] = '' item['topic_count'] = '' item['reply_count'] = '' item['pick_count'] = '' item['follows'] = '' item['topic_categroy'] = '' item['topic_type'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] item['topic_id'] = url.split('.h')[0].split('_')[-2] item['reply_floor'] = '' item['keyword'] = keyword item['file_code'] = '185' item['reposts_count'] = '' # print(item) item = json.dumps(dict(item), ensure_ascii=False) + '\n' total_item += item self.__write_news_jsonfile(total_item) if data.xpath('.//a[@class="ui-paging-next"]/@href'): # 判断是否有下一页, 翻页操作 next_page_url = 'https://www.libaclub.com' + data.xpath( './/a[@class="ui-paging-next"]/@href')[0] self.get_topic_page(next_page_url, title, views_replay, keyword) # 写入json文件 def __write_news_jsonfile(self, item): # with open('./../libawang/{}_liba_news_nike.json'.format(str(datetime.now()).split(' ')[0]), 'ab') as f: # f.write(item.encode("utf-8")) self.hdfsclient.new_write('{}/{}/185_{}_{}_liba_news.json'.format( self.file_path, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') def time_change(self, str_time): """ 时间可是转换, 将‘分钟前’,‘小时前’,‘昨天’,‘前天’, '天前',转换成标准时间格式Y-m-d h:m:s :param str_time: :return: """ if '秒' in str_time or '刚刚' in str_time: get_time = str(datetime.now()).split('.')[0] return get_time elif '分钟' in str_time: get_time_num = re.search('\d{1,2}', str_time).group(0) get_time_num = int(get_time_num) * 60 int_time = int(str(time.time()).split('.')[0]) - get_time_num # #转换成localtime time_local = time.localtime(float(int_time)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d", time_local) # "%Y-%m-%d %H:%M:%S" return dt elif '小时' in str_time: get_time_num = re.search('\d{1,2}', str_time).group(0) get_time_num = int(get_time_num) * 60 * 60 # print(get_time_num) int_time = int(str(time.time()).split('.')[0]) - get_time_num # #转换成localtime time_local = time.localtime(float(int_time)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d", time_local) # "%Y-%m-%d %H:%M:%S" return dt elif '昨天' in str_time: try: part_time = str_time.split(' ')[1] yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] except: yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] return yesterday elif '前天' in str_time: part_time = str_time.split(' ')[1] two_days_ago = datetime.now() - timedelta(days=2) # 昨天时间 two_days_ago = str(two_days_ago).split( ' ')[0] + ' ' + part_time.replace('点', ':').replace('分', '') return two_days_ago elif '天前' in str_time: part_time = str_time.split('天前')[0] two_days_ago = 
datetime.now() - timedelta( days=int(part_time)) # 昨天时间 two_days_ago = str(two_days_ago).split(' ')[0] return two_days_ago elif '年' in str_time: str_time = str_time.replace('年', '-').replace('月', '-').replace('日', '') return str_time elif ' ' in str_time and '202' not in str_time: str_time = str( datetime.now()).split('-')[0] + '-' + str_time.split(' ')[0] return str_time else: # str_time = '2019-' + str_time.replace('月', '-').replace('日', '') return str_time def run(self): url = 'https://www.libaclub.com/facade.php?act=search&searchAction=keyword&keyword={}&sId=&timetype=2&timeBegin=1563938285&timeEnd=1566530285&sid=0&searchScope=0&orderBy=0&page=1' url_list = get_config_para('nike_daily_keywords') logger.log(31, url_list) for item in url_list: # print(1) keyword = item['keywords'] logger.log(31, keyword) if keyword: search_url = url.format(keyword) try: self.get_search_page(search_url, keyword) except: logger.error(traceback.format_exc())
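    # --- Illustrative sketch (not part of the original spider) ---
    # time_change() above (and clean_date() in the next spider) normalise
    # relative Chinese timestamps such as "3分钟前", "昨天 12:30" or "2天前" to
    # Y-m-d dates. A compact equivalent for the date part only; the name
    # `normalize_relative_date` is hypothetical, and unrecognised patterns fall
    # through unchanged, like the original.
    # import re
    # from datetime import datetime, timedelta
    #
    # def normalize_relative_date(text):
    #     now = datetime.now()
    #     if '秒' in text or '刚刚' in text:
    #         return now.strftime('%Y-%m-%d')
    #     match = re.search(r'(\d+)\s*分钟前', text)
    #     if match:
    #         return (now - timedelta(minutes=int(match.group(1)))).strftime('%Y-%m-%d')
    #     match = re.search(r'(\d+)\s*小时前', text)
    #     if match:
    #         return (now - timedelta(hours=int(match.group(1)))).strftime('%Y-%m-%d')
    #     if '昨天' in text:
    #         return (now - timedelta(days=1)).strftime('%Y-%m-%d')
    #     if '前天' in text:
    #         return (now - timedelta(days=2)).strftime('%Y-%m-%d')
    #     match = re.search(r'(\d+)\s*天前', text)
    #     if match:
    #         return (now - timedelta(days=int(match.group(1)))).strftime('%Y-%m-%d')
    #     return text.replace('年', '-').replace('月', '-').replace('日', '')
    #
    # # usage (hypothetical): normalize_relative_date('3天前')  ->  e.g. '2019-09-13'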
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self): self.headers_one = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } self.start_url = '' # 评论接口模板 self.commnet_port_url = '' # 时间部分,按小时抓取 # 爬虫开始抓取的日期 date = datetime.now() - timedelta(days=7) news_start_time = str(date).split(' ')[0] # 爬虫结束的抓取日期 current_time = datetime.now() # 当前日期 current_day = str(current_time).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, current_day)) logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = current_day # 标记爬虫工作1 self.is_break = False # 标记爬虫工作2 self.is_work = False # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/forum/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r' -', '', ret1) ret3 = re.sub( r' ', '', ret2) ret4 = re.sub(r"hot\(.*\d?','", '', ret3) ret5 = re.sub(r'\[', '', ret4) ret6 = re.sub(r'\]', '', ret5) ret7 = re.sub(r"',", "", ret6) ret8 = re.sub(r"'", "", ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 匹配具体时间 def clean_date(self, x): now = datetime.now() if str(x).find('昨天') != -1: x = datetime.strftime(now + timedelta(days=-1), '%Y-%m-%d %H:%M:%S') elif str(x).find('前天') != -1: x = datetime.strftime(now + timedelta(days=-2), '%Y-%m-%d %H:%M:%S') elif str(x).find('天前') != -1: x = datetime.strftime( now + timedelta(days=-int(str(x).replace('天前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('小时前') != -1: x = datetime.strftime( now + timedelta(hours=-int(str(x).replace('小时前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('分钟前') != -1: x = datetime.strftime( now + timedelta(minutes=-int(str(x).replace('分钟前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('今天') != -1: x = str(x).replace('今天', now.strftime('%Y-%m-%d') + ' ') elif str(x).find('刚刚') != -1: x = now.strftime('%Y-%m-%d %H:%M:%S') elif str(x).find('秒前') != -1: x = now.strftime('%Y-%m-%d %H:%M:%S') elif str(x).find('月前') != -1: x = datetime.strftime( now + timedelta(weeks=-4 * int(str(x).replace('月前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('周前') != -1: x = datetime.strftime( now + timedelta(weeks=-int(str(x).replace('周前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('[') != -1: x = x.replace('[', '').replace(']', '') elif str(x).find('月') != -1: x = x.replace('月', '-').replace('日', '') return x def parse_goods_id(self, key_word): try: # key_word_data = urllib.parse.quote(key_word) url = 'http://bbs.dahe.cn/search.php?mod=forum' headers = { 'Content-Type': 'application/x-www-form-urlencoded', 'Cookie': 's8hO_404f_saltkey=tvEEW5wV; s8hO_404f_lastvisit=1568680094; s8hO_404f_sid=IHtErs; PHPSESSID=nr01ffrg19e81likscg0lmejb2; __asc=be50d61716d3cda6bb0dc6485ed; __auc=be50d61716d3cda6bb0dc6485ed; Hm_lvt_49fc517ed1175ad0089c07fe695a54c4=1568684010; s8hO_404f_lastact=1568683853%09search.php%09forum; Hm_lpvt_49fc517ed1175ad0089c07fe695a54c4=1568684168', 'Host': 'bbs.dahe.cn', 'Origin': 'http://bbs.dahe.cn', 'Pragma': 'no-cache', 'Referer': 'http://bbs.dahe.cn/search.php?mod=forum', 
'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } form_data = { 'formhash': '89e49222', 'srchtxt': key_word.encode('gbk'), 'searchsubmit': 'yes' } try: time.sleep(0.2) response = requests.post(url=url, headers=headers, data=form_data) except: try: time.sleep(0.2) response = requests.post(url=url, headers=headers, proxies=proxies, data=form_data) except: time.sleep(0.2) response = requests.post(url=url, headers=headers, proxies=proxies, data=form_data) response.encoding = 'gbk' print(response.url) searchid = self.re_not_number(response.url.split('&')[1]) print('关键词解析对应id是:', searchid) is_break = self.is_break insert_time = time.strftime('%Y-%m-%d %H:%M:%S') url = 'http://bbs.dahe.cn/search.php?mod=forum&searchid={}&orderby=dateline&ascdesc=desc&searchsubmit=yes&page={}' # print(url) headers = { 'Content-Type': 'text/html; charset=gbk', # 'Cookie': 's8hO_404f_saltkey=T4WK2597; s8hO_404f_lastvisit=1566265382; PHPSESSID=hp8k3kq01k4p4et54us1vljsu7; Hm_lvt_49fc517ed1175ad0089c07fe695a54c4=1566269243; yfx_c_g_u_id_10000033=_ck19082010472216611967379906556; __auc=d9a596fe16cacec003e8f31e310; s8hO_404f_atarget=1; __asc=cbf1082316cb721670e06723157; zycna=tzGXcwYAChsBAXxONRbq5Xoc; yfx_c_g_u_id_10000007=_ck19082210393212688365475513495; yfx_f_l_v_t_10000007=f_t_1566441572262__r_t_1566441572262__v_t_1566441572262__r_c_0; wdcid=0cb840f230762783; s8hO_404f_yy_ad_status=2; yfx_f_l_v_t_10000033=f_t_1566269242659__r_t_1566440515358__v_t_1566442626841__r_c_1; s8hO_404f_st_t=0%7C1566443342%7Ce4370d9ec8f238172511195afa70bf43; s8hO_404f_forum_lastvisit=D_1496_1566440306D_1880_1566440345D_2988_1566443342; s8hO_404f_st_p=0%7C1566443988%7C5efa9cc93f4efcd80a2db1e41de54594; s8hO_404f_visitedfid=261D2988D1889D1006D780D1875D1213D1778D1880D1496; s8hO_404f_viewid=tid_1240948; s8hO_404f_sendmail=1; s8hO_404f_sid=HXxXR3; s8hO_404f_lastact=1566444218%09search.php%09forum; Hm_lpvt_49fc517ed1175ad0089c07fe695a54c4=1566444478', 'Host': 'bbs.dahe.cn', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } try: time.sleep(0.2) response1 = requests.get(url=url.format(searchid, 1), headers=headers, allow_redirects=False) except: try: time.sleep(0.2) response1 = requests.get(url=url.format(searchid, 1), headers=headers, allow_redirects=False, proxies=proxies) except: time.sleep(0.2) response1 = requests.get(url=url.format(searchid, 1), headers=headers, allow_redirects=False, proxies=proxies) response1.encoding = 'gbk' # print(response.text) # 将响应转换成一个element对象 html = etree.HTML(response1.text) # 获取帖子总数 topic_num = self.re_not_number( self.re_html( html.xpath('//div[@class="sttl mbn"]/h2/em/text()'))) if int(topic_num) == 0: logger.log( 31, '*******-------关键词:%s 搜索不到内容-------*******' % key_word) else: # 获取帖子页数 pages_num = int(math.ceil(float(int(topic_num) / 40))) logger.log( 31, '---关键词: %s ,搜到的帖子总数是: %s ,帖子总页数是: %s ---' % (key_word, topic_num, pages_num)) for i in range(1, int(pages_num) + 1): topic_url = url.format(searchid, key_word, i) # logger.log(31, '抓取第%s页数商品数据' % i) try: time.sleep(0.2) response2 = requests.get(url=topic_url, headers=headers, allow_redirects=False) except: try: time.sleep(0.2) response2 = requests.get(url=topic_url, headers=headers, allow_redirects=False, proxies=proxies) except: time.sleep(0.2) response2 = requests.get(url=topic_url, headers=headers, 
allow_redirects=False, proxies=proxies) # 将响应转换成一个element对象 html1 = etree.HTML(response2.text) # 获取帖子列表 topic_list = html1.xpath('//div[@class="tl"]/div[2]/ul/li') # print(len(topic_list)) da_he_dict = dict() # 遍历帖子列表 for data in topic_list: date_time_data = data.xpath('./p[3]/span[1]/text()')[0] # print(date_time_data) date_data_test = date_time_data.split(' ')[0].strip() # print(date_data_test) # 年, 月, 日 year_data = date_data_test.split('-')[0] month_test = date_data_test.split('-')[1] day_test = date_data_test.split('-')[2] if len(month_test) == 2: month_data = month_test else: month_data = '0' + month_test if len(day_test) == 2: day_data = day_test else: day_data = '0' + day_test date_data = (year_data + '-' + month_data + '-' + day_data).strip() time_data = (date_time_data.split(' ')[1] + ':00').strip() if self.start_time <= date_data: da_he_dict['platform'] = '大河网' da_he_dict['source_date'] = date_data da_he_dict['source_time'] = time_data da_he_dict['date'] = date_data da_he_dict['time'] = time_data da_he_dict['insert_time'] = insert_time da_he_dict['author'] = data.xpath( './p[3]/span[2]/a/text()')[0] da_he_dict[ 'author_url'] = 'http://bbs.dahe.cn/' + data.xpath( './p[3]/span[2]/a/@href')[0] da_he_dict['author_id'] = self.re_not_number( data.xpath('./p[3]/span[2]/a/@href')[0]) da_he_dict['title'] = self.re_html( data.xpath('./h3/a//text()')) da_he_dict[ 'url'] = 'http://bbs.dahe.cn/' + data.xpath( './h3/a/@href')[0] da_he_dict['brand'] = '' da_he_dict['carseries'] = '' da_he_dict['series_url'] = '' # print(da_he_dict) self.parse_topic_data(da_he_dict) if date_data < self.start_time: is_break = True if is_break: break except: print(111111111111111111111, traceback.format_exc()) # 解析帖子内容 def parse_topic_data(self, da_he_dict): try: url = da_he_dict['url'] headers = { 'Content-Type': 'text/html; charset=gbk', 'Host': 'bbs.dahe.cn', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } # print(url) logger.log(31, url) try: # time.sleep(0.5) response = requests.get(url=url, headers=headers, allow_redirects=False) except: try: # time.sleep(0.5) response = requests.get(url=url, headers=headers, allow_redirects=False, proxies=proxies) except: # time.sleep(0.5) response = requests.get(url=url, headers=headers, allow_redirects=False, proxies=proxies) response.encoding = 'gbk' # 将响应转换成一个element对象 html = etree.HTML(response.text) # print(response.text) # # 获取发帖时间 # time_data_test = self.clean_date(self.re_html(html.xpath('//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[1]/div/div[2]/em/text()|//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[1]/div/div[2]/em/span/text()')).replace('\\xa0', ' ').replace('发表于 ', '').replace('发表于 ', '')) # # print(url) # print(time_data_test) # time_data = time_data_test.split(' ')[1] # lang = len(time_data_test.split(':')) # if int(lang) == 3: # time_data_1 = time_data # else: # time_data_1 = time_data.split(':')[0] + ':' + time_data.split(':')[1] + ':' + '00' # print(da_he_dict['date'], '--------', time_data_1) # da_he_dict['source_time'] = time_data_1 # da_he_dict['time'] = time_data_1 # 获取浏览数,回复数 reply_data = html.xpath( '//div[@id="postlist" and @class="pl bm"]/table[1]/tr/td[1]/div/span/text()' ) # print(reply_data) da_he_dict['reply_no'] = reply_data[4] da_he_dict['views'] = reply_data[1] # 获取发帖人客户端 post_client = html.xpath( '//div[@id="postlist" and @class="pl 
bm"]/div[1]/table/tr[1]/td[2]/div[1]/div/div[2]/span[1]/a//text()' ) if post_client == []: da_he_dict['post_client'] = '' else: da_he_dict['post_client'] = post_client[0] + post_client[1] da_he_dict['content'] = self.re_html( html.xpath( '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[2]/div/div[1]/table/tr/td//text()' )).replace('\\r', '').replace('\\n', '').replace('\\u3000', '').replace('\\xa0', '') da_he_dict['imageurl'] = html.xpath( '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[2]/div/div[1]//img/@src' ) da_he_dict['audiourl'] = '' da_he_dict['content_id'] = da_he_dict['url'].split('-')[1] da_he_dict['from'] = '' da_he_dict['is_topics'] = '是' da_he_dict['floor'] = html.xpath( '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div/strong/a/text()' )[0].strip() da_he_dict['identification'] = '' da_he_dict['favorite'] = '' da_he_dict['signin_time'] = '' da_he_dict['likes'] = '' # 判断是否是热帖 is_elite = html.xpath( '//div[@id="postlist" and @class="pl bm"]/div[1]/div/img/@title' ) if is_elite == []: da_he_dict['is_elite'] = '否' else: da_he_dict['is_elite'] = '是' da_he_dict['topic_count'] = '' da_he_dict['reply_count'] = '' da_he_dict['pick_count'] = '' da_he_dict['follows'] = '' da_he_dict['topic_categroy'] = '' da_he_dict['topic_type'] = '' da_he_dict['reposts_count'] = '' da_he_dict['update_time'] = time.strftime('%Y-%m-%d %H:%M:%S') da_he_dict['topic_id'] = da_he_dict['url'].split('-')[1] da_he_dict['file_code'] = '182' # logger.log(31, '----------------正在写入主贴----------------') # print(da_he_dict) item = json.dumps(dict(da_he_dict), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/forum/{}/182_{}_{}_dahe_Nike.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.time_data), item, encoding='utf-8') if int(da_he_dict['reply_no']) == 0: # logger.log(31, '没有回帖') pass else: # 获取回帖页数 pages_num = int( math.ceil(float(int(da_he_dict['reply_no']) / 10))) # logger.log(31, '回帖数: %s 回帖总页数是:%s' % (da_he_dict['reply_no'], pages_num)) self.parse_reply(pages_num, da_he_dict) except: print(222222222222222222222, traceback.format_exc()) # 抓取回帖内容 def parse_reply(self, pages_num, da_he_dict): try: is_work = self.is_work start_time = time.strftime('%Y-%m-%d %H:%M:%S') headers = { 'Content-Type': 'text/html; charset=gbk', 'Host': 'bbs.dahe.cn', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } for i in range(pages_num, 0, -1): url = 'http://bbs.dahe.cn/thread-{}-{}-1.html'.format( da_he_dict['topic_id'], i) try: # time.sleep(1) response = requests.get(url=url, headers=headers, allow_redirects=False) except: try: # time.sleep(1) response = requests.get(url=url, headers=headers, allow_redirects=False, proxies=proxies) except: # time.sleep(1) response = requests.get(url=url, headers=headers, allow_redirects=False, proxies=proxies) response.encoding = 'gbk' # 将响应转换成一个element对象 html = etree.HTML(response.text) reply_dict = dict() # 获取回帖列表 reply_list = html.xpath( '//div[@id="postlist" and @class="pl bm"]/div') # print(len(reply_list)) for item in reply_list[::-1]: floor_data = self.re_html( item.xpath( './table/tr[1]/td[2]/div/strong/a/text()|./tr[1]/td[2]/div[1]/strong/a//text()' )).replace('\\r', '').replace('\\n', '').replace( '#', '').replace(' ', '') # print(floor_data) url_data = response.url floor_test = floor_data date_time_test = item.xpath( 
'./table/tr[1]/td[2]/div[1]/div/div[2]/em/text()|./table/tr[1]/td[2]/div[1]/div/div[2]/em/span/text()' ) # print(date_time_test) if date_time_test == []: pass else: date_time_data = self.re_html(date_time_test).replace( '发表于 ', '').replace('\\xa0', ' ').replace('发表于 ', '') # print(date_time_data) if re.search(r'前天|昨天', date_time_data) != None: datetime_data = self.clean_date( date_time_data.split(' ')[0]).split(' ')[ 0] + ' ' + date_time_data.split(' ')[1] elif re.search(r'天前', date_time_data) != None: datetime_data = self.clean_date(date_time_data) else: datetime_data = date_time_data # print(datetime_data) # 发表日期 date_data = datetime_data.split(' ')[0].strip() date_data_test = date_data.split('-') if len(date_data_test[1]) == 1 and len( date_data_test[2]) == 1: date_data_parse = date_data_test[ 0] + '-0' + date_data_test[ 1] + '-0' + date_data_test[2] elif len(date_data_test[1]) == 1 and len( date_data_test[2]) != 1: date_data_parse = date_data_test[ 0] + '-0' + date_data_test[ 1] + '-' + date_data_test[2] elif len(date_data_test[1]) != 1 and len( date_data_test[2]) == 1: date_data_parse = date_data_test[ 0] + '-' + date_data_test[ 1] + '-0' + date_data_test[2] else: date_data_parse = date_data_test[ 0] + '-' + date_data_test[ 1] + '-' + date_data_test[2] # 发表时间 time_data_test = datetime_data.split(' ')[1] lang = len(time_data_test.split(':')) if int(lang) == 3: time_data = time_data_test.strip() else: time_data = (time_data_test.split(':')[0] + ':' + time_data_test.split(':')[1] + ':' + '00').strip() # print(date_data, '*******', time_data) if self.start_time <= date_data_parse.strip(): reply_dict['platform'] = da_he_dict['platform'] reply_dict['source_date'] = da_he_dict['date'] reply_dict['source_time'] = da_he_dict['time'] reply_dict['date'] = date_data_parse.strip() reply_dict['time'] = time_data reply_dict['author'] = item.xpath( './table/tr[1]/td[1]/div/div[1]/div/a/text()' )[0] reply_dict[ 'author_url'] = 'http://bbs.dahe.cn/' + item.xpath( './table/tr[1]/td[1]/div/div[1]/div/a/@href' )[0] reply_dict['author_id'] = self.re_not_number( item.xpath( './table/tr[1]/td[1]/div/div[1]/div/a/@href' )[0]) reply_dict['post_client'] = da_he_dict[ 'post_client'] reply_dict['title'] = da_he_dict['title'] reply_dict['content'] = self.re_html( item.xpath( './table/tr[1]/td[2]/div[2]/div/div[1]/table/tr/td//text()' )).replace('\\r', '') reply_dict['imageurl'] = '' reply_dict['audiourl'] = '' reply_dict['content_id'] = self.re_not_number( item.xpath('./@id')[0]) reply_dict['brand'] = '' reply_dict['carseries'] = '' reply_dict['from'] = '' reply_dict['series_url'] = '' reply_dict['url'] = url_data reply_dict['is_topics'] = '否' reply_dict['floor'] = floor_test reply_dict['identification'] = '' reply_dict['favorite'] = '' reply_dict['signin_time'] = '' reply_dict['reply_no'] = '' reply_dict['views'] = '' reply_dict['likes'] = '' reply_dict['is_elite'] = da_he_dict['is_elite'] reply_dict['topic_count'] = '' reply_dict['reply_count'] = '' reply_dict['pick_count'] = '' reply_dict['follows'] = '' reply_dict['topic_categroy'] = '' reply_dict['topic_type'] = '' reply_dict['reposts_count'] = '' reply_dict['insert_time'] = start_time reply_dict['update_time'] = time.strftime( '%Y-%m-%d %H:%M:%S') reply_dict['topic_id'] = da_he_dict['topic_id'] reply_dict['file_code'] = '182' # logger.log(31, '******************开始写入回帖数据**********************') # print(reply_dict) item = json.dumps(dict(reply_dict), ensure_ascii=False) + '\n' self.hdfsclient.new_write( 
'/user/cspider_daily/nike_daily/forum/{}/182_{}_{}_dahe_Nike.json'.format(
                                time.strftime('%Y%m%d'), time.strftime('%Y%m%d'),
                                self.time_data),
                            item,
                            encoding='utf-8')
                        if date_data < self.start_time:
                            is_work = True
                if is_work:
                    break
        except:
            print(333333333333333333333, traceback.format_exc())

    # 读取excel获取关键词
    def parse_xlsx(self):
        # 设置路径
        path = './快消采集关键词_v12_20200119.xlsx'
        # 打开execl
        workbook = xlrd.open_workbook(path)
        # 根据sheet索引或者名称获取sheet内容
        Data_sheet = workbook.sheets()[0]  # 通过索引获取
        rowNum = Data_sheet.nrows  # sheet行数
        colNum = Data_sheet.ncols  # sheet列数
        # 获取所有单元格的内容
        list = []
        for i in range(rowNum):
            rowlist = []
            for j in range(colNum):
                rowlist.append(Data_sheet.cell_value(i, j))
            list.append(rowlist)
        for data in list[1::]:
            brand = data[0]
            # print(brand)
            yield {
                '关键词': brand,
            }

    def run(self):
        key_word_list = []
        for item in self.parse_xlsx():
            # print(item)
            key_word_list.append(item)
        for item_data in key_word_list:
            time.sleep(10)
            # print(item_data['关键词'])
            self.parse_goods_id(item_data['关键词'])
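# Nearly every request in the spider above is wrapped in the same nested
# try/except: one direct attempt, then two retries through the `proxies` dict,
# each preceded by a short sleep. The helper below is a hedged sketch of that
# repeated pattern; get_with_retry is hypothetical (not in the original code)
# and it assumes `proxies` has the usual requests shape, e.g.
# {'http': 'http://ip:port'}.
import time
import requests


def get_with_retry(url, headers=None, proxies=None, retries=3,
                   timeout=30, sleep=0.2, **kwargs):
    """GET `url`; the first attempt goes direct, later attempts go through the proxy."""
    last_exc = None
    for attempt in range(retries):
        try:
            time.sleep(sleep)
            return requests.get(url,
                                headers=headers,
                                proxies=None if attempt == 0 else proxies,
                                allow_redirects=False,
                                timeout=timeout,
                                **kwargs)
        except requests.RequestException as exc:
            last_exc = exc
    raise last_exc

# Usage sketch: response = get_with_retry(topic_url, headers=headers, proxies=proxies)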
class YiDianSpider(object): def __init__(self, file_path, comment_path): self.headers_two = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Cache-Control': 'max-age=0', # 'Connection':'keep-alive', 'Cookie': 'cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201542776168%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201542776168%7D%7D; UM_distinctid=16730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461; JSESSIONID=208cee9fea61049d61e7d18f9e9c275ecf530a9e308a94dde36658adc01a0594; wuid=154945905891357; wuid_createAt=2018-11-21 12:56:9', 'Host': 'www.baidu.com', 'Referer': 'http://www.yidianzixun.com/channel/c11', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest' } self.proxies = ['218.95.55.154:4243'] # 去重列表 self.set_list = [] # self.error_url_list = [] self.headers_one = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Host': 'www.baidu.com', # 'Proxy-Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } self.user_agent = [ 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6', 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1', ] a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: # 对于凌晨 0 点的判断 # 时间判断部分 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] else: # 时间判断部分 date = datetime.now() - timedelta(days=0) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=0) # 昨天时间 yesterday = str(yesterday).split(' ')[0] # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = yesterday try: self.page_ip = proxies.res_ip() print('ip: ', self.page_ip) # self.page_ip = '116.248.160.138:4261' except: time.sleep(3) print('调用ip时发生错误:{}'.format(traceback.format_exc())) logger.error('调用ip时发生错误:{}'.format(traceback.format_exc())) self.page_ip = proxies.res_ip() self.ip_count = 0 # 定义评论的抓取时间范围 # self.comment_start_time = yesterday # 一天回复 self.comment_start_time = '' # 不限定时间回复 self.comment_end_time = yesterday # self.comment_end_time = yesterday self.is_get_comment = True self.file_name_time = self.get_file_name_time() self.file_path = file_path self.comment_apth = comment_path self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') hour = str(datetime.now()).split(' ')[-1].split(':')[0] if str(hour) != '00': two_hour_ago = int(hour) - 2 if len(str(two_hour_ago)) == 1: two_hour_ago = '0' + 
str(two_hour_ago) self.hour_name = str(two_hour_ago) + '_' + str(hour) else: self.hour_name = '22_24' self.hdfsclient.makedirs('{}/{}/{}'.format( self.file_path, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name)) # 创建每日文件夹 self.hdfsclient.makedirs('{}/{}/{}'.format( self.comment_apth, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name)) # 创建每日文件夹 self.time_time = str(time.time()).split('.')[0] def get_channel_id(self): url = 'http://www.yidianzixun.com/channel/c11' try: response = requests.get(url, proxies={'http': self.page_ip}, timeout=30) data = response.content.decode() data = re.search('channel_id(.*?)汽车', data).group(0) channel_id = re.search('\d{8,15}', data).group(0) cokies = response.headers['Set-Cookie'] print(cokies) id = re.search('JSESSIONID=([a-z0-9]{30,80});', cokies).group(1) return channel_id, id except: print(traceback.format_exc()) if self.ip_count < 10: self.page_ip = proxies.res_ip() print('跟换ip中: ', self.page_ip) self.ip_count += 1 time.sleep(5) self.get_channel_id() else: raise IndexError def get_news_list_port(self, url): headers_port = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Host': 'www.yidianzixun.com', 'Connection': 'keep-alive', # 'Upgrade-Insecure-Requests': '1', 'Referer': 'http://www.yidianzixun.com/', 'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36', 'Cookie': 'wuid=289836166779454; wuid_createAt=2019-10-29 16:11:43; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1572336703; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1572336703; JSESSIONID=7c64fe11dc634f7bce6816ca76a196fb915ea8d8a307a0a41b26269846df44ef', 'X-Requested-With': 'XMLHttpRequest' } # headers_port['Cookie'] = cookie print(url) response = requests.get(url, headers=headers_port, proxies={'http': self.page_ip}) # print(response.url) # print(response.text) data = response.content.decode() data = json.loads(data) data = data['result'] # print(data) for news in data: item = {} try: title = news['title'] except: continue item['title'] = title itemid = news['docid'] url = 'http://www.yidianzixun.com/article/' + itemid print(url) news_date = news['date'] if 'V_' not in itemid: if url not in self.set_list: # self.write_news_jsonfile(item) try: self.get_news_page_info(url) except: print(traceback.format_exc()) self.set_list.append(url) # 获取通过js生成的spt的值 def get_spt(self, start, channel_id): # start = 10 end = start + 10 n = "/home/q/news_list_for_channel?channel_id=11756176923&cstart=0&cend=10&infinite=true&refresh=1&__from__=pc&multi=5" e = str(channel_id) # ctx = execjs.compile( # ''' # function good (n,e,i,t){ # for (var o = "sptoken", a = "", c = 1; c < arguments.length; c++){ # o += arguments[c]; # } # for (var c = 0; c < o.length; c++) { # var r = 10 ^ o.charCodeAt(c); # a += String.fromCharCode(r) # } # return a # } # ''' # ) # spt = ctx.call('good', n, e, start, end) # return spt def get_news_page_info(self, url): item = {} response = requests.get(url) print(response.url) data = etree.HTML(response.content.decode()) title = data.xpath('.//h2/text()')[0] if data.xpath('.//a[@class="doc-source"]/text()'): source = data.xpath('.//a[@class="doc-source"]/text()')[0] else: source = data.xpath('.//div[@class="meta"]/span[1]/text()')[0] # date_time = data.xpath('.//div[@class="meta"]/span[2]/text()')[0] if data.xpath('.//div[@id="imedia-article"]//text()'): content = data.xpath('.//div[@id="imedia-article"]//text()') elif 
data.xpath('.//div[@id="imedia-article"]/article/p//text()'): content = data.xpath( './/div[@id="imedia-article"]/article/p//text()') elif data.xpath( './/div[@id="imedia-article"]/section/section//text()'): content = data.xpath( './/div[@id="imedia-article"]/section/section//text()') elif data.xpath('.//div[@class="content-bd"]/div/div//text()'): content = data.xpath('.//div[@class="content-bd"]/div/div//text()') elif data.xpath('.//div[@class="content-bd"]/p//text()'): content = data.xpath('.//div[@class="content-bd"]/p//text()') elif data.xpath('.//div[@class="content-bd"]/div/div/text()'): content = data.xpath('.//div[@class="content-bd"]/div/div//text()') elif data.xpath('.//div[@class="content-bd"]/section//text()'): content = data.xpath('.//div[@class="content-bd"]/section//text()') elif data.xpath('.//div[@class="content-bd"]/section/text()'): content = data.xpath('.//div[@class="content-bd"]/section/text()') elif data.xpath('.//div[@class="content-bd"]//text()'): content = data.xpath('.//div[@class="content-bd"]//text()') else: content = data.xpath( './/div[@id="imedia-article"]/section/section/section/p//text()' ) content = ''.join(content) # get_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) item['platform'] = '一点资讯' item['title'] = title item['article_source'] = source # 文章来源 item['article_author'] = '' # 文章作者 item['content'] = content # if len(data.xpath('.//div[@class="meta"]/span')) == 3: # date_all = data.xpath('.//div[@class="meta"]/span[3]/text()')[0] # elif len(data.xpath('.//div[@class="meta"]/span')) == 2: # date_all = data.xpath('.//div[@class="meta"]/span[2]/text()')[0] # else: date_all = data.xpath('.//div[@class="meta"]/span//text()') date_all = ''.join(date_all).strip() try: if date_all == '昨天' or '小时前' in date_all: yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] # print(date_all, yesterday) item['date'] = yesterday elif date_all == '2天前': yesterday = datetime.now() - timedelta(days=2) # 2前天时间 yesterday = str(yesterday).split(' ')[0] # print(date_all, yesterday) item['date'] = yesterday elif date_all == '3天前': yesterday = datetime.now() - timedelta(days=3) # 3前天时间 yesterday = str(yesterday).split(' ')[0] # print(date_all, yesterday) item['date'] = yesterday else: news_date = re.search(r'\d{4}\.\d{1,2}\.\d{1,2}', date_all).group(0) # print(222222, news_date) # print(33333, date_all) item['date'] = news_date.replace('.', '-') except: item['date'] = self.comment_end_time # print(item) item['time'] = '' item['likes'] = '' item['clicks'] = '' item['views'] = '' item['keyword'] = '' item['comments_count'] = '' item['article_url'] = url # 文章详情URL item['dislikes'] = '' # 踩人数 item['series_url'] = '' # 车系首页 item['list_url'] = 'http://www.yidianzixun.com/channel/c11' # 文章列表URL item['article_type_1st'] = '' # 文章类型 item['article_type_2nd'] = '' # 文章类型 item['insert_time'] = str(datetime.now()).split('.')[0] # 初始爬取时间 item['update_time'] = str(datetime.now()).split('.')[0] # 最后爬取时间 item['content_id'] = url.split('/')[-1].split('?')[0] # 文章id item['topic_id'] = url.split('/')[-1].split('?')[0] # 主贴id item['author_id'] = '' # 作者id item['file_code'] = '26' # 文件编号 # 做时间判断部分--------------- 这个部分区分于另外一个部分 # if date_all == '昨天' or date_all == '2天前' or date_all == '3天前' or '小时前' in date_all: # print(date_all, '时间符合') # print(item) self.write_news_jsonfile(item) news_id = url.split('/')[-1] self.is_get_comment = True self.get_commnet_info(news_id, title, url, item['date']) # 获取评论信息 def get_commnet_info(self, news_id, 
title, source_url, source_date, last_comment_id=''): item = {} url = 'http://www.yidianzixun.com/home/q/getcomments?_=1542864983174&docid={}&s=&count=30&last_comment_id={}&appid=web_yidian'.format( str(news_id), last_comment_id) response = requests.get(url) data = json.loads(response.content.decode()) comments = data['comments'] if comments: total_item = '' for comment in comments: # print(comment) # print('爬取评论中') item['platform'] = '一点资讯' item['title'] = title content = comment['comment'] item['content'] = content author = comment['nickname'] item['author'] = author date_all = comment['createAt'] comment_date = date_all.split(' ')[0] comment_time = date_all.split(' ')[1] # 评论部分做时间判断部分--------------- get_news_time = time.mktime( time.strptime(str(comment_date), "%Y-%m-%d")) end_time = time.mktime( time.strptime(self.comment_end_time, "%Y-%m-%d")) if self.comment_start_time != '': start_time = time.mktime( time.strptime(self.comment_start_time, "%Y-%m-%d")) else: start_time = time.mktime( time.strptime('2010-1-1', "%Y-%m-%d")) if float(get_news_time) < float(start_time): self.is_get_comment = False # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取, break elif float(start_time) <= float(get_news_time) <= float( end_time): item['date'] = comment_date item['time'] = comment_time item['source_date'] = source_date item['source_time'] = '' item['source_url'] = source_url item['floor'] = '' item['keyword'] = '' item['comment_url'] = source_url item['views'] = '' item['comments_count'] = '' item['likes'] = '' item['author_id'] = comment['userid'] # 用户id item['dislikes'] = '' # 踩人数 item['insert_time'] = str( datetime.now()).split('.')[0] # 初始爬取时间 item['update_time'] = str( datetime.now()).split('.')[0] # 最后爬取时间 item['content_id'] = comment['comment_id'] # 内容id item['topic_id'] = source_url.split('/')[-1].split('?')[ 0] # 主贴id item['file_code'] = '40' # 文件编号 item = json.dumps(dict(item), ensure_ascii=False) + '\n' total_item += item self.write_comment_jsonfile(total_item) if len(comments) == 30 and self.is_get_comment: last_comment_id = comments[-1]['comment_id'] print('评论翻页') self.get_commnet_info(news_id, title, source_url, source_date, last_comment_id=last_comment_id) def write_news_jsonfile(self, item): logger.log(31, '正在写入新闻数据......') # with open('./../yidianzixun/26_{}_yidianzixun_news.json'.format(str(now_time)), 'ab') as f: # f.write(item.encode('utf-8')) self.hdfsclient.new_write( '{}/{}/{}/26_{}_{}_yidianzixun_news.json'.format( self.file_path, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name, str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') def write_comment_jsonfile(self, item): logger.log(31, '正在写入评论数据......') item = json.dumps(dict(item), ensure_ascii=False) + '\n' # with open('./../yidianzixun/40_{}_yidianzixun_commnet.json'.format(str(now_time)), 'ab') as f: # f.write(item.encode('utf-8')) self.hdfsclient.new_write( '{}/{}/{}/40_{}_{}_yidianzixun_commnet.json'.format( self.comment_apth, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name, str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') def get_news_url(self, num): """ 从百度搜索关键词,然后获取符合的新闻的url, 提取抓取数量 """ # 时间 get_time = time.time() str_time = str(get_time)[:-4] date = datetime.now() - timedelta(days=7) a = str(date)[:-7] timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S") # 转换为时间戳: timeStamp = int(time.mktime(timeArray)) end_time = str(timeStamp) + '.' 
+ str_time.split('.')[1] print(str_time, end_time) # url = 'https://www.baidu.com/s?q1=汽车&q2=&q3=&q4=&gpc=stf%3D{}%2C{}%7Cstftype%3D1&ft=&q5=&q6=www.yidianzixun.com&tn=baiduadv&pn={}'.format(end_time, str_time, num) url = 'https://www.baidu.com/s?wd=site%3A(www.yidianzixun.com)%20%E6%B1%BD%E8%BD%A6&pn={}&oq=site%3A(www.yidianzixun.com)%20%E6%B1%BD%E8%BD%A6&ct=2097152&tn=baiduadv&ie=utf-8&si=(www.yidianzixun.com)&rsv_pq=e948db9e00097fcd&rsv_t=1273sdRx9rzb35pYERweuGf1mV6RO2BZZUthjhhdYlSidhjyUjzN%2FuD2LYJ1%2Fso&gpc=stf%3D{}%2C{}%7Cstftype%3D2&tfflag=1'.format( num, end_time, str_time) print(url) # ip = random.choice(self.proxies_list) response = requests.get(url, headers=self.headers_one, verify=False, timeout=30) # , proxies={'https': ip} content = etree.HTML(response.content.decode()) if content.xpath('.//h3[@class="t"]/a/@href'): url_list = content.xpath('.//h3[@class="t"]/a/@href') print(url_list) print(len(url_list)) for url_ch in url_list: response = requests.get(url_ch, headers=self.headers_two, allow_redirects=False) print(response.status_code) news_url = response.headers['Location'] print(news_url) if news_url not in self.set_list: try: self.get_news_page_info(news_url) except Exception as e: print(e) time.sleep(15) self.set_list.append(news_url) def get_file_name_time(self): a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: num = 24 a = str(datetime.now() - timedelta(days=1)) # 昨天时间 num = a.split(' ')[0] + ' ' + str(num) return num def run(self): url = 'http://www.yidianzixun.com/home/q/news_list_for_channel' get_time = time.time() get_time = ''.join(str(get_time).split('.')) url_list = [ # 体育 'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171666&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C%3C%3C%3A%3B%3A&appid=web_yidian&_={}', # NBA 'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171682&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C28%3A%3B%3A&appid=web_yidian&_={}', # 财经 'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171698&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C32%3A%3B%3A&appid=web_yidian&_={}' ] for get_url in url_list: for i in range(2): try: for j in range(30): url = get_url.format( str(time.time()).replace('.', '')[:-4]) try: self.get_news_list_port(url) except requests.exceptions.ProxyError: print(traceback.format_exc()) break except TypeError: print(traceback.format_exc()) logger.error('内容解析错误') except: print(traceback.format_exc()) logger.error('其他错误') time.sleep(10) self.page_ip = proxies.res_ip()
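# get_commnet_info above pages through Yidian comments with a cursor: each call
# returns up to 30 comments, and the id of the last one is fed back as
# last_comment_id until a short page arrives or the time check clears
# self.is_get_comment. A compact sketch of that cursor loop is shown below;
# fetch_page is a hypothetical stand-in for the HTTP call plus JSON parsing.
def iter_comments(fetch_page, page_size=30):
    """Yield comments page by page, following the last_comment_id cursor."""
    last_comment_id = ''
    while True:
        comments = fetch_page(last_comment_id)  # a list of comment dicts
        for comment in comments:
            yield comment
        if len(comments) < page_size:           # short page means no more data
            return
        last_comment_id = comments[-1]['comment_id']

# Usage sketch:
#   for c in iter_comments(lambda cursor: get_comment_json(news_id, cursor)['comments']):
#       handle(c)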
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self): # 时间部分 # 爬虫开始抓取的日期 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] # 爬虫结束的抓取日期 current_time = datetime.now() # 当前日期 current_day = str(current_time).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, current_day)) logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = current_day # 标记爬虫工作 self.is_work = False # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/ecommerce/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\n', '', ret1) ret3 = re.sub(r'\u3000', '', ret2) ret4 = re.sub(r'品牌:', '', ret3) ret5 = re.sub(r'\xa0', '', ret4) ret6 = re.sub(r'→_→', '', ret5) ret7 = re.sub(r'…', '', ret6) ret8 = re.sub(r'https:', '', ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 根据关键词搜索请求得到商品信息 def parse_goods(self, key_word): try: # 根据关键词,例如:洗发水,抓取商品信息 url = 'https://list.mogujie.com/search?q={}&cKey=43&page=1&sort=pop'.format( key_word) headers = { # 'authority': 'list.mogujie.com', # 'method': 'GET', # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', # 'accept-encoding': 'gzip, deflate, br', # 'accept-language': 'zh-CN,zh;q=0.9', # 'cache-control': 'no-cache', 'cookie': '__mgjuuid=ebddbce7-601f-4f3d-a860-d5ba8f411688; _TDeParam=1-1RjCYYeGOiwg6JI5UDopvg', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } try: time.sleep(0.1) response = requests.get(url=url, headers=headers, allow_redirects=False, timeout=20) except: try: time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) except: time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) # print(response.text) # 判断比对获取的是否是关键词关联搜索的商品 rewriteKeyword = json.loads(response.text)['result'] if 'searchRewrite' in rewriteKeyword: if rewriteKeyword['searchRewrite'][ 'rewriteKeyword'] == key_word.replace(' ', ''): # 获取商品总数 goods_num = json.loads( response.text)['result']['wall']['total'] # 商品总页数 page_num = int(math.ceil(float(int(goods_num) / 75))) for i in range(1, page_num + 1): logger.log( 31, '------正在抓取关键词: %s 的第: %s 页商品数据, 商品总页数是: %s ------' % (key_word, i, page_num)) goods_url = 'https://list.mogujie.com/search?q={}&cKey=43&page={}&sort=pop'.format( key_word, i) try: time.sleep(0.2) response1 = requests.get(url=goods_url, headers=headers, allow_redirects=False, timeout=20) except: try: time.sleep(0.2) response1 = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) except: time.sleep(0.2) response1 = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) # 获取商品列表信息节点 goods_list = json.loads( response1.text)['result']['wall']['docs'] # print(len(goods_list)) # 遍历商品信息节点列表 for node in 
goods_list: goods_dict = dict() goods_dict['platform'] = '蘑菇街' goods_dict['keyword'] = key_word goods_dict['url'] = node['link'] goods_dict['imageurl'] = node['img'] goods_dict['audiourl'] = '' goods_dict['name'] = node['title'] goods_dict['sales'] = '' goods_dict['price'] = node['price'] goods_dict['itemID'] = node['tradeItemId'] goods_dict['brand'] = '' goods_dict['focus_count'] = node['cfav'] # print(goods_dict) self.parse_goods_details(goods_dict) else: logger.log(31, '------关键词: %s 搜索不到对应的商品数据--------' % key_word) else: # 获取商品总数 goods_num = json.loads( response.text)['result']['wall']['total'] # 商品总页数 page_num = int(math.ceil(float(int(goods_num) / 75))) for i in range(1, page_num + 1): # logger.log(31, '------正在抓取关键词: %s 的第: %s 页商品数据, 商品总页数是: %s ------' % (key_word, i, page_num)) goods_url = 'https://list.mogujie.com/search?q={}&cKey=43&page={}&sort=pop'.format( key_word, i) try: time.sleep(0.2) response1 = requests.get(url=goods_url, headers=headers, allow_redirects=False, timeout=20) except: try: time.sleep(0.2) response1 = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) except: time.sleep(0.2) response1 = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) # 获取商品列表信息节点 goods_list = json.loads( response1.text)['result']['wall']['docs'] # print(len(goods_list)) # 遍历商品信息节点列表 for node in goods_list: goods_dict = dict() goods_dict['platform'] = '蘑菇街' goods_dict['keyword'] = key_word goods_dict['url'] = node['link'] goods_dict['imageurl'] = node['img'] goods_dict['audiourl'] = '' goods_dict['name'] = node['title'] goods_dict['sales'] = '' goods_dict['price'] = node['price'] goods_dict['itemID'] = node['tradeItemId'] goods_dict['brand'] = '' goods_dict['focus_count'] = node['cfav'] # print(goods_dict) self.parse_goods_details(goods_dict) except: print(111111111111111111111, traceback.format_exc()) # 解析商品评论人数 def parse_goods_details(self, goods_dict): try: headers = { 'cookie': '__mgjuuid=7e841984-d679-49eb-9994-89abaec55322; _mwp_h5_token_enc=36d248108519bf86cf2fa681dbc521f8; _mwp_h5_token=3c71c26a371458b615f433396b39eccf_1564968570925; _ga=GA1.2.2057442167.1565061045; _gid=GA1.2.2144070558.1565061045; __mgjref=https%3A%2F%2Fshop.mogu.com%2Fdetail%2F1m6os9s%3Facm%3D3.ms.1_4_1m6os9s.43.1185-68998.4aiUQrym0Gs9T.sd_117-swt_43-imt_6-t_4aiUQrym0Gs9T-lc_4-pit_1-qid_21841-dit_170-idx_0-dm1_5001%26ptp%3D31.nXjSr.0.0.wLDh8N89', 'pragma': 'no-cache', 'Referer': goods_dict['url'], 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } # print(goods_dict) url = 'https://rate.mogu.com/jsonp/pc.rate.ratelist/v2?pageSize=20&sort=1&isNewDetail=1&itemId={}&type=1&marketType=market_mogujie&page=1'.format( goods_dict['itemID']) try: time.sleep(0.2) response = requests.get(url=url, headers=headers, allow_redirects=False, timeout=20) except: try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) except: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) # print(response.url) # print(response.text) commnent_num_data = re.search(r'{".*"success":true}', response.text) num_data = commnent_num_data.group() # print(num_data) if 'total' not in num_data: pass # logger.log(31, '-----------没有商品评论数据-----------') else: goods_dict['achieve_num'] = json.loads( num_data)['data']['total'] 
# 获取评论页数 page_num = int( math.ceil(float(int(goods_dict['achieve_num']) / 20))) # print(goods_dict['achieve_num'], page_num) self.goods_comments(goods_dict, page_num) except: print(2222222222222222222, traceback.format_exc()) # 解析商品评论 def goods_comments(self, goods_dict, page_num): try: is_break = self.is_work headers = { 'cookie': '__mgjuuid=7e841984-d679-49eb-9994-89abaec55322; _mwp_h5_token_enc=36d248108519bf86cf2fa681dbc521f8; _mwp_h5_token=3c71c26a371458b615f433396b39eccf_1564968570925; _ga=GA1.2.2057442167.1565061045; _gid=GA1.2.2144070558.1565061045; __mgjref=https%3A%2F%2Fshop.mogu.com%2Fdetail%2F1m6os9s%3Facm%3D3.ms.1_4_1m6os9s.43.1185-68998.4aiUQrym0Gs9T.sd_117-swt_43-imt_6-t_4aiUQrym0Gs9T-lc_4-pit_1-qid_21841-dit_170-idx_0-dm1_5001%26ptp%3D31.nXjSr.0.0.wLDh8N89', 'pragma': 'no-cache', 'Referer': goods_dict['url'], 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } # 抓取商品评论链接(总共50页,第一页从1开始) for i in range(1, int(page_num) + 1): comment_url = 'https://rate.mogu.com/jsonp/pc.rate.ratelist/v2?pageSize=20&sort=1&isNewDetail=1&itemId={}&type=1&marketType=market_mogujie&page={}'.format( goods_dict['itemID'], i) # print(comment_url) # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10) try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, allow_redirects=False, timeout=20) except: try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) except: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=20) # print(comment_data) comment = re.search(r'{".*"success":true}', response.text) # print(comment.group()) items = json.loads(comment.group())['data']['list'] # print(len(items)) goods_comment = dict() for item in items: # print(item) date_data = item['time'].replace('年', '-').replace( '月', '-').replace('日', '') if len(date_data.split('-')) == 2: date_data_test = time.strftime('%Y') + '-' + date_data else: date_data_test = date_data # print(date_data_test) # 判断评论时间是否在规定的抓取时间内 if self.start_time <= date_data_test.strip(): goods_comment['platform'] = goods_dict['platform'] goods_comment['date'] = date_data_test.strip() goods_comment['time'] = '' goods_comment['keyword'] = goods_dict['keyword'] goods_comment['name'] = goods_dict['name'] goods_comment['imageurl'] = goods_dict['imageurl'] goods_comment['audiourl'] = goods_dict['audiourl'] goods_comment['url'] = goods_dict['url'] goods_comment['shop_name'] = '' goods_comment['user_name'] = item['user']['uname'] goods_comment['content'] = item['content'] goods_comment['content_id'] = item['rateId'] goods_comment['brand'] = goods_dict['brand'] goods_comment['price'] = goods_dict['price'] goods_comment['sales'] = goods_dict['sales'] goods_comment['focus_count'] = goods_dict[ 'focus_count'] goods_comment['comment_num'] = goods_dict[ 'achieve_num'] goods_comment['views'] = '' goods_comment['likes'] = '' goods_comment['comments_count'] = '' goods_comment['reposts_count'] = '' goods_comment['author_id'] = item['user']['uid'] goods_comment['topic_id'] = goods_dict['itemID'] try: goods_comment['type'] = item['style'].split( ':')[1].replace(' 尺码', '') except: goods_comment['type'] = '' try: goods_comment['size'] = item['style'].split(':')[2] except: goods_comment['size'] = '' goods_comment['file_code'] = '177' # logger.log(31, 
'--------------正在写入符合时间的商品评论-----------------------') # print(goods_comment) item = json.dumps(dict(goods_comment), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/ecommerce/{}/177_{}_{}_MoGujie_nike.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.time_data), item, encoding='utf-8') if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(3333333333333333333, traceback.format_exc()) # 读取excel获取关键词 def parse_xlsx(self): # 设置路径 path = './快消采集关键词_v3_20200330.xlsx' # 打开execl workbook = xlrd.open_workbook(path) # 根据sheet索引或者名称获取sheet内容 Data_sheet = workbook.sheets()[0] # 通过索引获取 rowNum = Data_sheet.nrows # sheet行数 colNum = Data_sheet.ncols # sheet列数 # 获取所有单元格的内容 list = [] for i in range(rowNum): rowlist = [] for j in range(colNum): rowlist.append(Data_sheet.cell_value(i, j)) list.append(rowlist) for data in list[1::]: brand = data[0] # print(brand) yield { '关键词': brand, } def run(self): key_word_list = [] for item in self.parse_xlsx(): # print(item) key_word_list.append(item) for item_dat in key_word_list: # print(item_dat['关键词']) self.parse_goods(item_dat['关键词'])
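# parse_xlsx above walks the first sheet of the keyword workbook with xlrd and
# yields one {'关键词': ...} dict per data row, skipping the header. A trimmed
# sketch of the same idea (iter_keywords is hypothetical) that assumes the
# keyword sits in the first column:
import xlrd


def iter_keywords(path):
    """Yield the first-column value of every non-header row of the first sheet."""
    sheet = xlrd.open_workbook(path).sheets()[0]
    for row_idx in range(1, sheet.nrows):  # row 0 is the header
        keyword = sheet.cell_value(row_idx, 0)
        if keyword:
            yield {'关键词': keyword}

# Usage sketch: for item in iter_keywords('./快消采集关键词_v3_20200330.xlsx'): ...
# Note: xlrd 2.x dropped .xlsx support, so this sketch assumes xlrd < 2.0, as the
# original code appears to.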
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, redis_example): # 时间部分,按小时抓取 date_time = str(datetime.now() - timedelta(days=1)).split('.')[0] start_time_test = time.strftime('%Y-%m-%d 00:00:00') end_time = time.strftime('%Y-%m-%d %H:%M:%S') a = end_time.split(' ')[1].split(':')[0] if a == '00': start_time_data = date_time hours_name = '22_24' wen_jian_jia_date = str(datetime.now() - timedelta( days=1)).split('.')[0].split(' ')[0].replace('-', '') else: two_hours_ago = int(a) - 2 if len(str(two_hours_ago)) == 1: two_hour_ago = '0' + str(two_hours_ago) else: two_hour_ago = str(two_hours_ago) hours_name = str(two_hour_ago) + '_' + str(a) start_time_data = start_time_test wen_jian_jia_date = time.strftime('%Y%m%d') print('爬取时间段:{}到{}'.format(start_time_data, end_time)) logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = start_time_data # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = end_time # 标记爬虫工作 self.is_break = False self.redis_example = redis_example self.pid = os.getpid() self.h2_name = hours_name self.date_time = wen_jian_jia_date # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format( wen_jian_jia_date, hours_name)) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\n', '', ret1) ret3 = re.sub(r'\u3000', '', ret2) ret4 = re.sub(r'品牌:', '', ret3) ret5 = re.sub(r'\xa0', '', ret4) ret6 = re.sub(r'→_→', '', ret5) ret7 = re.sub(r'…', '', ret6) ret8 = re.sub(r'https:', '', ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 过滤商品价格 def re_price(self, data): try: message = str(data) ret1 = re.sub(r'pcData\(', '', message) ret2 = re.sub(r'\)', '', ret1) return ret2 except: pass # 过滤商品品牌信息 def re_brand(self, data): try: message = str(data) ret1 = re.sub(r'"brandName":', '', message) ret2 = re.sub(r'&', '', ret1) ret3 = re.sub(r'"', '', ret2) return ret3 except: pass # 根据关键词搜索请求得到商品信息 def parse_goods_url(self, data): goods_dict = dict() goods_dict['平台'] = data['平台'] goods_dict['关键词'] = data['关键词'] goods_dict['URL'] = data['URL'] goods_dict['商品名'] = data['商品名'] goods_dict['商品图片'] = data['商品图片'] goods_dict['shop_name'] = data['shop_name'] # logger.log(31, '--------********正在抓取的商品是:%s********--------' % goods_dict) self.parse_goods_details(goods_dict) # 解析商品品牌信息 def parse_goods_details(self, goods_dict): try: headers = { 'Content-Type': 'text/html;charset=utf-8', # 'Connection': 'keep-alive', # 'Cookie': 'SN_SESSION_ID=c55ac35a-f7d1-4b0c-b48a-f88e8bb896f4; useMp4=1.701108; _snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; _snsr=direct%7Cdirect%7C%7C%7C; _snzwt=THiw3Z16a429d6f24nzVa227f; _snmc=1; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; authId=si0BE64747CDCB0EC1B819BB87E6D52FC1; secureToken=E180078268FCC770B6CFC47BFC919E55; _snms=155592217017833779; 
smhst=651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000a861276981|0000000000a11028136288|0070705161a11002911104|0070756234a101822780|0000000000; _snma=1%7C15553832315961909%7C1555383231596%7C1555923318059%7C1555923324804%7C140%7C9; _snmp=155592332389716467; _snmb=155591411681863515%7C1555923324825%7C1555923324807%7C37', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } goods_url = goods_dict['URL'] # print(goods_url) try: time.sleep(0.2) response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # print(response.text) # print('正在抓取的页面是: %s' % goods_url) data = response.text # 用正则匹配商品品牌 re_brand = re.search(r'"brandName":".*?"', data) try: brand_name = str(self.re_brand(re_brand.group())) except: brand_name = '' # print(html) # 用正则截取价格和评论链接里需要的两串ID partNumber = re.search(r'"partNumber".*?,', data) vendorCode = re.search(r'"vendorCode".*?,', data) # print(partNumber.group(), vendorCode.group()) goods_dict['品牌'] = brand_name goods_dict['月销量'] = '' goods_dict['partNumber'] = self.re_not_number(partNumber.group()) goods_dict['vendorCode'] = self.re_not_number(vendorCode.group()) # print(goods_dict) self.goods_price(goods_dict) except: print(2222222222222222222, traceback.format_exc()) # 抓取商品价格 def goods_price(self, goods_dict): try: headers = { 'Content-Type': 'text/html; charset=UTF-8', # 'Connection': 'keep-alive', # 'Cookie': '_snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; smhst=826233089|0000000000a10243606506|0000000000a101822738|0000000000a101822744|0000000000a160764310|0000000000a122819279|0000000000a651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000a861276981|0000000000a11028136288|0070705161a11002911104|0070756234a101822780|0000000000; _snzwt=THr1rb16a47c548027ZyP23a2; authId=si18DAC1A401B32915224307F589BD81BC; secureToken=B185BBC1B1CC477DFED182771566D4D2; _snmc=1; _snsr=direct%7Cdirect%7C%7C%7C; _snms=155598227334334657; _snma=1%7C15553832315961909%7C1555383231596%7C1555982274025%7C1555982278050%7C179%7C10; _snmp=155598227667474202; _snmb=155598227327394678%7C1555982278116%7C1555982278059%7C3', 'Host': 'pas.suning.com', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } partNumber = goods_dict['partNumber'] vendorCode = goods_dict['vendorCode'] # print(partNumber, vendorCode) price_url = 
'https://pas.suning.com/nspcsale_0_{}_{}_{}_20_021_0210101_315587_1000267_9264_12113_Z001___R9000361_1.39_0___000165956__.html?callback=pcData'.format( partNumber, partNumber, vendorCode) try: time.sleep(0.2) response = requests.get(url=price_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=price_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=price_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # print(response.text) # print(data) re_price_data = self.re_price(response.text) price_data = json.loads(re_price_data)['data']['price'][ 'saleInfo'][0]['promotionPrice'] # print(price_data) if price_data.find('-') > 0: goods_dict['价格'] = price_data.split('-')[0] else: goods_dict['价格'] = price_data # print(goods_dict) self.parse_comment_num(goods_dict) except: print(33333333333333333333, traceback.format_exc()) # 解析商品评价人数 def parse_comment_num(self, goods_dict): try: headers = { 'Content-Type': 'application/javascript;charset=UTF-8', # 'Connection': 'keep-alive', # 'Cookie': '_snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; tradeMA=127; smhst=651484540|0000000000a826233089|0000000000a10243606506|0000000000a101822738|0000000000a101822744|0000000000a160764310|0000000000a122819279|0000000000a651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000a861276981|0000000000a11028136288|0070705161a11002911104|0070756234a101822780|0000000000; _snms=155641574161962867; route=3798b42173574ff4536b1645bfa56286; _snzwt=THusFg16a66e65b60nBjXc7ab; authId=siB5D783545E4E209EB3048D028A03A056; secureToken=CE1EA8B12441816214F909A8C65BED87; _snma=1%7C15553832315961909%7C1555383231596%7C1556415743033%7C1556504534582%7C192%7C14; _snmc=1; _snsr=direct%7Cdirect%7C%7C%7C; _snmp=155650453353821025; _snmb=155650453458862957%7C1556504534606%7C1556504534588%7C1', 'Host': 'review.suning.com', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } comment_num_url = 'https://review.suning.com/ajax/cluster_review_satisfy/style--{}-{}-----satisfy.htm?callback=satisfy'.format( goods_dict['partNumber'], goods_dict['vendorCode']) try: time.sleep(0.2) response = requests.get(url=comment_num_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=comment_num_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=comment_num_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) re_comment_num = re.search(r'{"reviewCounts".*"reCloudDrill":0}', response.text) goods_dict['评论人数'] = json.loads( re_comment_num.group())['reviewCounts'][0]['totalCount'] # print(goods_dict) if int(self.re_not_number(goods_dict['评论人数'])) == 0: logger.log(31, '-------------没有商品评论--------------') # 
print(goods_data) # self.write_Nike_index_jsonfile(goods_data) else: # 获取评论页数 page_num = int( math.ceil( float( int(self.re_not_number(goods_dict['评论人数'])) / 10))) # print(goods_dict['评论人数'], page_num) self.goods_comments(goods_dict, page_num) except: print(444444444444444444, traceback.format_exc()) # 解析商品评论 def goods_comments(self, goods_dict, page_num): try: is_break = self.is_break # print(goods_dict) partNumber = goods_dict['partNumber'] vendorCode = goods_dict['vendorCode'] headers = { 'Content-Type': 'application/javascript;charset=UTF-8', # 'Connection': 'keep-alive', # 'Cookie': '_snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; tradeMA=127; route=3798b42173574ff4536b1645bfa56286; _snzwt=THusFg16a66e65b60nBjXc7ab; _snsr=direct%7Cdirect%7C%7C%7C; _snmc=1; _snms=155652264991095847; authId=si07DE872B7B580CBB2CB11C7105B450A8; secureToken=5C8868551C3103287B59ADEDD6B90567; smhst=192279908|0000000000a600733096|0000000000a600479244|0000000000a10700388709|0070547159a651484540|0000000000a826233089|0000000000a10243606506|0000000000a101822738|0000000000a101822744|0000000000a160764310|0000000000a122819279|0000000000a651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000; _snma=1%7C15553832315961909%7C1555383231596%7C1556524706411%7C1556524786984%7C224%7C15; _snmp=155652478697968344; _snmb=155652102706620667%7C1556524786995%7C1556524786988%7C28', 'Host': 'review.suning.com', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } if int(page_num) >= 50: pages = 50 else: pages = page_num # 抓取商品评论链接(总共50页,第一页从1开始) for i in range(1, int(pages) + 1): comment_url = 'https://review.suning.com/ajax/cluster_review_lists/style--{}-{}-newest-{}-default-10-----reviewList.htm?callback=reviewList'.format( partNumber, vendorCode, i) # print(comment_url) try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) comment_data = response.text # print(comment_data) comment = re.search(r'{"commodityReviews":.*"reCloudDrill":0}', comment_data) items = json.loads(comment.group())['commodityReviews'] # print(len(items)) if len(items) == 0: break else: goods_comment = dict() for data in items: # print(data) date_data = data['publishTime'].split(' ')[0] time_data = data['publishTime'].split(' ')[1] # print(date_data.strip(), time_data.strip()) try: content = self.re_html(data['content']) except: content = '' # 追加评论 try: content_add = data['againReview']['againContent'] except: content_add = '' # 判断评论时间是否在规定的抓取时间内 if self.start_time <= data['publishTime']: goods_comment['platform'] = goods_dict['平台'] goods_comment['date'] = date_data.strip() goods_comment['time'] = 
time_data.strip() goods_comment['keyword'] = goods_dict['关键词'] goods_comment['name'] = goods_dict['商品名'] goods_comment['imageurl'] = goods_dict['商品图片'] goods_comment['audiourl'] = '' goods_comment['url'] = goods_dict['URL'] goods_comment['shop_name'] = goods_dict[ 'shop_name'] goods_comment['user_name'] = data['userInfo'][ 'nickName'] goods_comment[ 'content'] = content + ';' + content_add goods_comment['content_id'] = str( data['commodityReviewId']) goods_comment['brand'] = goods_dict['品牌'] goods_comment['price'] = goods_dict['价格'] goods_comment['sales'] = goods_dict['月销量'] goods_comment['focus_count'] = '' goods_comment['comment_num'] = goods_dict['评论人数'] goods_comment['views'] = '' goods_comment['author_id'] = '' goods_comment['reposts_count'] = '' goods_comment['topic_id'] = str( goods_dict['URL'].split('/')[4].replace( '.html', '')) test_data = data['commodityInfo']['charaterId1'] if test_data == '尺码': goods_comment['type'] = data['commodityInfo'][ 'charaterDesc2'] goods_comment['size'] = data['commodityInfo'][ 'charaterDesc1'] else: goods_comment['type'] = data['commodityInfo'][ 'charaterDesc1'] goods_comment['size'] = data['commodityInfo'][ 'charaterDesc2'] self.likes_comments(goods_comment) if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(555555555555555555, traceback.format_exc()) # 解析商品评论的点赞数和回复数 def likes_comments(self, goods_comment): try: comment_id = goods_comment['topic_id'] url = 'https://review.suning.com/ajax/useful_count/{}-usefulCnt.htm'.format( comment_id) headers = { 'Content-Type': 'application/javascript;charset=UTF-8', # 'Cookie': 'tradeMA=55; _snvd=1565067528273QvL8ia7lwZC; SN_CITY=20_021_1000267_9264_01_12113_2_0; cityCode=021; districtId=12113; cityId=9264; hm_guid=ca34f536-186e-4619-aa8f-6c8808ee39a6; _df_ud=e64b917e-c77c-46e0-9d10-d84c86c93f3a; _device_session_id=p_806c72c6-6fa6-462d-bf88-f660c7094c1a; _cp_dt=21f7906b-c341-404f-996b-4d4f2e32e4af-70039; route=e46977517568f7cad53fbfe19eaf4774; _snmc=1; _snsr=direct%7Cdirect%7C%7C%7C; authId=siC700F4CB8ABB1C2E87F1FA1E9650CF7A; secureToken=F9331FD98F503CE8898949382003910A; _snzwt=THs64g16ce02abb69OAUS9a89; _snms=156712934067680848; smhst=690105206|0000000000a10118749983|0000000000a10689501376|0070222946a10949954840|0000000000a10966209249|0000000000a10757523126|0000000000a10620476914|0000000000a11180422688|0000000000a10966225829|0000000000a769909849|0070230352a10580507394|0070222946a826193435|0000000000a10163182478|0000000000a10964625880|0000000000a10571100966|0070074453; _snma=1%7C156506752678869586%7C1565067526788%7C1567129356201%7C1567129676548%7C137%7C12; _snmp=156712967506243164; _snmb=156712899210934272%7C1567129676573%7C1567129676552%7C8', 'Host': 'review.suning.com', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) likes_comments_data = json.loads( response.text.replace('usefulCnt(', '').replace(')', '')) goods_comment['likes'] = likes_comments_data[ 'reviewUsefuAndReplylList'][0]['usefulCount'] goods_comment['comments_count'] = likes_comments_data[ 
                'reviewUsefuAndReplylList'][0]['replyCount']
            goods_comment['file_code'] = '53'
            # logger.log(31, '****------- writing comment data that falls inside the crawl window -------****')
            # print(goods_comment)
            item = json.dumps(dict(goods_comment), ensure_ascii=False) + '\n'
            self.hdfsclient.new_write(
                '/user/cspider_daily/nike_2h/ecommerce/{}/{}/53_{}_Suning_nike_{}.json'
                .format(self.date_time, self.h2_name,
                        time.strftime('%Y%m%d'), self.pid),
                item,
                encoding='utf-8')
        except:
            print(6666666666666666666666666, traceback.format_exc())

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('su_ning_2h')
            if str(redis_url_num) == '0':
                print('**************\nThe su_ning_2h Redis queue is empty, waiting for urls.....\nprocess waiting......\n*************')
            # brpop blocks until a url arrives or the 3600s timeout expires
            item = self.redis_example.brpop('su_ning_2h', timeout=3600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_goods_url(item1)
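# Every request in this Suning spider is wrapped in three nested try/except
# blocks that simply repeat the same requests.get call. A minimal sketch of
# that pattern as a reusable helper; the name get_with_retry and the
# retries/delay defaults are illustrative, not part of the original code.
import time

import requests


def get_with_retry(url, headers=None, proxies=None, retries=3, delay=0.2,
                   timeout=30):
    """GET `url`, retrying up to `retries` times on any requests exception."""
    last_error = None
    for _ in range(retries):
        try:
            time.sleep(delay)  # small pause between attempts, as the spider does
            return requests.get(url=url, headers=headers, proxies=proxies,
                                allow_redirects=False, timeout=timeout)
        except Exception as error:
            last_error = error
    raise last_error

# A call site such as parse_comment_num() could then read:
#     response = get_with_retry(comment_url, headers=headers, proxies=proxies)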
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, file_path): self.headers_one = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: # 对于凌晨 0 点的判断 # 时间判断部分 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] else: # 时间判断部分 date = datetime.now() - timedelta(days=0) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=0) # 昨天时间 yesterday = str(yesterday).split(' ')[0] # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # self.start_time = '2019-09-09' # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = yesterday # self.end_time = '2019-09-16' logger.info('爬取时间段:{}到{}'.format(self.start_time, self.end_time)) # logging.info('爬取时间段:{}到{}'.format(self.start_time, self.end_time)) # 定义评论的抓取时间范围 self.comment_start_time = yesterday # 一天回复 # self.comment_start_time = '2019-09-09' # 一天回复 # self.comment_start_time = '' # 不限定时间回复 self.comment_end_time = yesterday # self.comment_end_time = '2019-09-16' # 标记爬虫工作 self.is_work = True self.set_list = [] #去重列表 self.file_name_time = self.get_file_name_time() self.file_path = file_path self.hdfsclient = HdfsClient(url='http://*****:*****@class="detailtz clearall"]/li[@class="clearall"]') for li in li_list: print(11111) title = ''.join(li.xpath('.//p[@class="title"]//text()')).strip() news_date = li.xpath('.//span[@class="postdate"]/text()')[0].strip() news_url = 'https:' + li.xpath('.//p[@class="title"]/a/@href')[0].strip() views = li.xpath('.//span[@class="fr"]/span/text()')[0].split('(')[-1].split(')')[0] replay_no = li.xpath('.//span[@class="fr"]/a/text()')[0].split('(')[-1].split(')')[0] # print(title, news_date, news_url, views, replay_no) time.sleep(3) # # 做时间判断部分--------------- # get_news_time = time.mktime(time.strptime(news_date, "%Y-%m-%d")) # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d")) # if self.start_time != '': # start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d")) # else: # start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d")) # if float(get_news_time) < float(start_time): # self.is_work = False # # if float(start_time) <= float(get_news_time) < float(end_time): # 符合时间段的内容 print(news_date) if '1天' in news_date: print(222222, news_url) if news_url not in self.set_list: # 去重判断 self.get_news_page(news_url, title, views, replay_no, keyword) self.set_list.append(news_url) if data.xpath('.//a[@class="page-next"]/@href') and self.is_work: next_page_url = 'https:' + data.xpath('.//a[@class="page-next"]/@href')[0].strip() self.get_search_page(next_page_url, keyword) def get_news_page(self, url, title, views, replay_no, keyword, is_first=True): """ 帖子详情页 :param url: :return: """ heasers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Cookie': 'f9big=u62; _Z3nY0d4C_=37XgPK9h; JSESSIONID=4AB05FA49E2A1477353FD49E96A7DC94; sajssdk_2015_cross_new_user=1; 
sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216cadbc7b78349-00406e52f50a5c-7373e61-2304000-16cadbc7b797b8%22%2C%22%24device_id%22%3A%2216cadbc7b78349-00406e52f50a5c-7373e61-2304000-16cadbc7b797b8%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _DM_SID_=f9eb430f8631d9542bb6023a0185fd5e; _DM_S_=2446eea1926edb3b390dd3a02a7bcfb2; f19big=ip48; _dm_userinfo=%7B%22uid%22%3A0%2C%22stage%22%3A%22%22%2C%22city%22%3A%22%E4%B8%8A%E6%B5%B7%3A%E4%B8%8A%E6%B5%B7%22%2C%22ip%22%3A%22124.78.53.22%22%2C%22sex%22%3A%22%22%2C%22frontdomain%22%3A%22www.19lou.com%22%2C%22category%22%3A%22%22%7D; pm_count=%7B%7D; dayCount=%5B%5D; Hm_lvt_5185a335802fb72073721d2bb161cd94=1566282908; screen=682; _dm_tagnames=%5B%7B%22k%22%3A%22%E8%80%90%E5%85%8B%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22baoma%22%2C%22c%22%3A3%7D%2C%7B%22k%22%3A%22baoma%22%2C%22c%22%3A2%7D%5D; Hm_lpvt_5185a335802fb72073721d2bb161cd94=1566283069', 'Host': 'www.19lou.com', 'Referer': 'https://www.19lou.com/search/thread?keyword=%E8%80%90%E5%85%8B', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'same-origin', 'Sec-Fetch-User': '******', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } logger.info(url) response = requests.get(url, headers=heasers, timeout=120) # print(response.text) data = etree.HTML(response.content.decode('gb2312', 'ignore')) if data.xpath('.//div[@class="user-info thread-side"]'): # 文章类帖子 if data.xpath('.//div[@itemprop="replyPost"]'): # 判断文章回复 reply_list = data.xpath('.//div[@itemprop="replyPost"]') for replay in reply_list: content_list = replay.xpath('.//div[@class="post-cont"]//text()') content = ''.join(content_list).strip() # print(floor) # print(content) item = {} item['platform'] = '19楼' date_all = replay.xpath('.//div[@class="u-add link0 clearall"]/span[@class="fl"]/text()')[0] print(11111, date_all) item['date'] = date_all.split(' ')[0].replace('发表于', '') item['time'] = date_all.split(' ')[1] item['author'] = replay.xpath('.//a[@class="name"]/span/text()')[0] try: item['author_id'] = replay.xpath('.//a[@class="name"]/@href')[0].split('-')[1] except: item['author_id'] = '' item['post_client'] = '' item['title'] = title item['content'] = content item['content_id'] = replay.xpath('.//parent::div/@id')[0] item['brand'] = '' item['carseries'] = '' item['from'] = '' item['series_url'] = '' item['url'] = url item['is_topics'] = '否' item['floor'] = replay.xpath('.//a[@itemprop="postSequenceNumber"]/text()')[0] item['identification'] = '' item['favorite'] = '' item['signin_time'] = '' item['reply_no'] = '' item['views'] = '' item['likes'] = '' item['is_elite'] = '' item['topic_count'] = '' item['reply_count'] = '' item['pick_count'] = '' item['follows'] = '' item['topic_categroy'] = '' item['topic_type'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] item['topic_id'] = url.split('-')[3] item['reply_floor'] = '' item['keyword'] = keyword item['file_code'] = '186' item['reposts_count'] = '' # print(item) self.__write_news_jsonfile(item) if is_first: # 文章类帖子,首页要抓取文章内容 content_list = data.xpath('.//div[@class="post-cont"]//text()') content = ''.join(content_list).strip() # print(floor) # 
print(content) item = {} item['platform'] = '19楼' date_all = data.xpath('.//span[@class="u-add-ft"]/@title')[0] item['date'] = date_all.split(' ')[0].replace('发表于', '') item['time'] = date_all.split(' ')[1] item['author'] = data.xpath('.//div[@class="user-name"]/a/span/text()')[0] try: item['author_id'] = data.xpath('.//div[@class="user-name"]/a/@href')[0].split('-')[1] except: item['author_id'] = '' item['post_client'] = '' item['title'] = title item['content'] = content item['content_id'] = url.split('-')[3] item['brand'] = '' item['carseries'] = '' item['from'] = '' item['series_url'] = '' item['url'] = url item['is_topics'] = '是' item['floor'] = '楼主' item['identification'] = '' item['favorite'] = '' item['signin_time'] = '' item['reply_no'] = replay_no item['views'] = views item['likes'] = '' item['is_elite'] = '' item['topic_count'] = '' item['reply_count'] = '' item['pick_count'] = '' item['follows'] = '' item['topic_categroy'] = '' item['topic_type'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] item['topic_id'] = url.split('-')[3] item['reply_floor'] = '' item['keyword'] = keyword item['file_code'] = '186' item['reposts_count'] = '' # print(item) self.__write_news_jsonfile(item) else: # 论坛类帖子 div_list = data.xpath('.//div[@id="view-bd"]/div[@id and @itemprop]') for div in div_list: content_list = div.xpath('.//div[@class="thread-cont"]//text()') content = ''.join(content_list).strip() floor = div.xpath('.//div[@class="cont-hd clearall"]/a[@data-pid]/text() | .//span[@itemprop="postSequenceNumber"]/text() | .//em[@itemprop="postSequenceNumber"]/text()') floor = ''.join(floor).strip() # print(floor) # print(content) item = {} item['platform'] = '19楼' if floor == '楼主': date_all = div.xpath('.//li[@title]/@title')[0] else: date_all = div.xpath('.//p[@class="fl link1"]/span/text()')[0] item['date'] = date_all.split(' ')[0].replace('发表于', '') item['time'] = date_all.split(' ')[1] item['author'] = div.xpath('.//div[@class="uname"]/a/@title')[0] try: item['author_id'] = div.xpath('.//div[@class="uname"]/a/@href')[0].split('-')[1] except: item['author_id'] = '' try: item['post_client'] = div.xpath('.//p[@class="forum-source fl link0"]/a/text()')[0] except: item['post_client'] = '' item['title'] = title item['content'] = content item['content_id'] = div.xpath('.//div[@id and @class="cont"]/@id')[0].replace('pid', '') item['brand'] = '' item['carseries'] = '' item['from'] = '' item['series_url'] = '' item['url'] = url if floor == '楼主': is_topics ='是' else: is_topics = '否' item['is_topics'] = is_topics item['floor'] = floor item['identification'] = '' item['favorite'] = '' item['signin_time'] = div.xpath('.//dl/dd[@class="color6" and @itemprop]/text()')[0] if is_topics == '是': item['reply_no'] = replay_no item['views'] = views else: item['reply_no'] = '' item['views'] = '' item['likes'] = '' item['is_elite'] = '' item['topic_count'] = '' item['reply_count'] = '' item['pick_count'] = '' item['follows'] = '' item['topic_categroy'] = '' item['topic_type'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] item['topic_id'] = url.split('-')[3] item['reply_floor'] = '' item['keyword'] = keyword item['file_code'] = '186' item['reposts_count'] = '' # print(item) self.__write_news_jsonfile(item) if data.xpath('.//a[@class="page-next"]/@href'): next_page_url = 'https:' + data.xpath('.//a[@class="page-next"]/@href')[0].strip() self.get_news_page(next_page_url, title, 
                               views, replay_no, keyword, is_first=False)

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # yesterday
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    # write one item out as a json line
    def __write_news_jsonfile(self, item):
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../19/{}_floor_news.json'.format(str(datetime.now()).split(' ')[0]), 'ab') as f:
        # with open('{}/{}_floor_news_adidas.json'.format(self.file_path, str(datetime.now()).split(' ')[0]), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write(
            '{}/{}/{}/186_{}_{}_floor_news.json'.format(
                self.file_path,
                self.file_name_time.split(' ')[0].replace('-', ''),
                self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item, encoding='utf-8')

    def run(self):
        url = 'https://www.19lou.com/search/thread?keyword={}&sorts=0&timeType=1&fids=undefined&usesearchtype=1'
        url_list = get_config_para('nike_daily_keywords')
        logger.log(31, url_list)
        for item in url_list:
            # print(1)
            keyword = item['keywords']
            logger.log(31, keyword)
            if keyword:
                search_url = url.format(keyword.strip())
                logger.info('search url: ' + search_url)
                try:
                    self.get_search_page(search_url, keyword)
                except:
                    logger.error(traceback.format_exc())
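# get_file_name_time() above (and the identical method in the other spiders
# below) folds the current hour into a 3-hour bucket and files the midnight run
# under the previous day as bucket "24". A standalone sketch of that rule;
# bucket_for_time is an illustrative name, not the original method.
from datetime import datetime, timedelta


def bucket_for_time(now):
    """Return 'YYYY-MM-DD N' where N is the 3-hour bucket, 24 for midnight."""
    bucket = (now.hour // 3) * 3
    if bucket == 0:
        # Midnight run: attribute the output files to yesterday's "24" bucket.
        return (now - timedelta(days=1)).strftime('%Y-%m-%d') + ' 24'
    return now.strftime('%Y-%m-%d') + ' ' + str(bucket)

# e.g. bucket_for_time(datetime(2019, 9, 16, 14, 5)) == '2019-09-16 12'
#      bucket_for_time(datetime(2019, 9, 16, 1, 30)) == '2019-09-15 24'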
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, file_path, comment_path): self.headers_one = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: # 对于凌晨 0 点的判断 # 时间判断部分 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] else: # 时间判断部分 date = datetime.now() - timedelta(days=0) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=0) # 昨天时间 yesterday = str(yesterday).split(' ')[0] # print('爬取时间段:{}到{}'.format(news_start_time, yesterday)) # logger.log(31, '爬取时间段:{}到{}'.format(news_start_time, yesterday)) # # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = yesterday # 标记爬虫工作 self.is_work = True self.xhsapi = XhsApi('8ac1d719cd0a2d16') # 代理服务器 proxyHost = "http-cla.abuyun.com" proxyPort = "9030" # 代理隧道验证信息 proxyUser = "******" proxyPass = "******" proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": proxyHost, "port": proxyPort, "user": proxyUser, "pass": proxyPass, } self.proxies = { "http": proxyMeta, "https": proxyMeta } self.set_list = [] # self.info = seeeion_id_list self.file_name_time = self.get_file_name_time() pool = redis.ConnectionPool(host='192.168.1.208', port=6379, password='******') # 实现一个Redis连接池 self.redis_example = redis.Redis(connection_pool=pool) self.error_count = 0 with open('./session_id_list_hour.json') as f: session_id = f.read() self.session_id_list = eval(session_id) self.file_path = file_path self.comment_apth = comment_path self.hdfsclient = HdfsClient(url='http://jq-chance-05:9870', user='******') hour = str(datetime.now()).split(' ')[-1].split(':')[0] if str(hour) != '00': two_hour_ago = int(hour) - 2 if len(str(two_hour_ago)) == 1: two_hour_ago = '0' + str(two_hour_ago) self.hour_name = str(two_hour_ago) + '_' + str(hour) else: self.hour_name = '22_24' self.hdfsclient.makedirs('{}/{}/{}'.format(self.file_path, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name)) # 创建每日文件夹 self.hdfsclient.makedirs('{}/{}/{}'.format(self.comment_apth, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name)) # 创建每日文件夹 self.time_time = str(time.time()).split('.')[0] self.session_id_error = [] def change_ip(self): logger.log(31, '开始切换ip') url = 'http://proxy.abuyun.com/switch-ip' time.sleep(random.randint(1, 15)) response = requests.get(url, proxies=self.proxies) logger.log(31, '现使用ip:'+ response.text) def res_ip_three_hour(self): """ 25分钟-3小时 :return: """ headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': 'webapi.http.zhimacangku.com', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', } # 5-25分 500个ip url = 'http://http.tiqu.alicdns.com/getip3?num=1&type=1&pro=110000&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1®ions=&gm=4' time.sleep(random.randint(3, 6)) ip_pro = requests.get(url, headers=headers) ip = ip_pro.text.strip() return ip def get_serach_list(self, page, keyword): info = 
random.choice(self.session_id_list) # info = self.sid_info logger.log(31, 'session_id下标: ' + str(self.session_id_list.index(info))) self.xhsapi.set_smid(info['device_fingerprint']) self.xhsapi.set_session_id(info['sid'].split('.')[-1]) search_ret = self.xhsapi.search(keyword, 1, 20) # print(222222222222, search_ret) if '{"msg":"","result":0,"success":true}' in search_ret: self.session_id_error.append(info) if self.session_id_error.count(info) > 5: logger.log(31, '无效id:' + str(info)) del self.session_id_list[self.session_id_list.index(info)] if self.error_count > 5: self.change_ip() self.error_count = 0 self.error_count += 1 self.get_serach_list(page, keyword) return if '参数错误' in search_ret: logger.log(31, '参数错误,重试.....') self.get_serach_list(page, keyword) return json_text = json.loads(search_ret) # print(json_text) note_list = json_text["data"]["notes"] for note in note_list: title = note["title"] if not title: title = note["desc"] id = note["id"] time.sleep(0.1) if id not in self.set_list and not self.redis_example.sismember('xiaohongshu_out_day_url_hour', id): logger.log(31, '标题: ' + title) try: self.get_note(id, keyword) except: print(traceback.format_exc()) self.set_list.append(id) else: logger.log(31, '根据去重列表和从redis中判断时间不符合......' + str(id)) def get_note(self, note_id, keyword): info = random.choice(self.session_id_list) # info = self.sid_info logger.log(31, 'session_id下标: ' + str(self.session_id_list.index(info))) self.xhsapi.set_smid(info['device_fingerprint']) self.xhsapi.set_session_id(info['sid'].split('.')[-1]) note_ret = self.xhsapi.get_note(note_id) # print(333333, note_ret) if '参数错误' in note_ret: logger.log(31, '参数错误,重试.....') self.get_note(note_id, keyword) return # print(response.text) # if '"result":0' in response.text and 'msg:' in response.text: # logger.log(31, '无效id:', info) # del self.session_id_list[self.session_id_list.index(info)] # return if '{"msg":"","result":0,"success":true}' in note_ret: self.session_id_error.append(info) if self.session_id_error.count(info) > 5: logger.log(31, '无效id:' + str(info)) del self.session_id_list[self.session_id_list.index(info)] if self.error_count > 5: self.change_ip() self.error_count = 0 self.error_count += 1 self.get_note(note_id, keyword) return json_text = json.loads(note_ret) # print(11111, json_text) data = json_text["data"][0]['note_list'][0] item = {} item['platform'] = '小红书' # print(222222, data) date_all = data['time'] time_local = time.localtime(float(date_all)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" logger.log(31, "时间: " + str(dt)) # # 做时间判断部分--------------- get_news_time = time.mktime(time.strptime(str(dt).split(' ')[0], "%Y-%m-%d")) end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d")) if self.start_time != '': start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d")) else: start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d")) if float(get_news_time) < float(start_time): logger.log(31, '不符合时间') self.redis_example.sadd('xiaohongshu_out_day_url_hour', note_id) elif float(start_time) <= float(get_news_time) <= float(end_time): # print('符合时间') news_date = dt.split(' ')[0] news_time = dt.split(' ')[1] item['date'] = news_date item['time'] = news_time title = data['share_info']["title"] item['title'] = title item['content'] = data["desc"] note_id = data["id"] item['content_id'] = note_id item['article_author'] = data["user"]["nickname"] item['clicks'] = '' item['views'] = data['view_count'] comments = data["comments_count"] 
item['comments_count'] = comments item['likes'] = data["liked_count"] item['dislikes'] = '' item['keyword'] = keyword article_url = data['share_info']["link"] item['article_url'] = article_url item['series_url'] = '' item['list_url'] = '' item['article_type'] = '' item['article_source'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] item['topic_id'] = note_id item['author_id'] = data["user"]["id"] item['file_code'] = '28' item['reposts_count'] = data['shared_count'] # print(item) self.write_news_jsonfile(item) # self.queue.put(item) if int(comments) > 0: try: self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title) except: logging.error(traceback.format_exc()) try: self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title) except: logging.error(traceback.format_exc()) # @retry(stop_max_attempt_number=2, retry_on_exception=retry_if_key_error) def get_note_comment(self, note_id, keyword, article_url, news_date, news_time, title, start='', now_page=1): if start: response = self.xhsapi.get_note_comments(note_id, 20, start) else: response = self.xhsapi.get_note_comments(note_id, 20) # if '"result":0' in response.text and 'msg:' in response.text: # del self.session_id_list[self.session_id_list.index(s)] # return data = json.loads(response) # print(data) comment_list = data['data']["comments"] comment_count = data['data']["comment_count_l1"] last_comment_id = '' total_item = '' for comment in comment_list: item = {} item['platform'] = '小红书' item['source_date'] = news_date item['source_time'] = news_time date_all = comment['time'] # #转换成localtime time_local = time.localtime(float(date_all)) # 转换成新的时间格式(2016-05-05 20:28:54) comment_date = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" # # 做时间判断部分--------------- # get_news_time = time.mktime(time.strptime(str(comment_date), "%Y-%m-%d %H:%M")) # # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d %H:%M")) # if self.start_time != '': # start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d %H:%M")) # else: # start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d %H:%M")) # if float(get_news_time) < float(start_time): # self.is_work = False # return # # if float(start_time) <= float(get_news_time): get_news_time = time.mktime(time.strptime(str(comment_date).split(' ')[0], "%Y-%m-%d")) end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d")) if self.start_time != '': start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d")) else: start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d")) if float(get_news_time) < float(start_time): self.is_get_comment = False # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取, break elif float(start_time) <= float(get_news_time) <= float(end_time): item['date'] = comment_date.split(' ')[0] item['time'] = comment_date.split(' ')[1] item['title'] = title item['author'] = comment['user']["nickname"] item['author_id'] = comment['user']["userid"] item['content'] = comment["content"] comment_id = comment["id"] last_comment_id = comment_id item['content_id'] = comment_id item['floor'] = '' item['keyword'] = keyword item['source_url'] = article_url item['comment_url'] = article_url item['views'] = '' item['comments_count'] = '' item['likes'] = comment["like_count"] item['dislikes'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] item['topic_id'] = note_id item['file_code'] = '42' 
item['reposts_count'] = '' # print(item) # print(11111111, item) item = json.dumps(dict(item), ensure_ascii=False) + '\n' total_item = total_item + item self.write_comment_jsonfile(total_item) # self.comment_queue.put # print(last_comment_id) all_page_num = math.ceil(float(int(comment_count)/20)) if int(all_page_num) > now_page and self.is_work: now_page += 1 time.sleep(0.1) try: self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title, last_comment_id, now_page) except: try: self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title, last_comment_id, now_page) except: pass # 写入json文件 def write_news_jsonfile(self, item): logging.log(31, '写入文章数据,') item = json.dumps(dict(item), ensure_ascii=False) + '\n' # with open('./28_{}_xiaohongshu_article.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f: # f.write(item.encode("utf-8")) self.hdfsclient.new_write('{}/{}/{}/28_{}_{}_xiaohongshu_article.json'.format(self.file_path, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name, str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item,encoding='utf-8') def write_comment_jsonfile(self, item): logging.log(31, '写入评论数据') # item = json.dumps(dict(item), ensure_ascii=False) + '\n' # with open('./42_{}_xiaohongshu_comment.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f: # f.write(item.encode("utf-8")) self.hdfsclient.new_write('{}/{}/{}/42_{}_{}_xiaohongshu_comment.json'.format(self.comment_apth, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name, str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item,encoding='utf-8') def get_file_name_time(self): a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: num = 24 a = str(datetime.now() - timedelta(days=1)) # 昨天时间 num = a.split(' ')[0] + ' ' + str(num) return num def run(self): try: url_list = get_config_para('nike_daily_keywords') for index, item in enumerate(url_list): # print(1) keyword = item['keywords'] logger.log(31, '关键词:' + keyword) for i in range(1, 20): print('获取搜索列表页: ', i) try: self.get_serach_list(str(i), keyword) except: logger.error(traceback.format_exc()) except: print(traceback.format_exc()) logger.critical(traceback.format_exc())
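# get_note_comment() above pages through a note's comments recursively: each
# call passes the id of the last comment it handled back in as the `start`
# cursor, and recursion stops after ceil(comment_count / 20) pages. The same
# cursor pagination written as a plain loop; fetch_page is an illustrative
# stand-in for XhsApi.get_note_comments, not the real client.
import json
import math


def iter_comment_pages(fetch_page, note_id, comment_count, page_size=20):
    """Yield one decoded comment list per page, following the cursor."""
    start = ''
    for _ in range(math.ceil(int(comment_count) / page_size)):
        raw = (fetch_page(note_id, page_size, start) if start
               else fetch_page(note_id, page_size))
        comments = json.loads(raw)['data']['comments']
        if not comments:
            break
        yield comments
        start = comments[-1]['id']  # cursor for the next page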
class AiQiYi(object): """ 这是一个爬虫模板 """ def __init__(self, file_path, comment_path): # 时间判断部分 date = datetime.now() - timedelta(days=7) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] # print('爬取时间段:{}到{}'.format(news_start_time, yesterday)) logging.info('爬取时间段:{}到{}'.format(news_start_time, yesterday)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # self.start_time = '2019-09-09' # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = yesterday # self.end_time = '2019-09-16' # 标记爬虫工作 self.is_work = True self.is_stop = False self.file_name_time = self.get_file_name_time() self.file_path = file_path self.comment_apth = comment_path self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000', user='******') self.hdfsclient.makedirs('{}/{}'.format( self.file_path, str(datetime.now()).split(' ')[0].replace('-', ''))) # 创建每日文件夹 self.hdfsclient.makedirs('{}/{}'.format( self.comment_apth, str(datetime.now()).split(' ')[0].replace('-', ''))) # 创建每日文件夹 self.time_time = str(time.time()).split('.')[0] def get_video_list(self, url, keyword): logger.info('搜索url: ' + url + ' ' + keyword) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } response = requests.get(url, headers=headers, timeout=120) data = etree.HTML(response.content.decode()) # print(response.text) video_list = data.xpath('.//div[@class="qy-search-result-item"]') for video in video_list: try: video_url = video.xpath( './/h3[@class="qy-search-result-tit title-line"]/a/@href' )[0] try: video_time = video.xpath( './/div[@class="qy-search-result-info half"]/span[@class="info-des"]/text()' )[0] except: video_time = str(datetime.now()).split(' ')[0] logger.info('视频时间:' + video_time) # 做时间判断部分--------------- get_news_time = time.mktime( time.strptime(video_time, "%Y-%m-%d")) end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d")) start_time = time.mktime( time.strptime(self.start_time, "%Y-%m-%d")) if float(get_news_time) < float(start_time): self.is_stop = True # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取, break elif float(start_time) <= float(get_news_time) <= float( end_time): try: if 'http' not in video_url: video_url = 'https:' + video_url print(2222222, video_url) self.get_video_page(video_url, keyword) except: print(traceback.format_exc()) logger.error(traceback.format_exc()) except: print(traceback.format_exc()) if data.xpath('.//a[@data-key="down"]') and not self.is_stop: next_page = data.xpath('.//a[@data-key="down"]/@href')[0] next_page = 'https://so.iqiyi.com' + next_page.strip() print(next_page) self.get_video_list(next_page, keyword) def get_video_page(self, url, keyword): logger.info('视频url: ' + url) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } response = requests.get(url, headers=headers, timeout=120) # data = etree.HTML(response.content.decode()) # page_info = data.xpath('.//div[@id="iqiyi-main"]/div/@page-info') # print(page_info) if response.status_code == 200: text = response.text get_page_info = re.search("page-info=[\S\s]*video-info", text)[0] try: page_info = get_page_info[11:][:-13] page_info = json.loads(page_info) except: try: page_info = get_page_info[11:][:-14] page_info = json.loads(page_info) except: # print(get_page_info) logger.error(traceback.format_exc()) # print(page_info) video_info = 
re.search("video-info=[\S\s]*}'>", text)[0] video_info = video_info[12:][:-2] video_info = json.loads(video_info) item = {} item['platform'] = '爱奇艺' date_all = video_info['firstPublishTime'] date_all = str(date_all)[:-3] # #转换成localtime time_local = time.localtime(float(date_all)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local) # "%Y-%m-%d %H:%M:%S" item['date'] = dt.split(' ')[0] # 发布日期 item['time'] = dt.split(' ')[1] # 发布时间 item['title'] = video_info['name'] # 视频标题 item['description'] = video_info['description'] # 视频描述 try: item['source_author'] = video_info['user']['name'] # 来源/上传者 item['followers_count'] = video_info['user'][ 'followerCount'] # 粉丝数 except: item['source_author'] = '' item['followers_count'] = '' item['clicks'] = '' # 点击数 item['play'] = '' # 播放量 item['keyword'] = keyword # 关键词 item['url'] = url # URL try: item['categroy'] = video_info['categories'][0]['name'] # 视频分类 except KeyError: item['categroy'] = '' video_id = video_info['tvId'] likes = self.get_likes_count(video_id) # 获取点赞数 item['likes'] = likes # 点赞数 page = 1 comment_count = self.get_comment_count(video_id, page) # 获取评论数 item['comments_count'] = comment_count # 评论数 item['topic_id'] = url.split('/')[-1].split('.')[0] # 主贴id item['author_id'] = video_info['user']['id'] # 作者id item['content_id'] = url.split('/')[-1].split('.')[0] item['file_code'] = '111' item['reposts_count'] = '' # print(item) self.write_news_jsonfile(item) if int(comment_count) > 0: self.get_comment(video_id, page, url, video_info['name'], comment_count, keyword, dt.split(' ')[0], dt.split(' ')[1]) # 获取评论 def get_likes_count(self, video_id): url = 'http://iface2.iqiyi.com/like/count?businessType=14&entityId={}&qyid=63204618cb07f6722139214f3b31f1b0&callback=jsonp_1550734824178_93496'.format( str(video_id)) response = requests.get(url) text = response.text text = text[30:][:-2] text = json.loads(text) likes = text['data'] return likes def get_comment_count(self, video_id, page): """ 获取评论数量 :param video_id: :param page: :return: """ # http://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id=31067882509&hot_size=10&last_id=&page=1&page_size=10&types=hot,time&callback=jsonp_1550734826037_45721 url = 'http://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id={}&hot_size=10&last_id=&page={}&page_size=20&types=hot,time&callback=jsonp_1550734826037_45721'.format( str(video_id), str(page)) response = requests.get(url) text = response.text text = text[31:][:-14] text = json.loads(text) # print(text) comment_count = text['data']['count'] # print(comment_count) return comment_count def get_comment(self, video_id, page, source_url, title, comment_count, keyword, source_date, source_time): """ 获取评论内容, 和上面的分开写是为了方便调用和修改 :param video_id: :param page: :return: """ # http://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id=31067882509&hot_size=10&last_id=&page=1&page_size=10&types=hot,time&callback=jsonp_1550734826037_45721 url = 'http://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id={}&hot_size=10&last_id=&page={}&page_size=20&types=hot,time&callback=jsonp_1550734826037_45721'.format( str(video_id), page) response = requests.get(url) text = response.text text = text[31:][:-14] text = json.loads(text) 
# print(22222, text) comments_list = text['data']['comments'] for comment in comments_list: # print(comment) item = {} item['platform'] = '爱奇艺' time_all = comment['addTime'] # #转换成localtime time_local = time.localtime(float(time_all)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local) # "%Y-%m-%d %H:%M:%S" # print(dt) item['date'] = str(dt).split(' ')[0] item['time'] = str(dt).split(' ')[1] item['title'] = title item['author'] = comment['userInfo']['uname'] item['content'] = comment['content'] item['floor'] = comment['floor'] item['keyword'] = keyword item['comment_url'] = source_url item['source_url'] = source_url item['comments_count'] = '' item['likes'] = comment['likes'] item['views'] = '' item['topic_id'] = source_url.split('/')[-1].split('.')[0] # 主贴id item['author_id'] = comment['userInfo']['uid'] # 作者id item['content_id'] = comment['id'] # 作者id item['file_code'] = '112' item['source_date'] = source_date item['source_time'] = source_time item['reposts_count'] = '' self.write_comment_jsonfile(item) if int(comment_count) > 20 * page: # 判断评论数量,进行翻页操作 page += 1 self.get_comment(video_id, page, source_url, title, comment_count, keyword, source_date, source_time) def get_file_name_time(self): a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: num = 24 a = str(datetime.now() - timedelta(days=1)) # 昨天时间 num = a.split(' ')[0] + ' ' + str(num) return num # 写入json文件 def write_news_jsonfile(self, item): logging.log(31, '写入数据......') item = json.dumps(dict(item), ensure_ascii=False) + '\n' # with open('./../aiqiyi/json_file/{}/{}_aiqiyi_video_adidas.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f: # f.write(item.encode("utf-8")) try: # str_time = time.time() self.hdfsclient.new_write( '{}/{}/111_{}_{}_aiqiyi_video.json'.format( self.file_path, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') # get_time = time.time() - str_time # print('用时:', get_time) except: logging.log(31, '视频数据写入重试中.....') self.write_news_jsonfile(item) def write_comment_jsonfile(self, item): logging.log(31, '写入评论数据......') item = json.dumps(dict(item), ensure_ascii=False) + '\n' # with open('./../aiqiyi/json_file/{}/{}_aiqiyi_video_comment_adidas.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f: # f.write(item.encode("utf-8")) try: self.hdfsclient.new_write( '{}/{}/112_{}_{}_aiqiyi_video_comment.json'.format( self.comment_apth, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') except: logging.log(31, '评论数据写入重试中.....') self.write_comment_jsonfile(item) def run(self): url_list = get_config_para('nike_daily_keywords') logger.log(31, url_list) for item in url_list: # print(1) keyword = item['keywords'] logger.log(31, keyword) # https://so.iqiyi.com/so/q_%E5%A5%A5%E8%BF%AAA3%E4%B8%A4%E5%8E%A2_ctg_%E6%B1%BD%E8%BD%A6_t_0_page_1_p_1_qc_0_rd__site_iqiyi_m_4_bitrate_?af=true # for keyword in cols: url = 'https://so.iqiyi.com/so/q_{}_ctg__t_0_page_1_p_1_qc_0_rd_2_site_iqiyi_m_4_bitrate_?af=true'.format( keyword) self.is_stop = False self.get_video_list(url, keyword)
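# get_likes_count() and get_comment_count() above strip the JSONP wrapper with
# fixed slice offsets (text[30:][:-2] and text[31:][:-14]), which only works
# while the callback name keeps exactly that length. A more tolerant way to
# unwrap the same kind of response, assuming it contains a single
# callback({...}) payload; strip_jsonp is an illustrative helper, not part of
# the original code.
import json
import re


def strip_jsonp(text):
    """Return the JSON payload of a `callback({...})`-style JSONP response."""
    match = re.search(r'\((\{[\s\S]*\})\)', text)
    if match is None:
        raise ValueError('not a JSONP response: %r' % text[:80])
    return json.loads(match.group(1))

# e.g. strip_jsonp('jsonp_1550734826037_45721({"data":{"count":12}});')
#      returns {'data': {'count': 12}}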
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, redis_example): # 时间部分 # 爬虫开始抓取的日期 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] # 爬虫结束的抓取日期 current_time = datetime.now() # 当前日期 current_day = str(current_time).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, current_day)) logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = current_day # 标记爬虫工作 self.is_work = False self.redis_example = redis_example self.pid = os.getpid() # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/ecommerce/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\n', '', ret1) ret3 = re.sub(r'\u3000', '', ret2) ret4 = re.sub(r'品牌:', '', ret3) ret5 = re.sub(r'\xa0', '', ret4) ret6 = re.sub(r'→_→', '', ret5) ret7 = re.sub(r'…', '', ret6) ret8 = re.sub(r'https:', '', ret7) ret9 = re.sub(r'\[', '', ret8) ret10 = re.sub(r'\]', '', ret9) ret11 = re.sub(r"'", "", ret10) return ret11 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass def parse_goods_url(self, items): goods_dict = dict() goods_dict['平台'] = items['平台'] goods_dict['关键词'] = items['关键词'] goods_dict['商品名'] = items['商品名'] goods_dict['商品图片'] = items['商品图片'] goods_dict['URL'] = items['URL'] goods_dict['shop_name'] = items['shop_name'] goods_dict['价格'] = items['价格'] goods_dict['goods_id'] = items['goods_id'] goods_dict['品牌'] = items['品牌'] goods_dict['月销量'] = '' # logger.log(31, '--------********正在抓取的商品是:%s********--------' % goods_dict) self.parse_comment_num(goods_dict) # # 解析商品品牌信息 # def parse_goods_details(self, items): # try: # # print(goods_dict) # goods_url = items['URL'] # # print(goods_url) # # # 截取评论拼接url里面的productId # productId = items['goods_id'] # # print(productId) # headers = { # 'content-type': 'text/html; charset=gbk', # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 
3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_5_1556518922567; __jdb=122270672.8.15553802647041324770645|15.1556518168', # 'upgrade-insecure-requests': '1', # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' # } # try: # time.sleep(0.2) # response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # except (requests.exceptions.ProxyError, requests.exceptions.ConnectionError, ConnectionResetError): # try: # time.sleep(0.2) # response = requests.get(url=goods_url, headers=headers, allow_redirects=False, timeout=30) # except: # time.sleep(0.2) # response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # # print(response.content) # # 将响应转换成一个element对象 # html = etree.HTML(response.content) # # print(html) # # 获取所有品牌节点列表 # try: # pin_pai = html.xpath('//div[@class="p-parameter"]/ul[1]/li/@title')[0] # except: # pin_pai = '' # # print(pin_pai_list) # goods_dict = dict() # goods_dict['平台'] = items['平台'] # goods_dict['关键词'] = items['关键词'] # goods_dict['URL'] = items['URL'] # goods_dict['价格'] = items['价格'] # goods_dict['商品名'] = items['商品名'] # goods_dict['品牌'] = pin_pai # goods_dict['月销量'] = '' # goods_dict['shop_name'] = items['shop_name'] # goods_dict['productId'] = productId # # print(goods_dict) # self.parse_comment_num(goods_dict) # except: # print(111111111111111111111111, traceback.format_exc()) # 抓取商品评论数 def parse_comment_num(self, goods_dict): try: productId = goods_dict['goods_id'] referer_url = goods_dict['URL'] comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1'.format( productId) headers = { 'content-type': 'text/html;charset=GBK', 'authority': 'sclub.jd.com', 'method': 'GET', # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; 
JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1', 'referer': referer_url, 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } try: time.sleep(0.1) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.1) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.1) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) comment_data = re.search(r'{"productAttr":null.*]}', response.text) if 'commentCount' in response.text: comment_number = json.loads(comment_data.group( ))['productCommentSummary']['commentCount'] goods_dict['comment_num'] = json.loads(comment_data.group( ))['productCommentSummary']['commentCountStr'] if int(comment_number) == 0: # print('****************该商品没有评论数据*********') # logger.log(31, '****************该商品没有评论数据*********') pass else: pages = int(math.ceil(float(int(comment_number) / 10))) self.goods_comments(goods_dict, pages) except: print(33333333333333333333333, traceback.format_exc()) # 解析商品评论 def goods_comments(self, goods_dict, pages): try: is_break = self.is_work # print(goods_dict) productId = goods_dict['goods_id'] headers = { 'content-type': 'text/html;charset=GBK', 'authority': 'sclub.jd.com', 'method': 'GET', # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1', 'referer': '{}'.format(goods_dict['URL']), 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } if int(pages) >= 50: pages_num = 59 else: pages_num = pages # 抓取商品评论链接(总共50页,第一页从0开始) for i in range(0, int(pages_num)): comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1'.format( productId, i) # print(comment_url) try: time.sleep(0.1) response = requests.get(url=comment_url, 
headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except (requests.exceptions.ProxyError, requests.exceptions.ConnectionError, ConnectionResetError): try: time.sleep(0.1) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.1) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # logger.log(31, "正在抓取的页面是: %s" % comment_url) comments = response.text comment = re.search( r'{"productAttr":null.*"afterDays":0}]}|{"productAttr":null.*]}', comments) # 总销量 if 'comments' in comments: items = json.loads(comment.group())['comments'] # print(pages_num, len(items)) if int(len(items)) == 0: break else: for item in items: date_data = item['creationTime'].split( ' ')[0].strip() time_data = item['creationTime'].split( ' ')[1].strip() # print(date, time) try: content = self.re_html(item['content']) except: content = '' # 追加评论 try: comments_2 = item['afterUserComment'][ 'content'] except: comments_2 = '' # 判断评论时间是否在规定的抓取时间内 if self.start_time <= date_data: goods_comment_dict = dict() goods_comment_dict['platform'] = goods_dict[ '平台'] goods_comment_dict['date'] = date_data.strip() goods_comment_dict['time'] = time_data.strip() goods_comment_dict['keyword'] = goods_dict[ '关键词'] goods_comment_dict['name'] = goods_dict['商品名'] goods_comment_dict['imageurl'] = goods_dict[ '商品图片'] goods_comment_dict['audiourl'] = '' goods_comment_dict['url'] = goods_dict['URL'] goods_comment_dict['shop_name'] = goods_dict[ 'shop_name'] goods_comment_dict['user_name'] = item[ 'nickname'] goods_comment_dict['author_id'] = '' goods_comment_dict[ 'content'] = content + ';' + comments_2 goods_comment_dict['content_id'] = str( item['id']) goods_comment_dict['brand'] = goods_dict['品牌'] goods_comment_dict['price'] = goods_dict['价格'] goods_comment_dict['sales'] = goods_dict['月销量'] goods_comment_dict['focus_count'] = '' goods_comment_dict['comment_num'] = goods_dict[ 'comment_num'] goods_comment_dict['views'] = '' goods_comment_dict['likes'] = item[ 'usefulVoteCount'] try: goods_comment_dict[ 'comments_count'] = item['replyCount'] except: goods_comment_dict['comments_count'] = '' goods_comment_dict['reposts_count'] = '' goods_comment_dict['topic_id'] = str( goods_dict['goods_id']) try: goods_comment_dict['type'] = item[ 'productColor'] except: goods_comment_dict['type'] = '' try: goods_comment_dict['size'] = item[ 'productSize'] except: goods_comment_dict['size'] = '' goods_comment_dict['file_code'] = '51' # logger.log(31, '-----------正在写入符合时间的商品信息----------------') # print(goods_comment_dict) item = json.dumps(dict(goods_comment_dict), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/ecommerce/{}/51_{}_{}_jingdong_nike{}.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.time_data, self.pid), item, encoding='utf-8') if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(22222222222222222222222, traceback.format_exc()) def run(self, lock): for num in range(1000000): lock.acquire() redis_url_num = self.redis_example.llen('JingDong_day_url') if str(redis_url_num) == '0': print( '*****************Redis消息队列中url为空,程序等待中.....进程 {} 等待中......******************' .format(str(os.getpid()))) item = self.redis_example.brpop('JingDong_day_url', timeout=600)[1] lock.release() item1 = json.loads(item.decode()) # print(item) self.parse_goods_url(item1)
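# goods_comments() above walks at most 50 comment pages and flags is_break once
# it meets a comment dated before self.start_time, which relies on the feed
# being sorted newest-first (sortType=6 in the request URL). A compact variant
# of that cut-off as a generator that stops at the first out-of-window comment;
# comments_in_window and its arguments are illustrative names, not the original
# method.
def comments_in_window(pages, start_date):
    """Yield comments no older than start_date, stopping at the first older one.

    `pages` is an iterable of comment lists (newest first); each comment is a
    dict carrying a 'creationTime' such as '2019-09-16 12:30:00'.
    """
    for page in pages:
        for comment in page:
            if comment['creationTime'].split(' ')[0] >= start_date:
                yield comment
            else:
                return  # everything after this point is outside the crawl window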
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, redis_example): # 时间判断部分 date = datetime.datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] now = datetime.datetime.now() - timedelta(days=0) # 昨天时间 now_date = str(now).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, now_date)) logging.info('爬取时间段:{}到{}'.format(news_start_time, now_date)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = now_date # 标记爬虫工作 self.is_work = False self.redis_example = redis_example self.pid = os.getpid() # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/ecommerce/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\[', '', ret1) ret3 = re.sub(r'\]', '', ret2) ret4 = re.sub(r'广告这些是您在亚马逊上看到的商品广告。点击广告,您将前往商品所在页面。了解更多关于广告的信息', '', ret3) ret5 = re.sub(r'\\xa0', '', ret4) ret6 = re.sub(r'海外购满200元免运费', '', ret5) ret7 = re.sub(r'更多购买选择', '', ret6) ret8 = re.sub(r'品牌', '', ret7) ret9 = re.sub(r'"append","#cm_cr-review_list",', '', ret8) ret10 = re.sub(r'"', '', ret9) return ret10 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 过滤user_name def re_user_name(self, data): try: message = str(data) ret1 = re.sub(r'前\d+名评论人', '', message) ret2 = re.sub(r'\n', '', ret1) return ret2 except: pass # 匹配具体时间 def clean_date(self, x): now = datetime.datetime.now() if str(x).find('昨天') != -1: x = datetime.datetime.strftime(now + datetime.timedelta(days=-1), '%Y-%m-%d %H:%M:%S') elif str(x).find('前天') != -1: x = datetime.datetime.strftime(now + datetime.timedelta(days=-2), '%Y-%m-%d %H:%M:%S') elif str(x).find('天前') != -1: x = datetime.datetime.strftime( now + datetime.timedelta(days=-int(str(x).replace('天前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('小时前') != -1: x = datetime.datetime.strftime( now + datetime.timedelta(hours=-int(str(x).replace('小时前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('分钟前') != -1: x = datetime.datetime.strftime( now + datetime.timedelta(minutes=-int(str(x).replace('分钟前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('今天') != -1: x = str(x).replace('今天', now.strftime('%Y-%m-%d') + ' ') elif str(x).find('刚刚') != -1: x = now.strftime('%Y-%m-%d %H:%M:%S') elif str(x).find('秒前') != -1: x = now.strftime('%Y-%m-%d %H:%M:%S') elif str(x).find('月前') != -1: x = datetime.datetime.strftime( now + datetime.timedelta(weeks=-4 * int(str(x).replace('月前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('周前') != -1: x = datetime.datetime.strftime( now + datetime.timedelta(weeks=-int(str(x).replace('周前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('[') != -1: x = x.replace('[', '').replace(']', '') elif str(x).find('月') != -1: x = x.replace('月', '-').replace('日', '') return x # 抓取商品品牌信息 def parse_goods_brand(self, goods_dict): try: # print(goods_dict) url = goods_dict['url'] # print('*************************商品详情页' + url) headers = { # 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', # 'Cookie': 'x-wl-uid=1TZhgwHTJAuLn8oreMzKQn1F14u+yWLnkVnV1mHxoFBZVluB35GzI3vNZyOaUXm1eXxDdVSvG/jk=; session-id=461-0953337-2517812; ubid-acbcn=462-0558053-9620064; i18n-prefs=CNY; lc-acbcn=zh_CN; 
x-amz-captcha-1=1565689220474259; x-amz-captcha-2=O0HfV0HAdNq8q0k6ODz5yA==; session-token=Masvfy+QDMESO49Iacs+I77sObVPwrSbsVNucyNsgXupKeHI3zVO2/zgQTAUsJUOOcC8swOMHILZfrVmo85e45fYuETObv3I2N3CYtSgBaET4WZ1l7qnzkzQ0yWNVcqvgtSbNDZXWNii93OIcke5QSx0Y3kmJZaGk5+H9Nn2rD7c2YStoxaV/0yQ0UsfRfwj; csm-hit=tb:s-SKSGNJDF9HE5MK9C3DDT|1566530133484&t:1566530133820&adb:adblk_yes; session-id-time=2082729601l', 'Host': 'www.amazon.cn', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } try: # time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: # time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: # time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # response.encoding = 'utf-8' # print(response.text) asin_id = response.url.split('/')[4] # print(asin_id) # 将响应转换成一个element对象 html = etree.HTML(response.text) # 获取商品评论数 re_comments_num = html.xpath( '//span[@class="a-size-base a-color-secondary"]/text()') # print(re_comments_num) if re_comments_num == []: # logger.log(31, '--------------没有商品评论信息----------------') pass else: comments_num = self.re_not_number(re_comments_num[0]) # print('评论数: ', comments_num) # 评价人数 goods_dict['achieve_num'] = comments_num # 获取商品品牌信息 brand_data1 = re.search(r'品牌</td><td class="value">.*?</td>', response.text) if brand_data1 != None: brand_name = self.re_html(brand_data1.group()) else: brand_data2 = html.xpath( '//div[@id="ppd"]/div[2]/div[2]/div/div/div[1]/div[1]/div/a/text()' ) if brand_data2 == []: brand_name = '' else: try: brand_name = brand_data2[0].split(' ')[0] except: brand_name = '' # 商品品牌 goods_dict['brand'] = brand_name # 销量 goods_dict['sales'] = '' goods_dict['asin_id'] = asin_id # print(goods_dict) # 抓取页数 page_num = int( math.ceil( float( int(self.re_not_number(goods_dict['achieve_num'])) / 10))) # print('***---回复数: %s,页数:%s ***---' % (comments_num, page_num)) # 抓取评论量 self.parse_amazon_comment(page_num, goods_dict) except: print(22222222222222222222, traceback.format_exc()) # 抓取页数大于0的评论 def parse_amazon_comment(self, page_num, goods_dict): try: is_break = self.is_work # print(goods_dict['url']) headers = { 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 'Cookie': 'session-id=457-6049818-5407509; i18n-prefs=CNY; ubid-acbcn=461-1543774-5730813; x-wl-uid=1D2HfAfNoe4eUdJ6ZzyM2fnvna5QixxATqyW5m655FgD9MFQ0BQOrYAub+2t2juEPWKvSIO9wETU=; lc-acbcn=zh_CN; session-token=q7jDZTzYPSN0ujucLEDRVnx7QbLwQdbfOyVVn5sdYL1XaQm73hM1Kf01JGRuR/3AZ1IX24BUDL3mq5DGmIiN1UAQ/DtCP/HhHNLIw9ct8KzziVH+J5r2FrvA0ObuVLGlqYbghQbq2Ddhm8zB/AHX7OHvGD0LWTIaDpfYJ62e2fz813rIz0IkwKLvoFjSiT+G; session-id-time=2082729601l; csm-hit=tb:Q0KNXH65T2X9SHESP2YH+s-9R0M13527VFRJHPP284C|1574144443485&t:1574144443485&adb:adblk_yes', 'Host': 'www.amazon.cn', 'Origin': 'https://www.amazon.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.amazon.cn/Nike-%E8%80%90%E5%85%8B-Revolution-4-%E7%94%B7%E5%A3%AB%E8%B7%91%E6%AD%A5%E9%9E%8B/product-reviews/B079QP634Q/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest' } if int(page_num) >= 50: pages = 50 else: pages = page_num for i in range(1, 
int(pages) + 1): # print('***---抓取评论页为第%s页***---' % i) url = 'https://www.amazon.cn/hz/reviews-render/ajax/reviews/get/ref=cm_cr_getr_d_paging_btm_prev_{}'.format( i) # print(url) form_data = { 'sortBy': 'recent', 'reviewerType': 'all_reviews', 'formatType': '', 'mediaType': '', 'filterByStar': '', 'pageNumber': i, 'filterByLanguage': '', 'filterByKeyword': '', 'shouldAppend': 'undefined', 'deviceType': 'desktop', 'reftag': 'cm_cr_getr_d_paging_btm_prev_{}'.format(i), 'pageSize': '10', 'asin': goods_dict['asin_id'], 'scope': 'reviewsAjax{}'.format(int(i) - 1) } try: # time.sleep(0.1) response = requests.post(url=url, headers=headers, data=form_data, proxies=proxies, allow_redirects=False, timeout=30) except: try: # time.sleep(0.1) response = requests.post(url=url, headers=headers, data=form_data, proxies=proxies, allow_redirects=False, timeout=30) except: # time.sleep(0.1) response = requests.post(url=url, headers=headers, data=form_data, proxies=proxies, allow_redirects=False, timeout=30) comment_data = response.text.split('&&&')[5:-5] # print(comment_data) comment_dict = dict() for item in comment_data: # print(goods_dict['url']) data = self.re_html(item.replace(' ', '')) # print(data) # 帖子id topic_id = re.search(r'<divid=\\".*?\\', item.replace(' ', '')).group().replace( '<divid=\\"', '').replace( '\\', '') # 评价用户名 user_name = self.re_user_name( re.sub(r'\d.\d颗星,最多5颗星', '-', data).split('-')[0].replace('\\n', '')) # 评论日期 date_data_test = re.search( r'\d{1,4}年\d{1,4}月\d{1,4}日', data).group().replace( '年', '-').replace('月', '-').replace('日', '') a = date_data_test.split('-')[1] b = date_data_test.split('-')[2] if int(len(a)) == 1 and int(len(b)) != 1: date_data = date_data_test.split( '-')[0] + '-0' + date_data_test.split( '-')[1] + '-' + date_data_test.split('-')[2] elif int(len(a)) != 1 and int(len(b)) == 1: date_data = date_data_test.split( '-')[0] + '-' + date_data_test.split( '-')[1] + '-0' + date_data_test.split('-')[2] elif int(len(a)) == 1 and int(len(b)) == 1: date_data = date_data_test.split( '-')[0] + '-0' + date_data_test.split( '-')[1] + '-0' + date_data_test.split('-')[2] else: date_data = date_data_test # 评价时间 time_data = '' # 作者id author_id = '' # print(achieve_content_data) # print(data) test_type_sisz = re.search( '\d{1,4}年\d{1,2}月\d{1,2}日.*?有帮助', data) # print(test_type_sisz.group()) # 鞋子类型 try: type_data = test_type_sisz.group().split(':')[2].split( '已确认购买')[0].replace('颜色', '') except: type_data = '' # 鞋子尺码 try: size = data.split(':')[1].replace('颜色', '') except: size = '' # print(type) # print(size) # 判断评论时间是否在规定的抓取时间内 if self.start_time <= date_data.strip(): comment_dict['platform'] = goods_dict['platform'] comment_dict['date'] = date_data.strip() comment_dict['time'] = time_data.strip() comment_dict['keyword'] = goods_dict['keyword'] comment_dict['name'] = goods_dict['name'] comment_dict['imageurl'] = goods_dict['商品图片'] comment_dict['audiourl'] = '' comment_dict['url'] = goods_dict['url'] comment_dict['shop_name'] = '' comment_dict['user_name'] = self.re_user_name( user_name) try: comment_dict['content'] = data.split( '已确认购买')[1].split('有帮助')[0].split('\\n')[0] except: comment_dict['content'] = '' comment_dict['content_id'] = str(topic_id) comment_dict['brand'] = goods_dict['brand'] comment_dict['price'] = goods_dict['price'] comment_dict['sales'] = goods_dict['sales'] comment_dict['focus_count'] = '' comment_dict['comment_num'] = goods_dict['achieve_num'] comment_dict['views'] = '' comment_dict['likes'] = '' comment_dict['comments_count'] = '' 
comment_dict['reposts_count'] = '' comment_dict['author_id'] = str(author_id) comment_dict['topic_id'] = str( goods_dict['url'].split('/')[4]) comment_dict['type'] = type_data comment_dict['size'] = size comment_dict['file_code'] = '54' # print('***********正在写入符合时间的评论*******************') # print(comment_dict) # items = json.dumps(dict(comment_dict), ensure_ascii=False) + '\n' # with open('./json_data/54_{}_{}_amazon_nike{}.json'.format(time.strftime('%Y%m%d'), self.time_data, self.pid), 'ab') as f: # f.write(items.encode("utf-8")) item = json.dumps(dict(comment_dict), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/ecommerce/{}/54_{}_{}_amazon_nike{}.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.time_data, self.pid), item, encoding='utf-8') if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(444444444444444444444, traceback.format_exc()) def run(self, lock): for num in range(100000): lock.acquire() redis_url_num = self.redis_example.llen('anazon_day_url') if str(redis_url_num) == '0': print( '********************\nRedis消息队列中url为空.....\n进程 {} 抓取结束......\n********************' .format(str(os.getpid()))) item = self.redis_example.brpop('anazon_day_url', timeout=600)[1] lock.release() item1 = json.loads(item.decode()) # print(item1) self.parse_goods_brand(item1)
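# ---------------------------------------------------------------------------
# NOTE: parse_amazon_comment above zero-pads single-digit months and days parsed
# out of dates such as "2019年8月5日" with four separate branches. A minimal
# sketch of the same normalization done in one step; it assumes the input
# matches the 年/月/日 pattern that the original regex targets and returns ''
# otherwise. The helper name is illustrative.
# ---------------------------------------------------------------------------
import re


def normalize_cn_date(raw):
    """Turn a date like '2019年8月5日' into '2019-08-05'."""
    matched = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', raw)
    if not matched:
        return ''
    year, month, day = matched.groups()
    return '{}-{:0>2}-{:0>2}'.format(year, month, day)


# e.g. normalize_cn_date('2019年8月5日') -> '2019-08-05'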
class Spider(object): """ get 文章 """ def __init__(self, file_path, comment_path, need_time): self.headers_one = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } # # 时间判断部分 # date = datetime.now() - timedelta(days=300) # news_start_time = str(date).split(' ')[0] # yesterday = datetime.now() - timedelta(days=1) # 昨天时间 # yesterday = str(yesterday).split(' ')[0] # # # 定义开始时间 y-m-d 离现在时间远 news_start_time # self.start_time = news_start_time # # 定义结束时间 y-m-d 离现在时间近 yesterday # self.end_time = yesterday # print('爬取时间段:{}到{}'.format(self.start_time, self.end_time)) # # logging.info('爬取时间段:{}到{}'.format(self.start_time, self.end_time)) # # # 定义评论的抓取时间范围 # # self.comment_start_time = yesterday # 一天回复 # self.comment_start_time = '2019-08-01' # 一天回复 # # self.comment_start_time = '' # 不限定时间回复 # # self.comment_end_time = yesterday # # self.comment_end_time = yesterday # get_now_time = time.time() - 86400 get_now_time = time.time() - int(need_time) time_local = time.localtime(float(get_now_time)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" end_t = time.time() time_local = time.localtime(float(end_t)) # 转换成新的时间格式(2016-05-05 20:28:54) end_dt = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" # end_time = str(end_time).split(' ')[0] logger.log(31, '爬取时间段:{}到{}'.format(dt, end_dt)) # 定义开始时间 y-m-d 离现在时间远 self.start_time = dt # self.start_time = '2019-09-09 12:01' # 定义结束时间 y-m-d 离现在时间近 self.end_time = end_dt # self.end_time = '2019-09-16 12:01' # 标记爬虫工作 self.is_work = True self.file_name_time = self.get_file_name_time() self.file_path = file_path self.comment_apth = comment_path self.hdfsclient = HdfsClient(url='http://*****:*****@class="clearfix"]/li') self.is_work = True for li in li_list: title = li.xpath('.//div[@class="news-intro"]/text()')[0] # print(title) views = li.xpath('.//span[@class="tip-view"]/text()')[0] comments_count = li.xpath( './/span[@class="tip-comment"]/text()')[0] date_all = li.xpath('.//span[@class="tip-date"]/text()')[0] date_all = self.time_change(date_all) # 做时间判断部分--------------- logger.log(31, '时间' + date_all) if len(date_all) == 10: date_all += ' 12:01:01' if len(date_all) == 7: date_all += '-01 12:01:01' get_news_time = time.mktime( time.strptime(date_all[:-3], "%Y-%m-%d %H:%M")) end_time = time.mktime( time.strptime(self.end_time, "%Y-%m-%d %H:%M")) if self.start_time != '': start_time = time.mktime( time.strptime(self.start_time, "%Y-%m-%d %H:%M")) else: start_time = time.mktime( time.strptime('2010-1-1', "%Y-%m-%d %H:%M")) if float(get_news_time) < float(start_time): self.is_work = False # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取, elif float(start_time) <= float(get_news_time) <= float(end_time): # print(views, comments_count, date_all) news_url = 'http://www.dunkhome.com' + li.xpath( './/a[@title]/@href')[0].strip() self.get_news_info(news_url, views, comments_count, title) if self.is_work: # 判断是否要继续进行翻页 # 列表页翻页 next_page = data.xpath('.//a[@rel="next"]/@href')[0] # print(next_page) next_page_url = 'http://www.dunkhome.com' + next_page self.get_news_page_list_two(next_page_url) else: logger.log(31, '版块抓取到指定时间......') def get_news_page_list_two(self, url): """ 社区版块 列表页 :param url: :return: """ response = requests.get(url, headers=self.headers_one) data = etree.HTML(response.content.decode()) li_list = data.xpath( './/div[@class="items"]/div[@class="evaluation-item s-object-item s-eva-item"]' ) self.is_work = True for li in 
li_list: title = li.xpath('.//h6/text()')[0] # print(title) views = '' comments_count = li.xpath('.//a[@class="item-comment"]/text()')[0] date_all = li.xpath('.//span[@class="item-time" ]/text()')[0] date_all = self.time_change(date_all) # 做时间判断部分--------------- # print(date_all) if len(date_all) == 16: date_all += ':12' get_news_time = time.mktime( time.strptime(date_all[:-3], "%Y-%m-%d %H:%M")) end_time = time.mktime( time.strptime(self.end_time, "%Y-%m-%d %H:%M")) if self.start_time != '': start_time = time.mktime( time.strptime(self.start_time, "%Y-%m-%d %H:%M")) else: start_time = time.mktime( time.strptime('2010-1-1', "%Y-%m-%d %H:%M")) if float(get_news_time) < float(start_time): self.is_work = False # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取, elif float(start_time) <= float(get_news_time) <= float(end_time): # print(views, comments_count, date_all) news_url = 'http://www.dunkhome.com' + li.xpath( './/div[@class="item-content"]/a/@href')[0].strip() try: self.get_news_info(news_url, views, comments_count, title) except: pass if self.is_work: # 判断是否要继续进行翻页 # 列表页翻页 next_page = data.xpath('.//a[@rel="next"]/@href')[0] # print(next_page) next_page_url = 'http://www.dunkhome.com' + next_page self.get_news_list_page(next_page_url) else: logger.log(31, '版块抓取到指定时间......') def get_news_info(self, url, views, comments_count, title): logger.log(31, url) response = requests.get(url, headers=self.headers_one) data = etree.HTML(response.content.decode()) content_list = data.xpath( './/div[@class="content s-news-content"]//p/text() | .//div[@class="show-content"]//p/text()' ) # print(content_list) item = {} item['platform'] = 'get' try: date_all = data.xpath('.//div[@class="fl"]/span/text()')[0] item['date'] = date_all.split(' ')[0] except: date_all = data.xpath('.//span[@class="i-time"]/text()')[0] item['date'] = date_all.split(' ')[0] item['time'] = date_all.split(' ')[1] item['title'] = title item['content'] = ''.join(content_list) item['content_id'] = url.split('/')[-1] try: item['article_author'] = data.xpath( './/span[@class="s-name"]/text()')[0] except: try: item['article_author'] = data.xpath( './/span[@class="i-nickname"]/text()')[0] except: item['article_author'] = '' item['clicks'] = '' item['views'] = views item['comments_count'] = comments_count try: item['likes'] = data.xpath('.//span[@class="item-like"]/text()')[0] except: item['likes'] = '' item['dislikes'] = '' item['keyword'] = '' item['article_url'] = url item['series_url'] = '' item['list_url'] = '' item['article_type'] = '' item['article_source'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] topic_id = url.split('/')[-1] item['topic_id'] = url.split('/')[-1] item['content_id'] = url.split('/')[-1] item['reposts_count'] = '' item['file_code'] = '154' try: item['author_id'] = data.xpath( './/div[@class="t-user-avator"]/a/@href')[0].split('/')[-1] except: try: item['author_id'] = data.xpath( './/div[@class="avator"]/img/@src')[0].split( '/')[-1].split('.')[0].split('_')[-1] except: item['author_id'] = '' # print(item) self.write_news_jsonfile(item) if int(comments_count) > 0: all_page = math.ceil(float(int(comments_count)) / 10) for i in range(1, int(all_page) + 1): comment_url = url + '?page=' + str(i) self.get_comment(comment_url, url, title, topic_id) def get_comment(self, url, news_url, title, topic_id): # print(111111111111111111111111) response = requests.get(url, headers=self.headers_one) data = etree.HTML(response.content.decode()) li_list = 
data.xpath('.//div[@class="comment-list"]/ul/li') for li in li_list: content_id = li.xpath('.//parent::li/@data-id')[0] # print(etree.tostring(li)) content = li.xpath('.//div[@class="c-message"]//p/text()')[0] item = {} item['platform'] = 'get' item['source_date'] = '' item['source_time'] = '' date_all = li.xpath( './/div[@class="c-nickname"]/text()')[0].strip() date_all = self.time_change(date_all) # 评论部分做时间判断部分--------------- get_news_time = time.mktime( time.strptime(date_all[:-3], "%Y-%m-%d %H:%M")) end_time = time.mktime( time.strptime(self.end_time, "%Y-%m-%d %H:%M")) if self.start_time != '': start_time = time.mktime( time.strptime(self.start_time, "%Y-%m-%d %H:%M")) else: start_time = time.mktime( time.strptime('2010-1-1', "%Y-%m-%d %H:%M")) if float(get_news_time) < float(start_time): self.is_get_comment = False # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取, break elif float(start_time) <= float(get_news_time) <= float(end_time): item['date'] = date_all.split(' ')[0] item['time'] = '' item['title'] = title item['author'] = li.xpath( './/div[@class="c-nickname"]/span/text()')[0].strip() item['author_id'] = li.xpath( './/div[@data-user-id]/@data-user-id')[0] item['content'] = content item['content_id'] = content_id item['floor'] = '' item['keyword'] = '' item['source_url'] = news_url item['comment_url'] = '' item['views'] = '' item['comments_count'] = '' try: item['likes'] = li.xpath( './/a[@class="item-like"]/text()')[0] except: item['likes'] = '' item['dislikes'] = '' item['insert_time'] = str(datetime.now()).split('.')[0] item['update_time'] = str(datetime.now()).split('.')[0] item['topic_id'] = topic_id item['reposts_count'] = '' item['file_code'] = '155' # print(item) self.write_comment_jsonfile(item) def time_change(self, str_time): """ 时间可是转换, 将‘分钟前’,‘小时前’,‘昨天’,‘前天’,转换成标准时间格式Y-m-d h:m:s :param str_time: :return: """ # print(str_time, 55555555555) if '秒' in str_time or '刚刚' in str_time: get_time = str(datetime.now()).split('.')[0] return get_time elif '分钟' in str_time: get_time_num = re.search('\d{1,2}', str_time).group(0) get_time_num = int(get_time_num) * 60 # print(get_time_num) int_time = int(str(time.time()).split('.')[0]) - get_time_num # #转换成localtime time_local = time.localtime(float(int_time)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local) # "%Y-%m-%d %H:%M:%S" return dt elif '小时' in str_time: get_time_num = re.search('\d{1,2}', str_time).group(0) get_time_num = int(get_time_num) * 60 * 60 # print(get_time_num) int_time = int(str(time.time()).split('.')[0]) - get_time_num # #转换成localtime time_local = time.localtime(float(int_time)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local) # "%Y-%m-%d %H:%M:%S" return dt elif '今天' in str_time: part_time = str_time.split(' ')[1] yesterday = datetime.now() - timedelta(days=0) # 今天时间 dt = str(yesterday).split(' ')[0] + ' ' + part_time return dt elif '昨天' in str_time: part_time = str_time.split(' ')[1] yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] + ' ' + part_time return yesterday elif '前天' in str_time: part_time = str_time.split(' ')[1] two_days_ago = datetime.now() - timedelta(days=2) # 前天时间 two_days_ago = str(two_days_ago).split( ' ')[0] + ' ' + part_time.replace('点', ':').replace('分', '') return two_days_ago elif '天前' in str_time: get_time_num = re.search('\d{1,2}', str_time).group(0) get_time_num = int(get_time_num) * 60 * 60 * 24 # print(get_time_num) int_time = int(str(time.time()).split('.')[0]) - get_time_num 
            # convert to localtime
            time_local = time.localtime(float(int_time))
            # convert to the formatted time string (e.g. 2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)  # "%Y-%m-%d %H:%M:%S"
            return dt
        elif '201' not in str_time:
            str_time = '2019-' + str_time
            return str_time
        else:
            return str_time

    # write the item to the daily JSON file on HDFS
    def write_news_jsonfile(self, item):
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../get/json_file/{}/{}_get_news.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write(
            '{}/{}/{}/154_{}_{}_get_news.json'.format(
                self.file_path,
                self.file_name_time.split(' ')[0].replace('-', ''),
                self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item,
            encoding='utf-8')

    def write_comment_jsonfile(self, item):
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../get/json_file/{}/{}_get_news_comments.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write(
            '{}/{}/{}/155_{}_{}_get_news_comments.json'.format(
                self.comment_apth,
                self.file_name_time.split(' ')[0].replace('-', ''),
                self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item,
            encoding='utf-8')

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # yesterday's date
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    def run(self):
        url = 'http://www.dunkhome.com/news'
        self.get_news_list_page(url)
        url = 'http://www.dunkhome.com/evaluations'
        self.get_news_page_list_two(url)
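# ---------------------------------------------------------------------------
# NOTE: get_file_name_time above buckets the current time into the 3-hour
# window name used in the HDFS file path, filing the midnight window under
# yesterday's date as hour 24. A minimal standalone sketch of that bucketing,
# assuming the same edge-case behaviour; the function name is illustrative.
# ---------------------------------------------------------------------------
from datetime import datetime, timedelta


def three_hour_bucket(now=None):
    """Return 'YYYY-MM-DD H', where H is the start hour of the current 3-hour window."""
    now = now or datetime.now()
    start_hour = (now.hour // 3) * 3
    if start_hour == 0:
        # the 00:00-02:59 window is written under yesterday's date as hour 24
        return (now - timedelta(days=1)).strftime('%Y-%m-%d') + ' 24'
    return now.strftime('%Y-%m-%d') + ' ' + str(start_hour)


# e.g. three_hour_bucket(datetime(2019, 8, 5, 14, 30)) -> '2019-08-05 12'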
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self): # 时间判断部分 date = datetime.datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] now_date = datetime.datetime.now() - timedelta(days=0) # 当前时间 now_time = str(now_date).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, now_time)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = now_time self.is_break = False self.pid = os.getpid() # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/weibo/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 # 使用try清理数据 def clean_data(self, data): try: clean_data = data except: clean_data = '' return clean_data # 时间格式转换 def changetime(self, timestr): fmt2 = '%a %b %d %H:%M:%S %z %Y' timestrp = time.strptime(timestr, fmt2) # temp_time = time.strftime("%Y-%m-%d %H:%M:%S", timestrp) # logger.info(f"last time {temp_time}, continue request") timestampstr = time.mktime(timestrp) timeArray = time.localtime(int(timestampstr)) otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray) # print(otherStyleTime) # 2013--10--10 23:40:00 return otherStyleTime # 请求获取用户age def parse_age(self, uid): try: weibo_dict = dict() # 请求接口需要的携带参数 data = { 'access_token': code, # 访问许可 'uid': '{}'.format(uid), # 搜索关键词 } url = 'https://c.api.weibo.com/2/users/birthday/other.json' try: time.sleep(0.1) response = requests.get(url, data, timeout=30) except: try: time.sleep(0.1) response = requests.get(url, data, timeout=30) except: time.sleep(0.1) response = requests.get(url, data, timeout=30) # print('000000000000000000', response.text) if 'birthday_visible' in response.text: # print('有用户年龄') age_data = json.loads(response.text)['birthday'] birthday_visible = json.loads( response.text)['birthday_visible'] if int(birthday_visible) == 3: if age_data == '': weibo_dict['age'] = '' # print('111111111111111111', weibo_dict) self.parse_weibo(weibo_dict, uid) elif int(age_data) >= 1900: weibo_dict['age'] = age_data # print('111111111111111111', weibo_dict) self.parse_weibo(weibo_dict, uid) else: weibo_dict['age'] = '' # print('111111111111111111', weibo_dict) self.parse_weibo(weibo_dict, uid) else: weibo_dict['age'] = '' # print('111111111111111111', weibo_dict) self.parse_weibo(weibo_dict, uid) except: self.parse_age(uid) try: self.parse_age(uid) except: print(00000000000000, traceback.format_exc()) # 根据关键词搜索请求得到微博信息 def parse_weibo(self, weibo_dict, uid): try: is_break = self.is_break date = time.strftime("%Y%m%d") st = int(time.mktime(time.strptime( date, '%Y%m%d'))) # 自定义起始时间 '2019-10-21 00:00:00' et = st - 86400 # 自定义终止时间 '2018-11-26 00:00:00' url = 'https://c.api.weibo.com/2/statuses/user_timeline/other.json' # 接口链接 # 请求接口需要的携带参数 data = { 'access_token': code, # 访问许可 'uid': '{}'.format(uid), 'endtime': '{}'.format(st), # 首次采集终止点为当前日期的零点,'2019-10-23 00:00:00' 'count': 20 } # 单页返回的记录条数,最大不超过100,超过100以100处理,默认为20。数据是从当前零点往前一天零点时间递减出现 try: time.sleep(0.1) response = requests.get(url, data, timeout=30) except: try: time.sleep(0.1) response = requests.get(url, data, timeout=30) except: time.sleep(0.1) response = requests.get(url, data, timeout=30) # print(weibo_dict) # print(response.text) if 'statuses' in response.text: data_list = json.loads(response.text, strict=False)['statuses'] # print(len(data_list)) for item in data_list: date_time_data = item['created_at'] # print(self.changetime(date_time_data)) try: date_data = 
self.changetime(date_time_data).split( ' ')[0] except: date_data = '' try: time_data = self.changetime(date_time_data).split( ' ')[1] except: time_data = '' # print(date_data, time_data) weibo_dict['platform'] = '微博' weibo_dict['keyword'] = str(uid) weibo_dict['date'] = date_data.strip() weibo_dict['time'] = time_data.strip() weibo_dict['weibo_id'] = str(item['id']) weibo_dict['mid'] = str(item['mid']) weibo_dict['idstr'] = str(item['idstr']) try: weibo_dict['content'] = item['longText'][ 'longTextContent'].replace('\u200b', ' ').replace( '\u200e', ' ').replace('\u200c', ' ').replace('\n', ' ') except: weibo_dict['content'] = item['text'].replace( '\u200b', ' ').replace('\u200e', ' ').replace('\u200c', ' ').replace('\n', ' ') weibo_dict['source'] = item['source'] weibo_dict['favorited'] = item['favorited'] weibo_dict['truncated'] = item['truncated'] try: location_data = item['user']['location'] except: location_data = '' try: weibo_dict['province_name'] = location_data.split( ' ')[0] weibo_dict['address'] = location_data.split(' ')[1] except: weibo_dict['province_name'] = location_data weibo_dict['address'] = '' # print(weibo_dict['province_name'], weibo_dict['address']) try: weibo_dict['pinyin'] = item['pinyin'] except: weibo_dict['pinyin'] = '' weibo_dict['uid'] = str(item['user']['id']) try: weibo_dict['screen_name'] = item['user']['screen_name'] except: weibo_dict['screen_name'] = '' try: weibo_dict['name'] = item['user']['name'] except: weibo_dict['name'] = '' try: weibo_dict['province'] = item['user']['province'] except: weibo_dict['province'] = '' try: weibo_dict['city'] = item['user']['city'] except: weibo_dict['city'] = '' try: weibo_dict['location'] = item['user']['location'] except: weibo_dict['location'] = '' try: weibo_dict['gender'] = item['user']['gender'] except: weibo_dict['gender'] = '' try: weibo_dict['allow_all_act_msg'] = item['user'][ 'allow_all_act_msg'] except: weibo_dict['allow_all_act_msg'] = '' try: weibo_dict['geo_enabled'] = item['user']['geo_enabled'] except: weibo_dict['geo_enabled'] = '' try: weibo_dict['verified'] = item['user']['verified'] except: weibo_dict['verified'] = '' try: weibo_dict['verified_reason'] = item['user'][ 'verified_reason'] except: weibo_dict['verified_reason'] = '' weibo_dict['likes'] = item['attitudes_count'] try: weibo_dict['views'] = item['views'] except: weibo_dict['views'] = '' try: weibo_dict['retweeted_status'] = str( item['retweeted_status']) except: weibo_dict['retweeted_status'] = '' weibo_dict['reposts_count'] = item['reposts_count'] weibo_dict['comments_count'] = item['comments_count'] weibo_dict['attitudes_count'] = item['attitudes_count'] weibo_dict['visible'] = str(item['visible']) weibo_dict['pic_ids'] = str(item['pic_ids']) try: weibo_dict['ad'] = item['ad'] except: weibo_dict['ad'] = '' weibo_dict['isLongText'] = item['isLongText'] weibo_dict['url'] = 'http://m.weibo.cn/' + str( item['user']['id']) + '/' + str(item['idstr']) try: weibo_dict['followers_count'] = item['user'][ 'followers_count'] except: weibo_dict['followers_count'] = '' try: weibo_dict['favourites_count'] = item['user'][ 'favourites_count'] except: weibo_dict['favourites_count'] = '' try: weibo_dict['friends_count'] = item['user'][ 'friends_count'] except: weibo_dict['friends_count'] = '' try: weibo_dict['statuses_count'] = item['user'][ 'statuses_count'] except: weibo_dict['statuses_count'] = '' try: weibo_dict['bi_followers_count'] = item['user'][ 'bi_followers_count'] except: weibo_dict['bi_followers_count'] = '' try: weibo_dict['avatar_large'] = 
item['user'][ 'avatar_large'] except: weibo_dict['avatar_large'] = '' try: weibo_dict['avatar_hd'] = item['user']['avatar_hd'] except: weibo_dict['avatar_hd'] = '' try: weibo_dict['retweeted_time'] = item[ 'retweeted_status']['created_at'] except: weibo_dict['retweeted_time'] = '' try: weibo_dict['retweeted_post_id'] = item[ 'retweeted_status']['id'] except: weibo_dict['retweeted_post_id'] = '' try: weibo_dict['retweeted_author'] = item[ 'retweeted_status']['in_reply_to_screen_name'] except: weibo_dict['retweeted_author'] = '' try: weibo_dict['retweeted_author_id'] = item[ 'retweeted_status']['in_reply_to_status_id'] except: weibo_dict['retweeted_author_id'] = '' try: weibo_dict['profile_url'] = item['user']['profile_url'] except: weibo_dict['profile_url'] = '' try: weibo_dict['domain'] = item['user']['domain'] except: weibo_dict['domain'] = '' try: weibo_dict['user_url'] = item['user']['domain'] except: weibo_dict['user_url'] = '' weibo_dict['author_url'] = 'http://m.weibo.cn/' + str( item['user']['id']) weibo_dict['tags'] = self.parse_tags(weibo_dict) # 图片列表判断 img_list = item['pic_ids'] if len(img_list) == 0: weibo_dict['imageurl'] = '' weibo_dict['audiourl'] = '' else: weibo_img = [] original_pic = item['original_pic'].split( 'large/')[0] + 'large/' for img in img_list: img_data = original_pic + img + '.jpg' weibo_img.append(img_data) weibo_dict['imageurl'] = weibo_img weibo_dict['audiourl'] = '' # print(weibo_dict['imageurl']) self.write_goods_jsonfile(weibo_dict) index_num = data_list.index(item) if index_num == len(data_list) - 1: # print(index_num) last_time = self.changetime( data_list[int(index_num)]['created_at']) last_date = self.changetime(data_list[int( index_num)]['created_at']).split(' ')[0] # print(last_time) # print(last_date) if self.start_time <= last_date: # 将其转换为时间数组 timeArray = time.strptime(last_time, "%Y-%m-%d %H:%M:%S") # 转换为时间戳: timeStamp = int(time.mktime(timeArray)) # print('最后一个时间%s转换成时间戳是: ' % last_time, timeStamp) self.parse_weibo_data(weibo_dict, uid, timeStamp) # pass if self.start_time > last_date: is_break = True if is_break: break except: print(111111111111111111111111, traceback.format_exc()) # 根据关键词搜索请求得到微博信息 def parse_weibo_data(self, weibo_dict, uid, timeStamp): try: is_break = self.is_break url = 'https://c.api.weibo.com/2/search/statuses/limited.json' # 接口链接 # 请求接口需要的携带参数 data = { 'access_token': code, # 访问许可 'uid': '{}'.format(uid), 'endtime': '{}'.format( timeStamp), # 首次采集终止点为当前日期的零点,'2019-10-23 00:00:00' 'count': 20 } # 单页返回的记录条数,最大不超过100,超过100以100处理,默认为20。数据是从当前零点往前一天零点时间递减出现 try: time.sleep(0.1) response = requests.get(url, data, timeout=30) except: try: time.sleep(0.1) response = requests.get(url, data, timeout=30) except: time.sleep(0.1) response = requests.get(url, data, timeout=30) # print(response.text) if 'statuses' in response.text: data_list = json.loads(response.text, strict=False)['statuses'] # print(len(data_list)) for item in data_list: date_time_data = item['created_at'] # print(self.changetime(date_time_data)) try: date_data = self.changetime(date_time_data).split( ' ')[0] except: date_data = '' try: time_data = self.changetime(date_time_data).split( ' ')[1] except: time_data = '' # print(date_data, time_data) weibo_dict['platform'] = '微博' weibo_dict['keyword'] = str(uid) weibo_dict['date'] = date_data.strip() weibo_dict['time'] = time_data.strip() weibo_dict['weibo_id'] = str(item['id']) weibo_dict['mid'] = str(item['mid']) weibo_dict['idstr'] = str(item['idstr']) try: weibo_dict['content'] = item['longText'][ 
'longTextContent'].replace('\u200b', ' ').replace( '\u200e', ' ').replace('\u200c', ' ').replace('\n', ' ') except: weibo_dict['content'] = item['text'].replace( '\u200b', ' ').replace('\u200e', ' ').replace('\u200c', ' ').replace('\n', ' ') weibo_dict['source'] = item['source'] weibo_dict['favorited'] = item['favorited'] weibo_dict['truncated'] = item['truncated'] try: location_data = item['user']['location'] except: location_data = '' try: weibo_dict['province_name'] = location_data.split( ' ')[0] weibo_dict['address'] = location_data.split(' ')[1] except: weibo_dict['province_name'] = location_data weibo_dict['address'] = '' # print(weibo_dict['province_name'], weibo_dict['address']) try: weibo_dict['pinyin'] = item['pinyin'] except: weibo_dict['pinyin'] = '' weibo_dict['uid'] = str(item['user']['id']) try: weibo_dict['screen_name'] = item['user']['screen_name'] except: weibo_dict['screen_name'] = '' try: weibo_dict['name'] = item['user']['name'] except: weibo_dict['name'] = '' try: weibo_dict['province'] = item['user']['province'] except: weibo_dict['province'] = '' try: weibo_dict['city'] = item['user']['city'] except: weibo_dict['city'] = '' try: weibo_dict['location'] = item['user']['location'] except: weibo_dict['location'] = '' try: weibo_dict['gender'] = item['user']['gender'] except: weibo_dict['gender'] = '' try: weibo_dict['allow_all_act_msg'] = item['user'][ 'allow_all_act_msg'] except: weibo_dict['allow_all_act_msg'] = '' try: weibo_dict['geo_enabled'] = item['user']['geo_enabled'] except: weibo_dict['geo_enabled'] = '' try: weibo_dict['verified'] = item['user']['verified'] except: weibo_dict['verified'] = '' try: weibo_dict['verified_reason'] = item['user'][ 'verified_reason'] except: weibo_dict['verified_reason'] = '' weibo_dict['likes'] = item['attitudes_count'] try: weibo_dict['views'] = item['views'] except: weibo_dict['views'] = '' try: weibo_dict['retweeted_status'] = str( item['retweeted_status']) except: weibo_dict['retweeted_status'] = '' weibo_dict['reposts_count'] = item['reposts_count'] weibo_dict['comments_count'] = item['comments_count'] weibo_dict['attitudes_count'] = item['attitudes_count'] weibo_dict['visible'] = str(item['visible']) weibo_dict['pic_ids'] = str(item['pic_ids']) try: weibo_dict['ad'] = item['ad'] except: weibo_dict['ad'] = '' weibo_dict['isLongText'] = item['isLongText'] weibo_dict['url'] = 'http://m.weibo.cn/' + str( item['user']['id']) + '/' + str(item['idstr']) try: weibo_dict['followers_count'] = item['user'][ 'followers_count'] except: weibo_dict['followers_count'] = '' try: weibo_dict['favourites_count'] = item['user'][ 'favourites_count'] except: weibo_dict['favourites_count'] = '' try: weibo_dict['friends_count'] = item['user'][ 'friends_count'] except: weibo_dict['friends_count'] = '' try: weibo_dict['statuses_count'] = item['user'][ 'statuses_count'] except: weibo_dict['statuses_count'] = '' try: weibo_dict['bi_followers_count'] = item['user'][ 'bi_followers_count'] except: weibo_dict['bi_followers_count'] = '' try: weibo_dict['avatar_large'] = item['user'][ 'avatar_large'] except: weibo_dict['avatar_large'] = '' try: weibo_dict['avatar_hd'] = item['user']['avatar_hd'] except: weibo_dict['avatar_hd'] = '' try: weibo_dict['retweeted_time'] = item[ 'retweeted_status']['created_at'] except: weibo_dict['retweeted_time'] = '' try: weibo_dict['retweeted_post_id'] = item[ 'retweeted_status']['id'] except: weibo_dict['retweeted_post_id'] = '' try: weibo_dict['retweeted_author'] = item[ 'retweeted_status']['in_reply_to_screen_name'] except: 
weibo_dict['retweeted_author'] = '' try: weibo_dict['retweeted_author_id'] = item[ 'retweeted_status']['in_reply_to_status_id'] except: weibo_dict['retweeted_author_id'] = '' try: weibo_dict['profile_url'] = item['user']['profile_url'] except: weibo_dict['profile_url'] = '' try: weibo_dict['domain'] = item['user']['domain'] except: weibo_dict['domain'] = '' try: weibo_dict['user_url'] = item['user']['domain'] except: weibo_dict['user_url'] = '' weibo_dict['author_url'] = 'http://m.weibo.cn/' + str( item['user']['id']) weibo_dict['tags'] = self.parse_tags(weibo_dict) # 图片列表判断 img_list = item['pic_ids'] if len(img_list) == 0: weibo_dict['imageurl'] = '' weibo_dict['audiourl'] = '' else: weibo_img = [] original_pic = item['original_pic'].split( 'large/')[0] + 'large/' for img in img_list: img_data = original_pic + img + '.jpg' weibo_img.append(img_data) weibo_dict['imageurl'] = weibo_img weibo_dict['audiourl'] = '' # print(weibo_dict['imageurl']) self.write_goods_jsonfile(weibo_dict) index_num = data_list.index(item) if index_num == len(data_list) - 1: # print(index_num) last_time = self.changetime( data_list[int(index_num)]['created_at']) last_date = self.changetime(data_list[int( index_num)]['created_at']).split(' ')[0] # print(last_time) # print(last_date) if self.start_time <= last_date: # a = "2019-10-27 23:37:07" # 将其转换为时间数组 timeArray = time.strptime(last_time, "%Y-%m-%d %H:%M:%S") # 转换为时间戳: timeStamp1 = int(time.mktime(timeArray)) # print('最后一个时间%s转换成时间戳是: ' % last_time, timeStamp) self.parse_weibo_data(weibo_dict, uid, timeStamp1) if self.start_time > last_date: is_break = True if is_break: break except: print(22222222222222222222, traceback.format_exc()) # 请求获取tags def parse_tags(self, weibo_dict): try: # 请求接口需要的携带参数 data = { 'access_token': code, # 访问许可 'uids': '{}'.format(weibo_dict['uid']), # 搜索关键词 } url = 'https://c.api.weibo.com/2/tags/tags_batch/other.json' # 接口链接 try: time.sleep(0.1) response = requests.get(url, data, timeout=30) except: try: time.sleep(0.1) response = requests.get(url, data, timeout=30) except: time.sleep(0.1) response = requests.get(url, data, timeout=30) # print(response.text) if 'tags' in response.text: tags = re.search(r'"tags":\[{.*?"}\]', response.text).group().replace('"tags":', '') return tags else: return '' except: print(555555555555555555555555, traceback.format_exc()) # 写入json文件 def write_goods_jsonfile(self, item): # print(item) item_data = json.dumps(dict(item), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/weibo/{}/104_{}_weibo_nike_uid.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d')), item_data, encoding='utf-8') # item = json.dumps(dict(item), ensure_ascii=False) + '\n' # with open('./104_new_weibo_uid_{}.json'.format(time.strftime('%Y%m%d')), 'ab') as f: # f.write(item.encode("utf-8")) def run(self, keyword): print(keyword) self.parse_age(keyword)
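# ---------------------------------------------------------------------------
# NOTE: parse_weibo / parse_weibo_data above page backwards through a user's
# timeline by passing the created_at of the last status of each response as the
# next request's `endtime`, and stop once a post falls before self.start_time.
# A minimal sketch of that cursor loop; `fetch_page(uid, endtime)` is an assumed
# callable standing in for the user_timeline API request and should return the
# decoded 'statuses' list.
# ---------------------------------------------------------------------------
import time


def created_at_to_ts(created_at):
    """Convert 'Mon Oct 21 12:00:00 +0800 2019' to a Unix timestamp, as changetime does."""
    return int(time.mktime(time.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')))


def page_backwards(uid, start_date, first_endtime, fetch_page):
    """Yield statuses newest-first until one is older than start_date ('YYYY-MM-DD')."""
    endtime = first_endtime
    while True:
        statuses = fetch_page(uid, endtime)
        if not statuses:
            return
        for status in statuses:
            yield status
        last_ts = created_at_to_ts(statuses[-1]['created_at'])
        if time.strftime('%Y-%m-%d', time.localtime(last_ts)) < start_date:
            return
        endtime = last_ts  # next page ends where this one left off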
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, file_path, comment_path): self.headers_one = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } # 时间判断部分 date = datetime.now() - timedelta(days=7) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=0) # 昨天时间 yesterday = str(yesterday).split(' ')[0] # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = yesterday logging.log(31, '爬取时间段:{}到{}'.format(self.start_time, self.end_time)) # 定义评论的抓取时间范围 # self.comment_start_time = yesterday # 一天回复 # self.comment_start_time = '2019-08-01' # 一天回复 self.comment_start_time = '' # 不限定时间回复 self.comment_end_time = yesterday # self.comment_end_time = yesterday # 标记爬虫工作 self.is_work = True self.commnet_port_url = 'http://comment.sina.com.cn/page/info?version=1&format=json&channel=ty&newsid=comos-{}&group=0&compress=0&ie=utf-8&oe=utf-8&page={}&page_size=10&t_size=3&h_size=3&thread=1&callback=jsonp_1542676393124&_=1542676393124' self.page_num = 1 self.file_path = file_path self.comment_apth = comment_path self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000', user='******') self.hdfsclient.makedirs('{}/{}'.format( self.file_path, str(datetime.now()).split(' ')[0].replace('-', ''))) # 创建每日文件夹 self.hdfsclient.makedirs('{}/{}'.format( self.comment_apth, str(datetime.now()).split(' ')[0].replace('-', ''))) # 创建每日文件夹 self.time_time = str(time.time()).split('.')[0] self.article_queue = Queue() self.comment_queue = Queue() self.total_item = '' def get_list_page(self, url): logger.log(31, '列表页url: ' + url) response = requests.get(url, headers=self.headers_one) data = json.loads(response.text[46:-14]) list_data = data['result']['data'] for li_data in list_data: news_url = li_data['url'] ctime = li_data['ctime'] time_local = time.localtime(float(ctime)) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local) # "%Y-%m-%d %H:%M:%S" # try: self.get_news_info( news_url, '', 'http://sports.sina.com.cn/roll/index.shtml#pageid=13&lid=2503&k=&num=50&page=1', dt) except: logger.error(traceback.format_exc()) def get_news_info(self, url, news_type, page_list, date_all): logger.log(31, '新闻url: ' + url) item = dict() response = requests.get(url, headers=self.headers_one, timeout=60) try: data = etree.HTML(response.content.decode()) # 网站 item['platform'] = '新浪新闻' # 标题 try: title = data.xpath('.//h1[@class="main-title"]/text()')[0] except: title = data.xpath('.//h2/text()')[0] item['title'] = title # date_all = data.xpath('.//div[@class="date-source"]/span/text()')[0].replace('年', '-').replace('月', '-').replace('日', '') date = date_all.split(' ')[0] news_time = date_all.split(' ')[1] # print(date) item['date'] = date item['time'] = news_time # 文章来源 try: article_source = data.xpath( './/div[@class="date-source"]/a/text()')[0] except: article_source = data.xpath( './/p[@class="from"]/span[2]//text()') article_source = ''.join(article_source) item['article_source'] = article_source # article_author try: article_author = data.xpath( './/div[@class="show_author"]/text()') except: article_author = '' if article_author: item['article_author'] = article_author[0] else: item['article_author'] = '' # 内容 try: content = data.xpath( './/div[@id="article_content"]/div[1]/div/p/text()') except: content = data.xpath('.//em[@class="vdiCont"]//text()') content = ''.join(content) # 翻页数据 next_page = data.xpath( 
'.//div[@data-sudaclick="content_pagination_p"]/a/@href') if len(next_page) > 3: next_page = next_page[1:][:-2] for page_url in next_page: print('获取翻页数据') next_content = self.get_next_page(page_url) content = content + next_content item['content'] = content # 从接口处获取评论数 news_id = re.search('(\w{7}\d{7})', url).group(0) try: comment_count = self.get_commnet_count(news_id) except AttributeError: comment_count = '0' item['comments_count'] = comment_count item['clicks'] = '' item['views'] = '' item['likes'] = '' item['keyword'] = '' item['article_url'] = url # 文章详情URL item['dislikes'] = '' # 踩人数 item['series_url'] = '' # 车系首页 item['list_url'] = page_list # 文章列表URL # item['article_type'] = news_type # 文章类型 item['article_type_1st'] = news_type # 文章类型 item['article_type_2nd'] = '' # 文章类型 item['insert_time'] = str(datetime.now()).split('.')[0] # 初始爬取时间 item['update_time'] = str(datetime.now()).split('.')[0] # 最后爬取时间 content_id = url.split('/')[-1].split('.')[0].split('_')[-1].split( '-')[-1] # content_id = re.search('\d{5,8}', content_id).group(0) item['content_id'] = str(content_id) # 文章id item['topic_id'] = str(content_id) # 主贴id item['author_id'] = '' # 作者id item['file_code'] = '17' item['reposts_count'] = '' # 做时间判断部分--------------- get_news_time = time.mktime(time.strptime(date, "%Y-%m-%d")) end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d")) if self.start_time != '': start_time = time.mktime( time.strptime(self.start_time, "%Y-%m-%d")) else: start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d")) if float(get_news_time) < float(start_time): self.is_work = False return if float(start_time) <= float(get_news_time) <= float(end_time): self.write_news_jsonfile(item) # self.article_queue.put(item) if int(comment_count) > 0: self.is_get_comment = True while True: if self.is_get_comment: self.get_comments_info(news_id, title, date, news_time, url) else: self.page_num = 1 break except IndexError: time.sleep(5) logger.error('网页请求404 url: {}, {}'.format( url, traceback.format_exc())) # 获取翻页数据 def get_next_page(self, url): response = requests.get(url, headers=self.headers_one, timeout=60) try: data = etree.HTML(response.content) # 内容 content = data.xpath( './/div[@id="article_content"]/div[1]/div/p/text()') content = ''.join(content) return content except: content = '' return content # 获取评论数 def get_commnet_count(self, news_id): response = requests.get(self.commnet_port_url.format(news_id, str(1))) data = response.content.decode() data = data[20:][:-1] # print(11111,data) data = json.loads(data) # print(222222,data) # data = re.search('"qreply": \d{0,9}', data).group(0) try: comment_count = data['result']['count']['show'] except: comment_count = 0 return comment_count # 获取评论信息 def get_comments_info(self, news_id, title, source_date, source_time, source_url, page_id="1"): url = self.commnet_port_url.format(news_id, str(self.page_num)) response = requests.get(url) data = response.content.decode() # data = re.search(r'{"result.*}\)', data).group(0) data = data[20:][:-1] data = json.loads(data) comments_list = data['result']['cmntlist'] if comments_list: for comment in comments_list: item = {} item['platform'] = u'新浪新闻' item['source_date'] = source_date item['source_time'] = source_time date_all = comment['time'] date = date_all.split(' ')[0] commnet_time = date_all.split(' ')[1] item['date'] = date item['time'] = commnet_time # 评论部分做时间判断部分--------------- get_news_time = time.mktime( time.strptime(str(date), "%Y-%m-%d")) end_time = time.mktime( time.strptime(self.comment_end_time, 
"%Y-%m-%d")) if self.comment_start_time != '': start_time = time.mktime( time.strptime(self.comment_start_time, "%Y-%m-%d")) else: start_time = time.mktime( time.strptime('2010-1-1', "%Y-%m-%d")) if float(get_news_time) < float(start_time): self.is_get_comment = False # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取, break elif float(start_time) <= float(get_news_time) <= float( end_time): item['title'] = title author = comment['nick'] item['author'] = author item['author_id'] = comment['uid'] # 用户id content = comment['content'] item['content'] = content item['floor'] = '' item['keyword'] = '' item['source_url'] = source_url comment_url = 'http://comment5.news.sina.com.cn/comment/skin/default.html?channel=ty&newsid=comos-{}&group=0'.format( news_id) item['comment_url'] = comment_url item['views'] = '' item['comments_count'] = '' likes = comment['agree'] item['likes'] = likes item['dislikes'] = '' # 踩人数 item['insert_time'] = str( datetime.now()).split('.')[0] # 初始爬取时间 item['update_time'] = str( datetime.now()).split('.')[0] # 最后爬取时间 item['content_id'] = str(uuid.uuid4()).replace('-', '') topic_id = source_url.split('/')[-1].split('.')[0].split( '_')[-1].split('-')[-1] # topic_id = re.search('\d{5,8}', topic_id).group(0) item['topic_id'] = topic_id # 主贴id item['file_code'] = '31' item['reposts_count'] = '' # self.write_comment_jsonfile(item) # print(11111111, item) item = json.dumps(dict(item), ensure_ascii=False) + '\n' self.total_item = self.total_item + item # self.comment_queue.put(item) self.write_comment_jsonfile() if self.is_get_comment: self.page_num += 1 # self.get_comments_info(news_id, title, source_date, source_time, source_url,page_id=str(self.page_num)) else: self.page_num = 1 logger.log(31, '评论抓取完毕 ' + url) self.is_get_comment = False # ------------------------------------------------新能源模块-------------------------------------------------------- def write_news_jsonfile(self, item): # q_size = self.article_queue.qsize() # total_item = '' # if q_size > 0: # for i in range(q_size): # item = self.article_queue.get() # # print('写入数据中......') print('写入新闻数据......') item = json.dumps(dict(item), ensure_ascii=False) + '\n' # total_item += item try: self.hdfsclient.new_write('{}/{}/17_{}_{}_sina_news.json'.format( self.file_path, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') except: logging.error(traceback.format_exc()) self.write_news_jsonfile(item) return # else: # pass def write_comment_jsonfile(self): # q_size = self.comment_queue.qsize() # total_item = '' # if q_size > 0: # print(q_size) # for i in range(q_size): # item = self.comment_queue.get() # print(2222222, item) # # print('写入数据中......') # item = json.dumps(dict(item), ensure_ascii=False) + '\n' # total_item = total_item + item # # # try: # # # # self.hdfsclient.new_write('{}/{}/31_{}_{}_sina_comment.json'.format(self.comment_apth, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), total_item,encoding='utf-8') # # except: # # logging.error(traceback.format_exc()) # else: # pass # print(3333333, total_item) try: self.hdfsclient.new_write( '{}/{}/31_{}_{}_sina_comment.json'.format( self.comment_apth, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), self.total_item, encoding='utf-8') except: print('写入重试中......') self.write_comment_jsonfile() return print('写入成功......') self.total_item = '' def 
run(self):
        for i in range(1, 100):
            if self.is_work:
                url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=13&lid=2503&k=&num=50&page={}&r=0.6019004029484454&callback=jQuery311016308312964736538_1566799251373&_=1566799251388'.format(
                    str(i))
                self.get_list_page(url)
                # self.write_news_jsonfile()
            else:
                logger.log(31, '爬取到指定时间......')
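# ---------------------------------------------------------------------------
# NOTE: get_list_page and get_commnet_count above strip the JSONP wrapper with
# fixed slice offsets (response.text[46:-14], data[20:][:-1]) before calling
# json.loads. A minimal, slightly more robust sketch of the same idea that cuts
# at the first '(' and the last ')' instead of hard-coding offsets; the callback
# name in the example is only illustrative.
# ---------------------------------------------------------------------------
import json


def strip_jsonp(payload):
    """Extract the JSON body from a JSONP response such as 'jsonp_123({...});'."""
    start = payload.index('(') + 1
    end = payload.rindex(')')
    return json.loads(payload[start:end])


# e.g. strip_jsonp('jsonp_1542676393124({"result": {"count": {"show": 3}}});')
#      -> {'result': {'count': {'show': 3}}}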
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, redis_example): # 时间部分,按小时抓取 date_time = str(datetime.now() - timedelta(days=1)).split('.')[0] start_time_test = time.strftime('%Y-%m-%d 00:00:00') end_time = time.strftime('%Y-%m-%d %H:%M:%S') a = end_time.split(' ')[1].split(':')[0] if a == '00': start_time_data = date_time hours_name = '22_24' wen_jian_jia_date = str(datetime.now() - timedelta( days=1)).split('.')[0].split(' ')[0].replace('-', '') else: two_hours_ago = int(a) - 2 if len(str(two_hours_ago)) == 1: two_hour_ago = '0' + str(two_hours_ago) else: two_hour_ago = str(two_hours_ago) hours_name = str(two_hour_ago) + '_' + str(a) start_time_data = start_time_test wen_jian_jia_date = time.strftime('%Y%m%d') print('爬取时间段:{}到{}'.format(start_time_data, end_time)) logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = start_time_data # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = end_time # 标记爬虫工作 self.is_work = True self.redis_example = redis_example self.pid = os.getpid() self.h2_name = hours_name self.date_time = wen_jian_jia_date # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format( wen_jian_jia_date, hours_name)) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\n', '', ret1) ret3 = re.sub(r'\u3000', '', ret2) ret4 = re.sub(r'品牌:', '', ret3) ret5 = re.sub(r'\xa0', '', ret4) ret6 = re.sub(r'→_→', '', ret5) ret7 = re.sub(r'……', '', ret6) ret8 = re.sub(r'":', '', ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 13位时间戳转换成日期 def time_change(self, data): timeStamp = float(int(data) / 1000) timeArray = time.localtime(timeStamp) otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray) return otherStyleTime # 获取品牌信息 def parse_brand(self, goods_dict): try: headers = { 'content-type': 'text/html;charset=UTF-8', 'cookie': 'kaola_user_key=b87e28b9-e7fc-43ba-8ca7-42abae97a079; _ntes_nnid=116c0ca91001bfb53c23f45f9e55ac87,1568617522153; _ga=GA1.2.290138937.1568617522; _klhtxd_=31; _ga=GA1.3.290138937.1568617522; __da_ntes_utma=2525167.417648162.1568617522.1568617522.1568617522.1; davisit=1; __da_ntes_utmz=2525167.1568617522.1.1.; __da_ntes_utmfc=utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none); usertrack=CrGZAV2DFzgLhl54AwtSAg==; KAOLA_NEW_USER_COOKIE=yes; cna=MQj5FQMZD0sCAXxONRZeF0y0; WM_TID=beYPJ03r5ilFBUFUFEZo5jCUV1mKk4PC; t=cf5d799c2331f5cabed38ae64e05e79e; KAOLA_USER_ID=109999078912652422; [email protected]; JSESSIONID-WKL-8IO=0zc3WMz%2Bz0rQe5Jcv1xai4OAOScJJgZviUPXMI3RUo2IYlneCBZYhem2pXj85vvoJ8Z%2B2yMxkJZ%2BDbqGhohayCkj0RWfrbvXgwt00Wju%2BMWVg7WjBsfPPuM6Bq0yJI1vkeq%5C17ndJLsLrHGeY1Sf0k231zopBvGmtXomvGZ5J9TWLbPq%3A1586842936344; davisit=2; __da_ntes_utmb=2525167.1.10.1586756536; _samesite_flag_=true; cookie2=1f50b0bd27965ea6d4731440eb0ab6b2; _tb_token_=57e48eee49e7; csg=7c23ee4b; NTES_OSESS=REPpP5MMDS0ti.Kjs4kXCagwqwIe5DsWd2J6spGZnnoVWWhz6L9pI2HlXPVp_85PuZGCsnYofZ0FK56aZ.uX88iBgdi0zJZsRBB8fdi_YIZfYxQlVYg4kvmcVqVCqK9kxhu.Yzv4Avj3rW.UPrCYFGfnrd5TZovCzX0lNqe3j5rAEWHpYRLXj1PsCx_75evCuvl01iv5jej2sgH2yqYAm2a0p; kaola_csg=93dad892; kaola-user-beta-traffic=12217883524; firstLogin=0; 
hb_MA-AE38-1FCC6CD7201B_source=search.kaola.com; NTES_KAOLA_RV=1537539_1586756945560_0|2884042_1586756792280_0|5522516_1586513810003_0|5705591_1585881322711_0|8317307_1585880658885_0|5553701_1585880652352_0|8517421_1585879009306_0|1467929_1571291229258_0|5218698_1569811431977_0|5536790_1569811422334_0|5457794_1569811411408_0|5115159_1569811404628_0|2843760_1569566707083_0|5481268_1569489750583_0|2723610_1569488978899_0|2546067_1569485553114_0|1758828_1569485116618_0|1616628_1569482665961_0|5111078_1569482641632_0|2482224_1569482624326_0; isg=BHV1IQtJR6edB6MO8FzlgBdJhPHvWigPBiZuAfeb4ewRzpfAv0AP1GEMGNLdjkG8', 'pragma': 'no-cache', 'referer': 'https://search.kaola.com/search.html?key=AlphaBounce&oldQuery=AIR%2520MAX&searchRefer=searchbutton&zn=top', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } url = goods_dict['url'] try: # time.sleep(0.2) response1 = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: # time.sleep(0.2) response1 = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: # time.sleep(0.2) response1 = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) html1 = etree.HTML(response1.text) # 品牌 try: goods_dict['brand'] = html1.xpath( '//dt[@class="orig-country"]/a/text()')[0].spilt(' ')[0] except: goods_dict['brand'] = '' # print(goods_dict) self.goods_comments(goods_dict) except: print(9999999999999999999999, traceback.format_exc()) # 抓取第一页商品评论 def goods_comments(self, goods_dict): try: if int(goods_dict['achieve_num']) == 0: pass # logger.log(31, '**********---------没有商品评论------************') else: goods_id = goods_dict['goods_id'] comment_url = 'https://goods.kaola.com/commentAjax/comment_list_new.json' # print(comment_url, goods_id) headers = { 'authority': 'goods.kaola.com', 'method': 'POST', 'path': '/commentAjax/comment_list_new.json', 'scheme': 'https', 'accept': '*/*', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'cache-control': 'no-cache', 'content-length': '220', 'content-type': 'application/x-www-form-urlencoded', 'cookie': 'kaola_user_key=b87e28b9-e7fc-43ba-8ca7-42abae97a079; _ntes_nnid=116c0ca91001bfb53c23f45f9e55ac87,1568617522153; _ga=GA1.2.290138937.1568617522; _klhtxd_=31; _ga=GA1.3.290138937.1568617522; __da_ntes_utma=2525167.417648162.1568617522.1568617522.1568617522.1; davisit=1; __da_ntes_utmz=2525167.1568617522.1.1.; __da_ntes_utmfc=utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none); usertrack=CrGZAV2DFzgLhl54AwtSAg==; KAOLA_NEW_USER_COOKIE=yes; cna=MQj5FQMZD0sCAXxONRZeF0y0; WM_TID=beYPJ03r5ilFBUFUFEZo5jCUV1mKk4PC; t=cf5d799c2331f5cabed38ae64e05e79e; KAOLA_USER_ID=109999078912652422; [email protected]; JSESSIONID-WKL-8IO=0zc3WMz%2Bz0rQe5Jcv1xai4OAOScJJgZviUPXMI3RUo2IYlneCBZYhem2pXj85vvoJ8Z%2B2yMxkJZ%2BDbqGhohayCkj0RWfrbvXgwt00Wju%2BMWVg7WjBsfPPuM6Bq0yJI1vkeq%5C17ndJLsLrHGeY1Sf0k231zopBvGmtXomvGZ5J9TWLbPq%3A1586842936344; davisit=2; __da_ntes_utmb=2525167.1.10.1586756536; _samesite_flag_=true; cookie2=1f50b0bd27965ea6d4731440eb0ab6b2; _tb_token_=57e48eee49e7; csg=7c23ee4b; NTES_OSESS=REPpP5MMDS0ti.Kjs4kXCagwqwIe5DsWd2J6spGZnnoVWWhz6L9pI2HlXPVp_85PuZGCsnYofZ0FK56aZ.uX88iBgdi0zJZsRBB8fdi_YIZfYxQlVYg4kvmcVqVCqK9kxhu.Yzv4Avj3rW.UPrCYFGfnrd5TZovCzX0lNqe3j5rAEWHpYRLXj1PsCx_75evCuvl01iv5jej2sgH2yqYAm2a0p; kaola_csg=93dad892; kaola-user-beta-traffic=12217883524; firstLogin=0; 
hb_MA-AE38-1FCC6CD7201B_source=search.kaola.com; NTES_KAOLA_RV=1537539_1586756945560_0|2884042_1586756792280_0|5522516_1586513810003_0|5705591_1585881322711_0|8317307_1585880658885_0|5553701_1585880652352_0|8517421_1585879009306_0|1467929_1571291229258_0|5218698_1569811431977_0|5536790_1569811422334_0|5457794_1569811411408_0|5115159_1569811404628_0|2843760_1569566707083_0|5481268_1569489750583_0|2723610_1569488978899_0|2546067_1569485553114_0|1758828_1569485116618_0|1616628_1569482665961_0|5111078_1569482641632_0|2482224_1569482624326_0; isg=BHV1IQtJR6edB6MO8FzlgBdJhPHvWigPBiZuAfeb4ewRzpfAv0AP1GEMGNLdjkG8', 'origin': 'https://goods.kaola.com', 'pragma': 'no-cache', 'referer': 'https://goods.kaola.com/review/{}.html'.format( str(goods_id)), 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 'x-requested-with': 'XMLHttpRequest' } form_data = { 'goodsId': '{}'.format(str(goods_id)), 'grade': '0', 'tagType': '0', 'hasContent': '0', 'paginationContext': 'null', 'pageNo': '1', 'pageSize': '20', } try: # time.sleep(0.2) response = requests.post(url=comment_url, headers=headers, data=form_data, proxies=proxies, allow_redirects=False, timeout=30) except: try: # time.sleep(0.2) response = requests.post(url=comment_url, headers=headers, data=form_data, proxies=proxies, allow_redirects=False, timeout=30) except: # time.sleep(0.2) response = requests.post(url=comment_url, headers=headers, data=form_data, proxies=proxies, allow_redirects=False, timeout=30) # print(response.text) data = json.loads(response.text) # 获取评论列表 comments_list = data['data']['commentPage']['result'] if int(len(comments_list)) == 0: return else: # 获取当前页数 page_data = data['data']['commentPage']['pageNo'] # 评价总页数 pages_num = data['data']['commentPage']['totalPage'] # logger.log(31, '*******************第1页评论****************') for item in comments_list: kao_la_dict = dict() time_data = self.time_change(item['createTime']) # print(data_time_data) try: content = item['commentContent'].replace('\n', ' ') except: content = '' # 追加评论 try: comments_2 = item['replyList'][0]['replyContent'] except: comments_2 = '' if self.start_time <= time_data: kao_la_dict['platform'] = goods_dict['platform'] kao_la_dict['date'] = time_data.split(' ')[0] kao_la_dict['time'] = time_data.split(' ')[1] kao_la_dict['keyword'] = goods_dict['keyword'] kao_la_dict['name'] = goods_dict['name'] kao_la_dict['imageurl'] = goods_dict['商品图片'] kao_la_dict['audiourl'] = '' kao_la_dict['url'] = goods_dict['url'] kao_la_dict['shop_name'] = goods_dict['shop_name'] kao_la_dict['user_name'] = '' kao_la_dict['content'] = content + ';' + comments_2 kao_la_dict['content_id'] = str( item['goodsCommentId']) kao_la_dict['brand'] = goods_dict['brand'] kao_la_dict['price'] = goods_dict['price'] kao_la_dict['sales'] = goods_dict['sales'] kao_la_dict['focus_count'] = '' kao_la_dict['comment_num'] = goods_dict[ 'achieve_num'] kao_la_dict['views'] = '' kao_la_dict['likes'] = item['zanCount'] kao_la_dict['comments_count'] = '' kao_la_dict['author_id'] = '' kao_la_dict['reposts_count'] = '' kao_la_dict['topic_id'] = str(item['goodsId']) try: kao_la_dict['type'] = item['skuPropertyList'][ 1]['propertyValue'] except: kao_la_dict['type'] = '' try: kao_la_dict['size'] = item['skuPropertyList'][ 0]['propertyValue'] except: kao_la_dict['size'] = '' kao_la_dict['file_code'] = '176' # print(kao_la_dict) item = json.dumps(dict(kao_la_dict), ensure_ascii=False) + '\n' self.hdfsclient.new_write( 
'/user/cspider_daily/nike_2h/ecommerce/{}/{}/176_{}_KaoLa_nike{}.json' .format(self.date_time, self.h2_name, time.strftime('%Y%m%d'), self.pid), item, encoding='utf-8') else: pass if int(page_data) < int(pages_num): # 获取第一页评论最后一个的id以及下一页从哪页跳转参数 lastId = data['data']['paginationContext']['lastId'] lastPage = data['data']['paginationContext'][ 'lastPage'] # print(lastId, lastPage) self.goods_comments_2(lastId, lastPage, goods_id, goods_dict, int(page_data) + 1) else: pass except: print(22222222222222222, traceback.format_exc()) # 获取第一页之后的所有页面评论 def goods_comments_2(self, lastId, lastPage, goods_id, goods_dict, i): try: comment_url = 'https://goods.kaola.com/commentAjax/comment_list_new.json' # print(comment_url, goods_id, lastId, lastPage) headers = { 'authority': 'goods.kaola.com', 'method': 'POST', 'path': '/commentAjax/comment_list_new.json', 'scheme': 'https', 'accept': '*/*', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'cache-control': 'no-cache', 'content-length': '247', 'content-type': 'application/x-www-form-urlencoded', 'cookie': 'kaola_user_key=b87e28b9-e7fc-43ba-8ca7-42abae97a079; _ntes_nnid=116c0ca91001bfb53c23f45f9e55ac87,1568617522153; _ga=GA1.2.290138937.1568617522; _klhtxd_=31; _ga=GA1.3.290138937.1568617522; __da_ntes_utma=2525167.417648162.1568617522.1568617522.1568617522.1; davisit=1; __da_ntes_utmz=2525167.1568617522.1.1.; __da_ntes_utmfc=utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none); usertrack=CrGZAV2DFzgLhl54AwtSAg==; KAOLA_NEW_USER_COOKIE=yes; cna=MQj5FQMZD0sCAXxONRZeF0y0; WM_TID=beYPJ03r5ilFBUFUFEZo5jCUV1mKk4PC; t=cf5d799c2331f5cabed38ae64e05e79e; KAOLA_USER_ID=109999078912652422; [email protected]; JSESSIONID-WKL-8IO=0zc3WMz%2Bz0rQe5Jcv1xai4OAOScJJgZviUPXMI3RUo2IYlneCBZYhem2pXj85vvoJ8Z%2B2yMxkJZ%2BDbqGhohayCkj0RWfrbvXgwt00Wju%2BMWVg7WjBsfPPuM6Bq0yJI1vkeq%5C17ndJLsLrHGeY1Sf0k231zopBvGmtXomvGZ5J9TWLbPq%3A1586842936344; davisit=2; __da_ntes_utmb=2525167.1.10.1586756536; _samesite_flag_=true; cookie2=1f50b0bd27965ea6d4731440eb0ab6b2; _tb_token_=57e48eee49e7; csg=7c23ee4b; NTES_OSESS=REPpP5MMDS0ti.Kjs4kXCagwqwIe5DsWd2J6spGZnnoVWWhz6L9pI2HlXPVp_85PuZGCsnYofZ0FK56aZ.uX88iBgdi0zJZsRBB8fdi_YIZfYxQlVYg4kvmcVqVCqK9kxhu.Yzv4Avj3rW.UPrCYFGfnrd5TZovCzX0lNqe3j5rAEWHpYRLXj1PsCx_75evCuvl01iv5jej2sgH2yqYAm2a0p; kaola_csg=93dad892; kaola-user-beta-traffic=12217883524; firstLogin=0; hb_MA-AE38-1FCC6CD7201B_source=search.kaola.com; NTES_KAOLA_RV=1537539_1586756945560_0|2884042_1586756792280_0|5522516_1586513810003_0|5705591_1585881322711_0|8317307_1585880658885_0|5553701_1585880652352_0|8517421_1585879009306_0|1467929_1571291229258_0|5218698_1569811431977_0|5536790_1569811422334_0|5457794_1569811411408_0|5115159_1569811404628_0|2843760_1569566707083_0|5481268_1569489750583_0|2723610_1569488978899_0|2546067_1569485553114_0|1758828_1569485116618_0|1616628_1569482665961_0|5111078_1569482641632_0|2482224_1569482624326_0; isg=BHV1IQtJR6edB6MO8FzlgBdJhPHvWigPBiZuAfeb4ewRzpfAv0AP1GEMGNLdjkG8', 'origin': 'https://goods.kaola.com', 'pragma': 'no-cache', 'referer': 'https://goods.kaola.com/review/{}.html'.format(str(goods_id)), 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 'x-requested-with': 'XMLHttpRequest' } form_data = { 'goodsId': '{}'.format(str(goods_id)), 'grade': '0', 'tagType': '0', 'hasContent': '0', 'showSelfGoodsComment': 'false', 'paginationContext': { "lastId": '{}'.format(lastId), "lastPage": '{}'.format(lastPage) }, 'pageNo': '{}'.format(i), 'pageSize': 
'20', 'hasInitCommentTab': 'true' } try: # time.sleep(0.2) response = requests.post(url=comment_url, headers=headers, data=form_data, proxies=proxies, allow_redirects=False, timeout=30) except: try: # time.sleep(0.2) response = requests.post(url=comment_url, headers=headers, data=form_data, proxies=proxies, allow_redirects=False, timeout=30) except: # time.sleep(0.2) response = requests.post(url=comment_url, headers=headers, data=form_data, proxies=proxies, allow_redirects=False, timeout=30) data = json.loads(response.text) # print(data) # 获取评论列表 comments_list = data['data']['commentPage']['result'] # logger.log(31, '**********************第{}页评论**********************'.format(i)) if int(len(comments_list)) == 0: return else: # 获取当前页数 page_data = data['data']['commentPage']['pageNo'] # 评价总页数 pages_num = data['data']['commentPage']['totalPage'] for item in comments_list: kao_la_goods = dict() time_data = self.time_change(item['createTime']) try: content = item['commentContent'].replace('\n', ' ') except: content = '' # 追加评论 try: comments_2 = item['replyList'][0]['replyContent'] except: comments_2 = '' if self.start_time <= time_data: kao_la_goods['platform'] = goods_dict['platform'] kao_la_goods['date'] = time_data.split(' ')[0] kao_la_goods['time'] = time_data.split(' ')[1] kao_la_goods['keyword'] = goods_dict['keyword'] kao_la_goods['name'] = goods_dict['name'] kao_la_goods['imageurl'] = goods_dict['商品图片'] kao_la_goods['audiourl'] = '' kao_la_goods['url'] = goods_dict['url'] kao_la_goods['shop_name'] = goods_dict['shop_name'] kao_la_goods['user_name'] = '' kao_la_goods['content'] = content + ';' + comments_2 kao_la_goods['content_id'] = str( item['goodsCommentId']) kao_la_goods['brand'] = goods_dict['brand'] kao_la_goods['price'] = goods_dict['price'] kao_la_goods['sales'] = goods_dict['sales'] kao_la_goods['focus_count'] = '' kao_la_goods['comment_num'] = goods_dict['achieve_num'] kao_la_goods['views'] = '' kao_la_goods['likes'] = item['zanCount'] kao_la_goods['comments_count'] = '' kao_la_goods['author_id'] = '' kao_la_goods['reposts_count'] = '' kao_la_goods['topic_id'] = str(item['goodsId']) try: kao_la_goods['type'] = item['skuPropertyList'][1][ 'propertyValue'] except: kao_la_goods['type'] = '' try: kao_la_goods['size'] = item['skuPropertyList'][0][ 'propertyValue'] except: kao_la_goods['size'] = '' kao_la_goods['file_code'] = '176' # print(kao_la_goods) item = json.dumps(dict(kao_la_goods), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_2h/ecommerce/{}/{}/176_{}_KaoLa_nike{}.json' .format(self.date_time, self.h2_name, time.strftime('%Y%m%d'), self.pid), item, encoding='utf-8') else: pass if int(page_data) < int(pages_num): # 获取第2页评论最后一个的id以及下一页从哪页跳转参数 lastId = data['data']['paginationContext']['lastId'] lastPage = data['data']['paginationContext']['lastPage'] i += 1 self.goods_comments_2(lastId, lastPage, goods_id, goods_dict, i) else: pass except: print(3333333333333333333, traceback.format_exc()) # # 读取excel获取关键词 # def parse_xlsx(self): # # 设置路径 # path = './快消采集关键词_0916_v3-1.xlsx' # # 打开execl # workbook = xlrd.open_workbook(path) # # # 根据sheet索引或者名称获取sheet内容 # Data_sheet = workbook.sheets()[0] # 通过索引获取 # # rowNum = Data_sheet.nrows # sheet行数 # colNum = Data_sheet.ncols # sheet列数 # # # 获取所有单元格的内容 # list = [] # for i in range(rowNum): # rowlist = [] # for j in range(colNum): # rowlist.append(Data_sheet.cell_value(i, j)) # list.append(rowlist) # # for data in list[1::]: # brand = data[0] # # print(brand) # yield { # '关键词': brand, # } def run(self, 
lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('kaola_2h_url')
            if str(redis_url_num) == '0':
                print(
                    '**********Redis消息队列中url为空.....进程 {} 抓取结束......************'
                    .format(str(os.getpid())))
                lock.release()  # 返回前释放锁,否则其他进程会一直阻塞在 lock.acquire()
                return
            item = self.redis_example.brpop('kaola_2h_url', timeout=3600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_brand(item1)
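# ---------------------------------------------------------------------------
# Sketch (not part of the original code): every request above is wrapped in
# the same nested try/except that simply retries the call a couple of times.
# The helper below shows how that pattern could be factored out once;
# `request_with_retry`, `attempts` and `delay` are illustrative names.
import time
import requests


def request_with_retry(method, url, attempts=3, delay=0.2, **kwargs):
    """Retry an HTTP request a fixed number of times, re-raising the last error."""
    last_exc = None
    for _ in range(attempts):
        try:
            time.sleep(delay)
            return requests.request(method, url, timeout=30, **kwargs)
        except requests.RequestException as exc:
            last_exc = exc
    raise last_exc


# Usage, roughly equivalent to the repeated requests.get / requests.post blocks:
# response = request_with_retry('GET', url, headers=headers, proxies=proxies,
#                               allow_redirects=False)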
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, redis_example): # 时间部分 # 爬虫开始抓取的日期 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] # 爬虫结束的抓取日期 current_time = datetime.now() # 当前日期 current_day = str(current_time).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, current_day)) logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = current_day # 标记爬虫工作 self.is_break = False self.redis_example = redis_example self.pid = os.getpid() # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/ecommerce/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\n', '', ret1) ret3 = re.sub(r'\u3000', '', ret2) ret4 = re.sub(r'品牌:', '', ret3) ret5 = re.sub(r'\xa0', '', ret4) ret6 = re.sub(r'→_→', '', ret5) ret7 = re.sub(r'……', '', ret6) ret8 = re.sub(r'":', '', ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 过滤url里面的#detail def re_detail(self, data): try: message = str(data) ret1 = re.sub(r'#detail', '', message) return ret1 except: pass # 解析请求得到的商品信息 def parse_goods_url(self, data): goods_dict = dict() goods_dict['平台'] = '天猫' goods_dict['URL'] = data['URL'] goods_dict['商品名'] = data['商品名'] goods_dict['价格'] = data['价格'] goods_dict['shop_name'] = data['shop_name'] goods_dict['月销量'] = data['月销量'].replace('人付款', '') goods_dict['关键词'] = data['关键词'] goods_dict['品牌'] = data['品牌'] goods_dict['itemId'] = data['itemId'] goods_dict['sellerId'] = data['sellerId'] goods_dict['imageurl'] = data['商品图片'] goods_dict['audiourl'] = '' # logger.log(31, '***************************正在抓取的商品是:%s.................' 
% goods_dict) self.parse_goods_details(goods_dict) # 解析商品品牌信息 def parse_goods_details(self, goods_dict): try: url = goods_dict['URL'] headers = { 'content-type': 'text/html;charset=GBK', # 'authority': 'detail.tmall.com', # 'method': 'GET', # 'path': path, # 'scheme': 'https', # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', # 'accept-encoding': 'gzip, deflate, br', # 'accept-language': 'zh-CN,zh;q=0.9', # 'cache-control': 'no-cache', 'cookie': 'cq=ccp%3D1; cookie2=1f727f7b9a023d3336775fda77aa6c64; t=e0b1a5b3f801e8ad7974356321ff2384; _tb_token_=33173e5033eee; csg=0cc83e88; dnk=tb9619067_2012; tracknick=tb9619067_2012; lid=tb9619067_2012; enc=x%2FOcdI7JLsr6CpzvtdqWU16H2R3aBtna09TzEkBv9ziurQwe2F0J9TpnrtPlgFZW95bG4nGzcBhhVk6VKxhnTw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; uc1=cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&cookie21=UtASsssme%2BBq&cookie15=URm48syIIVrSKA%3D%3D&existShop=false&pas=0&cookie14=UoTaECbNEKyHyQ%3D%3D&tag=8&lng=zh_CN; UM_distinctid=16d142a6ebc31-00f1b1ad240fa2-37c143e-1fa400-16d142a6ebda91; cna=MQj5FQMZD0sCAXxONRZeF0y0; isg=BAgI5_cywVJRgi3av8Xnqb_92XYasWy7u5UjZsK5VAN2nagHasE8S56fETVItiST; l=cBQfGYbVq1scMYebBOCanurza77OSIRYYuPzaNbMi_5pY6TsXPQOkP9r_F96VjWd9vYB41hTyPJ9-etkqPrdT9dbHZ9R.; CNZZDATA1000427971=618518977-1568000449-https%253A%252F%252Fs.taobao.com%252F%7C1568254167; pnm_cku822=098%23E1hvuvvUvbpvUvCkvvvvvjiPRFLUljtVn2sUQjivPmPWljYWRF5vQjDvRLLyAjlhiQhvCvvvpZpEvpCW9HKm93zEn1m65kx%2FQjZJ%2Bu0XjobyACuXjBrQpWkQRqJ6WeCpqU0QKfUpwy2IAfUTKFyK2ixrV4TJVVQHYWpXei%2BffCuYiLUpVE6FpdyCvm9vvhCvvvvvvvvvBGwvvUjZvvCj1Qvvv3QvvhNjvvvmmvvvBGwvvvUUkphvC9hvpyPwl8yCvv9vvhh%2BFHx7CphCvvOv9hCvvvvPvpvhvv2MMsyCvvpvvhCv', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } try: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, timeout=30) except: try: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, timeout=30) except: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, timeout=30) # response.encoding = 'gbk' # print("正在抓取的链接是: %s" % url) # print(response.text) spuid = re.search( r'"spuId":".*","categoryId"|"spuId":.*?,|amp;spuId=.*?"|spuId=.*?"', response.text) re_spuid = self.re_not_number(spuid.group()) # logger.log(31, 'spuid:', re_spuid) # 将响应转换成一个element对象 # html = etree.HTML(response.text) # # print(html) # # 获取所有品牌节点列表 # pin_pai_list = html.xpath('//li[@id="J_attrBrandName"]/text()|//ul[@class="attributes-list"]/li[1]/text()')[0] # if '品牌' not in pin_pai_list: # pin_pai = html.xpath('//li[@id="J_attrBrandName"]/text()|//ul[@class="attributes-list"]/li[2]/text()')[0] # else: # pin_pai = pin_pai_list # print(goods_url, pin_pai_list[0]) # data_list = [] # 遍历品牌节点列表 goods_dict['spuid'] = re_spuid # print(goods_dict) response.close() self.goods_collection_num(goods_dict) except: print(33333333333333333333, traceback.format_exc()) # 抓取商品收藏数(人气) def goods_collection_num(self, goods_dict): try: url = 'https://count.taobao.com/counter3?callback=jsonp243&keys=SM_368_dsr-{},ICCP_1_{}'.format( goods_dict['sellerId'], goods_dict['itemId']) headers = { 'content-type': 'application/x-javascript', 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; 
enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _uab_collina=155540168306791903478476; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=f32553e159b195a4f17c00010f2bcd2e_1564547678304; _m_h5_tk_enc=3268f7bf49fd78b94768c96e3ef51817; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b227365617263686170703b32223a2266303666326434356635336264366335613639393662663834646632366531644349582b684f6f46454c47473571714e726f325955526f4c4f4459314d5467794d6a4d334f7a453d227d; l=cBaOcPD7qg21z_uyBOfZKurza779uIdf1sPzaNbMiICPO_fh5wONWZFb8t8MCnGVLsI2535t6zUaBXYaGyUIh2nk8b8CgsDd.; isg=BOzsOXiZnf59JomJ--wm9a9SvcreDZFEZ8nHSkYsbxe0UY9baraQ30WjcVnMWcin; JSESSIONID=A9F406FD84CDFD576728A12ECBD98A53', 'upgrade-insecure-requests': '1', 'user-agent': random.choice(user_agent_list) } try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) re_collection_num = re.search(r'":\d{1,20}', response.text) # print(re_collection_num.group()) goods_dict['人气数'] = self.re_html(re_collection_num.group()) # print(goods_dict) response.close() self.parse_goods_comment_num(goods_dict) except: print(444444444444444444, traceback.format_exc()) # 抓取商品评论数 def parse_goods_comment_num(self, goods_dict): try: url = 'https://dsr-rate.tmall.com/list_dsr_info.htm?itemId={}&spuId={}&sellerId={}&groupId&_ksTS=1564105737969_212&callback=jsonp213'.format( goods_dict['itemId'], goods_dict['spuid'], goods_dict['sellerId']) headers = { 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _uab_collina=155540168306791903478476; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=f32553e159b195a4f17c00010f2bcd2e_1564547678304; _m_h5_tk_enc=3268f7bf49fd78b94768c96e3ef51817; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b227365617263686170703b32223a2266303666326434356635336264366335613639393662663834646632366531644349582b684f6f46454c47473571714e726f325955526f4c4f4459314d5467794d6a4d334f7a453d227d; l=cBaOcPD7qg21z_uyBOfZKurza779uIdf1sPzaNbMiICPO_fh5wONWZFb8t8MCnGVLsI2535t6zUaBXYaGyUIh2nk8b8CgsDd.; isg=BOzsOXiZnf59JomJ--wm9a9SvcreDZFEZ8nHSkYsbxe0UY9baraQ30WjcVnMWcin; JSESSIONID=A9F406FD84CDFD576728A12ECBD98A53', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'Referer': 'https://detail.tmall.com/item.htm?spm=a230r.1.14.6.17c44e4c0Tr15d&id=572069385580&cm_id=140105335569ed55e27b&abbucket=1', 'User-Agent': random.choice(user_agent_list) } try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, 
timeout=30) except: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) re_comment_num = re.search(r'{"dsr":{.*}}', response.text) goods_dict['评价人数'] = json.loads( re_comment_num.group())['dsr']['rateTotal'] if int(self.re_not_number(goods_dict['评价人数'])) == 0: # logger.log(31, '---------该商品没有评价数据-------') response.close() pass else: headers1 = { 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': 'https://detail.tmall.com/item.htm?spm=a230r.1.14.16.26804e4ck29eWS&id=597034992998&ns=1&abbucket=1', 'User-Agent': random.choice(user_agent_list) } comment_url = 'https://rate.tmall.com/list_detail_rate.htm?itemId={}&spuId={}&sellerId={}&order=1¤tPage=1'.format( goods_dict['itemId'], goods_dict['spuid'], goods_dict['sellerId']) try: time.sleep(0.2) response1 = requests.get(url=comment_url, headers=headers1, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response1 = requests.get(url=comment_url, headers=headers1, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response1 = requests.get(url=comment_url, headers=headers1, proxies=proxies, allow_redirects=False, timeout=30) comment_data = response1.text comment = re.search( r'{"rateDetail":{"rateCount":{"total":.*"tags":\[]}}', comment_data) pages_data = json.loads( comment.group())['rateDetail']['paginator']['items'] if int(pages_data) == 0: pass else: pages_num = int(math.ceil(float(int(pages_data) / 20))) response1.close() response.close() self.goods_comments(goods_dict, pages_num) except: print(5555555555555555555555, traceback.format_exc()) # 解析商品评论 def goods_comments(self, goods_dict, pages_num): try: is_break = self.is_break # print(goods_dict) itemId = goods_dict['itemId'] sellerId = goods_dict['sellerId'] spuId = goods_dict['spuid'] headers = { 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; 
_m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': 'https://detail.tmall.com/item.htm?spm=a230r.1.14.16.26804e4ck29eWS&id=597034992998&ns=1&abbucket=1', 'User-Agent': random.choice(user_agent_list) } if int(pages_num) >= 99: pages = 99 else: pages = pages_num # logger.log(31, '-------------评论总页数是:%s --------------' % pages) # 抓取商品评论链接(总共99页,从1开始) for i in range(1, int(pages) + 1): comment_url = 'https://rate.tmall.com/list_detail_rate.htm?itemId={}&spuId={}&sellerId={}&order=1¤tPage={}'.format( itemId, spuId, sellerId, i) # print(comment_url) # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10) try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) comment_data = response.text # logger.log(31, '开始抓取评论') # print(comment_data) comment = re.search( r'{"rateDetail":{"rateCount":{"total":.*"tags":\[]}}', comment_data) # print(comment.group()) items = json.loads(comment.group())['rateDetail']['rateList'] # print(items) goods_data = dict() for item in items: date_data = item['rateDate'].split(' ', 1)[0] time_data = item['rateDate'].split(' ', 1)[1] # print('评论时间', date_data, time_data) try: content = item['appendComment']['content'] except: content = '' # 判断评论时间是否在规定的抓取时间内 if self.start_time <= date_data.strip(): goods_data['platform'] = goods_dict['平台'] goods_data['date'] = date_data.strip() goods_data['time'] = time_data.strip() goods_data['keyword'] = goods_dict['关键词'] goods_data['name'] = goods_dict['商品名'] goods_data['url'] = goods_dict['URL'] goods_data['shop_name'] = goods_dict['shop_name'] goods_data['user_name'] = item['displayUserNick'] goods_data['content'] = self.re_html( item['rateContent']) + ';' + str( self.re_html(content)) goods_data['content_id'] = str(item['id']) goods_data['brand'] = goods_dict['品牌'] goods_data['price'] = goods_dict['价格'] goods_data['sales'] = goods_dict['月销量'] goods_data['focus_count'] = goods_dict['人气数'] goods_data['comment_num'] = goods_dict['评价人数'] goods_data['views'] = '' goods_data['likes'] = '' goods_data['comments_count'] = '' goods_data['author_id'] = '' goods_data['reposts_count'] = '' goods_data['topic_id'] = str(goods_dict['itemId']) # 判断size和type test_data = item['auctionSku'] if '分类' in test_data: goods_data['type'] = test_data.split( ':')[1].replace(';尺码:', '').replace(';鞋码', '').replace( ';尺码', '') try: goods_data['size'] = test_data.split( ':')[2].split(';')[0] except: try: goods_data['size'] = test_data.split( ':')[2] except: goods_data['size'] = '' else: goods_data['type'] = '' goods_data['size'] = '' goods_data['imageurl'] = goods_dict['imageurl'] goods_data['audiourl'] = goods_dict['audiourl'] goods_data['file_code'] = '50' # logger.log(31, '--------********开始写入商品数据********--------') # print(goods_data) item = json.dumps(dict(goods_data), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/ecommerce/{}/50_{}_{}_Tmall_nike{}.json' 
.format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.time_data, self.pid), item, encoding='utf-8') if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(7777777777777777777, traceback.format_exc()) def run(self, lock): # f = open('E:/chance/电商2/taobao/parse_keyword/pk2_Tmall_url_NIKE.json', 'r', encoding='utf-8') # goods_data_list = [] # for line in f.readlines(): # dic_data = json.loads(line) # goods_data_list.append(dic_data) # for data in goods_data_list: # self.parse_goods_url(data) for num in range(1000000): lock.acquire() redis_url_num = self.redis_example.llen('Tmall_day_url') if str(redis_url_num) == '0': print('*******Redis消息队列中url为空,程序等待中...进程 {} 等待中....******'. format(str(os.getpid()))) item = self.redis_example.brpop('Tmall_day_url', timeout=600)[1] lock.release() item1 = json.loads(item.decode()) # print(item) self.parse_goods_url(item1)
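# ---------------------------------------------------------------------------
# Sketch (not part of the original code): the Tmall spider pages comments 20
# per request and only walks the first 99 pages that list_detail_rate.htm
# serves; the '¤tPage' seen in the flattened source is an HTML-entity
# artifact of '&currentPage'. `comment_pages` and `build_comment_url` are
# illustrative names.
import math


def comment_pages(total_comments, per_page=20, max_pages=99):
    """ceil(total / per_page), capped at the 99 pages the endpoint exposes."""
    return min(int(math.ceil(total_comments / float(per_page))), max_pages)


def build_comment_url(item_id, spu_id, seller_id, page):
    return ('https://rate.tmall.com/list_detail_rate.htm'
            '?itemId={}&spuId={}&sellerId={}&order=1&currentPage={}'
            .format(item_id, spu_id, seller_id, page))


# comment_pages(41)   -> 3
# comment_pages(5000) -> 99 (capped, matching the `if int(pages_num) >= 99` branch)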
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, redis_example): # 时间部分,按小时抓取 date_time = str(datetime.now() - timedelta(days=1)).split('.')[0] start_time_test = time.strftime('%Y-%m-%d 00:00:00') end_time = time.strftime('%Y-%m-%d %H:%M:%S') a = end_time.split(' ')[1].split(':')[0] if a == '00': start_time_data = date_time hours_name = '22_24' wen_jian_jia_date = str(datetime.now() - timedelta( days=1)).split('.')[0].split(' ')[0].replace('-', '') else: two_hours_ago = int(a) - 2 if len(str(two_hours_ago)) == 1: two_hour_ago = '0' + str(two_hours_ago) else: two_hour_ago = str(two_hours_ago) hours_name = str(two_hour_ago) + '_' + str(a) start_time_data = start_time_test wen_jian_jia_date = time.strftime('%Y%m%d') print('爬取时间段:{}到{}'.format(start_time_data, end_time)) logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = start_time_data # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = end_time # 标记爬虫工作 self.is_break = False self.redis_example = redis_example self.pid = os.getpid() self.h2_name = hours_name self.date_time = wen_jian_jia_date # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format( wen_jian_jia_date, hours_name)) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\n', '', ret1) ret3 = re.sub(r'\u3000', '', ret2) ret4 = re.sub(r'品牌:', '', ret3) ret5 = re.sub(r'\xa0', '', ret4) ret6 = re.sub(r'→_→', '', ret5) ret7 = re.sub(r'……', '', ret6) ret8 = re.sub(r'":', '', ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 过滤url里面的#detail def re_detail(self, data): try: message = str(data) ret1 = re.sub(r'#detail', '', message) return ret1 except: pass # 过滤品牌 def re_pin_pai(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'<li title.*?>', '', ret1) ret3 = re.sub(r'品牌: ', '', ret2) return ret3 except: pass # 解析请求得到的商品信息 def parse_goods_url(self, data): goods_dict = dict() goods_dict['平台'] = '淘宝' goods_dict['URL'] = data['URL'] goods_dict['商品名'] = data['商品名'] try: goods_dict['品牌'] = data['品牌'] except: goods_dict['品牌'] = '' goods_dict['价格'] = data['价格'] goods_dict['shop_name'] = data['shop_name'] goods_dict['月销量'] = data['月销量'].replace('人付款', '') goods_dict['关键词'] = data['关键词'] goods_dict['itemId'] = data['itemId'] goods_dict['sellerId'] = data['sellerId'] goods_dict['imageurl'] = data['商品图片'] goods_dict['audiourl'] = '' # logger.log(31, '************************正在抓取的商品是:%s................' 
% goods_dict) self.goods_collection_num(goods_dict) # 抓取商品收藏数(人气) def goods_collection_num(self, goods_dict): try: url = 'https://count.taobao.com/counter3?callback=jsonp235&keys=ICCP_1_{}'.format( goods_dict['itemId']) headers = { 'content-type': 'application/x-javascript', 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _uab_collina=155540168306791903478476; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=f32553e159b195a4f17c00010f2bcd2e_1564547678304; _m_h5_tk_enc=3268f7bf49fd78b94768c96e3ef51817; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b227365617263686170703b32223a2266303666326434356635336264366335613639393662663834646632366531644349582b684f6f46454c47473571714e726f325955526f4c4f4459314d5467794d6a4d334f7a453d227d; l=cBaOcPD7qg21z_uyBOfZKurza779uIdf1sPzaNbMiICPO_fh5wONWZFb8t8MCnGVLsI2535t6zUaBXYaGyUIh2nk8b8CgsDd.; isg=BOzsOXiZnf59JomJ--wm9a9SvcreDZFEZ8nHSkYsbxe0UY9baraQ30WjcVnMWcin; JSESSIONID=A9F406FD84CDFD576728A12ECBD98A53', 'upgrade-insecure-requests': '1', 'user-agent': random.choice(user_agent_list) } try: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) re_collection_num = re.search(r'":\d{1,20}', response.text) # print(re_collection_num.group()) goods_dict['人气数'] = self.re_html(re_collection_num.group()) # print(goods_dict) response.close() self.parse_goods_comment_num(goods_dict) except: print(444444444444444444, traceback.format_exc()) # 抓取商品评论数 def parse_goods_comment_num(self, goods_dict): try: url = 'https://rate.taobao.com/detailCommon.htm?auctionNumId={}&userNumId={}'.format( goods_dict['itemId'], goods_dict['sellerId']) headers = { 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': 'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1', 'User-Agent': random.choice(user_agent_list) } try: time.sleep(0.2) response = 
requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # print('11111') # print(response.text) if 'total' in response.text: re_comment_num = json.loads( response.text.replace('(', '').replace(')', '')) # print(re_comment_num) goods_dict['评价人数'] = re_comment_num['data']['count']['total'] # print(goods_dict['评价人数']) if int(self.re_not_number(goods_dict['评价人数'])) == 0: # logger.log(31, '-----------该商品没有评论数据--------------') pass else: itemId = goods_dict['itemId'] sellerId = goods_dict['sellerId'] headers1 = { 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': 'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1', 'User-Agent': random.choice(user_agent_list) } comment_url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&userNumId={}¤tPageNum=1&pageSize=20&rateType=&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0'.format( itemId, sellerId) try: time.sleep(0.3) response1 = requests.get(url=comment_url, headers=headers1, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.3) response1 = requests.get(url=comment_url, headers=headers1, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.3) response1 = requests.get(url=comment_url, headers=headers1, proxies=proxies, allow_redirects=False, timeout=30) re_pages = re.search( r'{"qnaDisabled":true,"watershed":.*"maxPage":.*}', response1.text) comment_nums = json.loads(re_pages.group())['total'] if int(comment_nums) == 0: pass else: pages_num = int( math.ceil(float(int(comment_nums) / 20))) response.close() response1.close() self.goods_comments(goods_dict, pages_num) except: print(5555555555555555555555, traceback.format_exc()) # 解析商品评论 def goods_comments(self, goods_dict, pages_num): try: is_break = self.is_break # print(goods_dict) itemId = goods_dict['itemId'] sellerId = goods_dict['sellerId'] headers = { 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; 
v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': 'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1', 'User-Agent': random.choice(user_agent_list) } # print('----------------商品评论总页数是: %s -----------------------' % pages_num) # 抓取商品评论链接(总共99页,从1开始) for i in range(1, int(pages_num) + 1): comment_url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&userNumId={}¤tPageNum={}&pageSize=20&rateType=&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0'.format( itemId, sellerId, i) # print(comment_url) # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10) try: time.sleep(0.3) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.3) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.3) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) comment_data = response.text # print('开始抓取评论') # print(comment_data) comment = re.search( r'{"qnaDisabled":true,"watershed":.*"maxPage":.*}', comment_data) # print(comment.group()) items = json.loads(comment.group())['comments'] # print(items) goods_data = dict() for item in items: # if item['date'] != None: # time_test = item['date'].split(' ')[0].replace('年', '-').replace('月', '-').replace('日', '') + ' ' + item['date'].split(' ')[1] + ':00' date_data = item['date'].split(' ')[0].replace( '年', '-').replace('月', '-').replace('日', '') try: time_data = item['date'].split(' ')[1] + ':00' except: time_data = '00:00:00' # print('评论时间', date_data, time_data) try: content = item['content'] except: content = '' # 追加评论 try: comments_2 = item['appendList'][0]['content'] except: comments_2 = '' time_test = date_data + ' ' + time_data # 判断评论时间是否在规定的抓取时间内 if self.start_time <= time_test: goods_data['platform'] = goods_dict['平台'] goods_data['date'] = date_data.strip() goods_data['time'] = time_data.strip() goods_data['keyword'] = goods_dict['关键词'] goods_data['name'] = goods_dict['商品名'] goods_data['url'] = goods_dict['URL'] goods_data['shop_name'] = goods_dict['shop_name'] goods_data['user_name'] = item['user']['nick'] goods_data['content'] = content + ';' + comments_2 goods_data['content_id'] = str(item['rateId']) goods_data['brand'] = goods_dict['品牌'] goods_data['price'] = goods_dict['价格'] goods_data['sales'] = goods_dict['月销量'] goods_data['focus_count'] = goods_dict['人气数'] goods_data['comment_num'] = goods_dict['评价人数'] goods_data['views'] = '' goods_data['likes'] = item['useful'] goods_data['comments_count'] = '' goods_data['author_id'] = '' goods_data['reposts_count'] = '' goods_data['topic_id'] = str(goods_dict['itemId']) # 判断size和type 颜色分类:黑色高帮  尺码:37 test_data = item['auction']['sku'] if '码' in test_data: goods_data['type'] = test_data.split( 
':')[1].replace('尺码', '').replace( '  ', '').replace('鞋码', '').replace(';尺码', '') goods_data['size'] = test_data.split(':')[2] else: goods_data['type'] = '' goods_data['size'] = '' # print('--------********开始写入商品数据********--------') # print(goods_data) goods_data['imageurl'] = goods_dict['imageurl'] goods_data['audiourl'] = goods_dict['audiourl'] goods_data['file_code'] = '55' # logger.log(31, '--------********开始写入商品数据********--------') # print(goods_data) item = json.dumps(dict(goods_data), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_2h/ecommerce/{}/{}/55_{}_TaoBao_nike{}.json' .format(self.date_time, self.h2_name, time.strftime('%Y%m%d'), str(self.pid)), item, encoding='utf-8') if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(7777777777777777777, traceback.format_exc()) def run(self, lock): for i in range(1000000): lock.acquire() redis_url_num = self.redis_example.llen('tao_bao_2h') if str(redis_url_num) == '0': print( '******Redis消息队列中url为空,程序等待中...进程{}等待中...*********'.format( str(os.getpid()))) item = self.redis_example.brpop('tao_bao_2h', timeout=3600)[1] lock.release() item1 = json.loads(item.decode()) # print(item) self.parse_goods_url(item1)
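# ---------------------------------------------------------------------------
# Sketch (not part of the original code): the Taobao spider recovers colour
# and size by chaining split(':') over auctionSku strings such as
# '颜色分类:黑色高帮  尺码:37', which breaks when either field is missing. A more
# tolerant parser under the same assumed "label:value" format; `parse_sku` is
# an illustrative name.
def parse_sku(sku_text):
    """Turn '颜色分类:黑色高帮  尺码:37' into {'颜色分类': '黑色高帮', '尺码': '37'}."""
    result = {}
    # Fields are separated by (possibly full-width) whitespace; str.split()
    # without arguments also splits on \xa0 and \u3000. Full-width colons are
    # normalised to ASCII first.
    for part in str(sku_text).replace(':', ':').split():
        if ':' in part:
            label, _, value = part.partition(':')
            result[label.strip()] = value.strip()
    return result


# Usage, roughly matching the type/size logic in goods_comments() above:
# sku = parse_sku(item['auction']['sku'])
# goods_data['type'] = sku.get('颜色分类', '')
# goods_data['size'] = sku.get('尺码', sku.get('鞋码', ''))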
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self): # 时间部分 # 爬虫开始抓取的日期 date = datetime.now() - timedelta(days=7) news_start_time = str(date).split(' ')[0] # 爬虫结束的抓取日期 current_time = datetime.now() # 当前日期 current_day = str(current_time).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, current_day)) logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = current_day # 标记爬虫工作1 self.is_break = False # 标记爬虫工作2 self.is_work = False self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/forum/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r' -', '', ret1) ret3 = re.sub( r' ', '', ret2) ret4 = re.sub(r"hot\(.*\d?','", '', ret3) ret5 = re.sub(r'\[', '', ret4) ret6 = re.sub(r'\]', '', ret5) ret7 = re.sub(r"',", "", ret6) ret8 = re.sub(r"'", "", ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 匹配具体时间 def clean_date(self, x): now = datetime.now() if str(x).find('昨天') != -1: x = datetime.strftime(now + timedelta(days=-1), '%Y-%m-%d %H:%M:%S') elif str(x).find('前天') != -1: x = datetime.strftime(now + timedelta(days=-2), '%Y-%m-%d %H:%M:%S') elif str(x).find('天前') != -1: x = datetime.strftime( now + timedelta(days=-int(str(x).replace('天前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('小时前') != -1: x = datetime.strftime( now + timedelta(hours=-int(str(x).replace('小时前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('分钟前') != -1: x = datetime.strftime( now + timedelta(minutes=-int(str(x).replace('分钟前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('今天') != -1: x = str(x).replace('今天', now.strftime('%Y-%m-%d') + ' ') elif str(x).find('刚刚') != -1: x = now.strftime('%Y-%m-%d %H:%M:%S') elif str(x).find('秒前') != -1: x = now.strftime('%Y-%m-%d %H:%M:%S') elif str(x).find('月前') != -1: x = datetime.strftime( now + timedelta(weeks=-4 * int(str(x).replace('月前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('周前') != -1: x = datetime.strftime( now + timedelta(weeks=-int(str(x).replace('周前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('[') != -1: x = x.replace('[', '').replace(']', '') elif str(x).find('月') != -1: x = x.replace('月', '-').replace('日', '') return x # 根据关键词搜索请求得到帖子信息 def parse_goods(self, key_word): try: print('正在抓取的关键词是:%s' % key_word) insert_time = time.strftime('%Y-%m-%d %H:%M:%S') key_word_data = urllib.parse.quote(key_word) is_break = self.is_break url = 'http://so.hualongxiang.com/?keyword={}&desc=time' # print(url) headers = { # 'Content-Type': 'text/html; charset=UTF-8', # 'Host': 'so.hualongxiang.com', # 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } proxies_list = [ { "http": "222.89.32.136:9999" }, # {"http": "117.80.86.239:3128"} ] # print('调用代理是:%s' % random.choice(proxies_list)) time.sleep(10) response = requests.get(url=url.format(key_word), headers=headers, allow_redirects=False) # 将响应转换成一个element对象 html = etree.HTML(response.text) # 获取帖子总数 topic_num = self.re_not_number( 
html.xpath('//div[@class="wapper"]/p/text()')[0].split(',')[0]) # print(topic_num) if int(topic_num) == 0: logger.log( 31, '*******-------关键词:%s 搜索不到内容-------*******' % key_word) else: # 获取帖子页数 pages_num = int(math.ceil(float(int(topic_num) / 20))) # logger.log(31, '关键词: %s , 搜索帖子总数是: %s , 帖子总页数是:%s' % (key_word, topic_num, pages_num)) for i in range(1, int(pages_num) + 1): topic_url = 'http://so.hualongxiang.com/search/index?keyword={}&desc=time&page={}'.format( key_word, i) # print(topic_url, '调用代理是:%s' % random.choice(proxies_list)) time.sleep(10) response1 = requests.get(url=topic_url, headers=headers, allow_redirects=False) # 将响应转换成一个element对象 html1 = etree.HTML(response1.text) # 获取帖子列表 topic_list = html1.xpath( '//div[@class="shopper-list-long"]/ul/li') # print(len(topic_list)) hua_long_xiang = dict() # 遍历帖子列表 for data in topic_list: date_time_data = self.clean_date( self.re_html( data.xpath('./div[@class="time"]/span/text()') [1])) # print(date_time_data) date_data = date_time_data.split(' ')[0].strip() if self.start_time <= date_data: hua_long_xiang['platform'] = '化龙巷' hua_long_xiang['date'] = date_data hua_long_xiang['insert_time'] = insert_time hua_long_xiang['author'] = data.xpath( './div[@class="time"]/a/text()')[0] hua_long_xiang['author_url'] = data.xpath( './div[@class="time"]/a/@href')[0] hua_long_xiang['post_client'] = '化龙巷APP' hua_long_xiang['title'] = self.re_html( data.xpath('./div[@class="title"]/a/@onclick') [0]).replace("'", '').replace(')', '') hua_long_xiang['url'] = data.xpath( './div[@class="title"]/a/@href')[0] hua_long_xiang['content_id'] = self.re_not_number( hua_long_xiang['url']) hua_long_xiang['brand'] = '' hua_long_xiang['carseries'] = '' hua_long_xiang['series_url'] = '' # print(hua_long_xiang) response.close() response1.close() self.parse_topic_data(hua_long_xiang['url'], hua_long_xiang) if date_data < self.start_time: is_break = True if is_break: logger.log(31, '没有符合时间的帖子') break except: print(111111111111111111111, traceback.format_exc()) # 解析帖子内容 def parse_topic_data(self, url, hua_long_xiang): try: headers = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', # 'Accept-Encoding': 'gzip, deflate', # 'Accept-Language': 'zh-CN,zh;q=0.9', # 'Cache-Control': 'no-cache', # 'Connection': 'keep-alive', 'Cookie': 'srcurl=687474703a2f2f7777772e6875616c6f6e677869616e672e636f6d2f6368617a756f2f3135303537343135;f04e6_lastpos=T15057415;f04e6_ipstate=1573461495;security_session_verify=acc65a1e29d3f4b165840dab4d94db31;security_session_mid_verify=428b31ce793e13908b5c599759e876a4;f04e6_lastvisit=19992%091573462167%09%2Frewrite.php%3Fpychazuo%2F15057415;f04e6_ci=read%091573462167%0915057415%09103;Hm_lpvt_82d62f38b0397423b12572434961fe6c=1573462167', # 'Host': 'www.hualongxiang.com', # 'Pragma': 'no-cache', # 'Referer': 'http://www.hualongxiang.com/chazuo/14994153?security_verify_data=313932302c31303830', # 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } proxies_list = [ { "http": "222.89.32.136:9999" }, # {"http": "117.80.86.239:3128"} ] # print(url, '调用代理是:%s' % random.choice(proxies_list)) try: time.sleep(10) response = requests.get(url=url, headers=headers, allow_redirects=False) except: try: time.sleep(10) response = requests.get(url=url, headers=headers, allow_redirects=False) except: time.sleep(10) response = requests.get(url=url, headers=headers, 
allow_redirects=False) response.encoding = 'gbk' if '发表于' not in response.text: logger.log(31, '主贴:' + url + '请求失败,重新发起请求') time.sleep(20) self.parse_topic_data(url, hua_long_xiang) else: # 将响应转换成一个element对象 html = etree.HTML(response.text) # print(response.text) # 获取发帖时间 time_data_test = re.search(r'发表于:.*?</p>', response.text) time_test = time_data_test.group().replace('发表于:', '').replace( '</p>', '').split(' ')[1] lang = len(time_test.split(':')) if int(lang) == 3: time_data = time_data_test else: time_data = time_test.split( ':')[0] + ':' + time_test.split(':')[1] + ':' + '00' hua_long_xiang['time'] = time_data hua_long_xiang['content'] = self.re_html( html.xpath( '//div[@class="fs16 mb10" and @id="read_tpc"]//text()') ).replace('\\r', '').replace('\\n', '').replace('\\t', '').replace('\\xa0', '') hua_long_xiang['imageurl'] = html.xpath( '//div[@class="fs16 mb10" and @id="read_tpc"]//img/@src') hua_long_xiang['audiourl'] = '' hua_long_xiang['from'] = '' hua_long_xiang['is_topics'] = '是' hua_long_xiang['floor'] = html.xpath( '//div[@class="fr gc6"]/a[2]/text()')[0] hua_long_xiang['author_id'] = self.re_not_number( re.search( r"uid=.*?'", html.xpath('//div[@class="fr gc6"]/a[1]/@onclick') [0]).group()) hua_long_xiang['identification'] = '' hua_long_xiang['favorite'] = '' hua_long_xiang['signin_time'] = '' hua_long_xiang['reply_no'] = html.xpath( '//ul[@class="data"]/li[2]/span/text()')[0] hua_long_xiang['views'] = html.xpath( '//ul[@class="data"]/li[1]/span/text()')[0] hua_long_xiang['likes'] = '' hua_long_xiang['is_elite'] = '' hua_long_xiang['topic_count'] = html.xpath( '//span[@class="user-info2" and @id="showface_0"]/ul/li[1]/a/text()' )[0] hua_long_xiang['reply_count'] = '' hua_long_xiang['pick_count'] = '' hua_long_xiang['follows'] = '' hua_long_xiang['topic_categroy'] = '' hua_long_xiang['topic_type'] = '' hua_long_xiang['reposts_count'] = '' hua_long_xiang['update_time'] = time.strftime( '%Y-%m-%d %H:%M:%S') hua_long_xiang['topic_id'] = hua_long_xiang['content_id'] hua_long_xiang['file_code'] = '187' # logger.log(31, '----------------正在写入主贴----------------') # print(hua_long_xiang) response.close() item = json.dumps(dict(hua_long_xiang), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/forum/{}/187_{}_{}_hualongxiang_Nike.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.time_data), item, encoding='utf-8') if int(hua_long_xiang['reply_no']) == 0: logger.log(31, '没有回帖') else: # 获取回帖页数 pages_num = int( math.ceil(float(int(hua_long_xiang['reply_no']) / 25))) for i in range(pages_num, 0, -1): url_topic = 'http://www.hualongxiang.com/read.php?tid={}&pd=0&page={}'.format( hua_long_xiang['content_id'], i) self.parse_reply(url_topic, hua_long_xiang) except: print(url, '请求主贴失败,重新发起请求') time.sleep(20) self.parse_topic_data(url, hua_long_xiang) print(222222222222222222222, traceback.format_exc()) # 抓取回帖内容 def parse_reply(self, url_topic, hua_long_xiang): try: is_work = self.is_work start_time = time.strftime('%Y-%m-%d %H:%M:%S') headers = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', # 'Accept-Encoding': 'gzip, deflate', # 'Accept-Language': 'zh-CN,zh;q=0.9', # 'Cache-Control': 'no-cache', # 'Connection': 'keep-alive', 'Cookie': 
'srcurl=687474703a2f2f7777772e6875616c6f6e677869616e672e636f6d2f6368617a756f2f3135303537343135;f04e6_lastpos=T15057415;f04e6_ipstate=1573461495;security_session_verify=acc65a1e29d3f4b165840dab4d94db31;security_session_mid_verify=428b31ce793e13908b5c599759e876a4;f04e6_lastvisit=19992%091573462167%09%2Frewrite.php%3Fpychazuo%2F15057415;f04e6_ci=read%091573462167%0915057415%09103;Hm_lpvt_82d62f38b0397423b12572434961fe6c=1573462167', # 'Host': 'www.hualongxiang.com', # 'Pragma': 'no-cache', # 'Referer': 'http://www.hualongxiang.com/chazuo/14994153?security_verify_data=313932302c31303830', # 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } proxies_list = [ { "http": "222.89.32.136:9999" }, # {"http": "117.80.86.239:3128"} ] # print('调用代理是:%s' % random.choice(proxies_list)) try: time.sleep(10) response = requests.get(url=url_topic, headers=headers, allow_redirects=False) except: try: time.sleep(10) response = requests.get(url=url_topic, headers=headers, allow_redirects=False) except: time.sleep(10) response = requests.get(url=url_topic, headers=headers, allow_redirects=False) response.encoding = 'gbk' if '发表于' not in response.text: logger.log(31, '回帖:' + url_topic + '请求失败,重新发起请求') time.sleep(20) self.parse_reply(url_topic, hua_long_xiang) else: # 将响应转换成一个element对象 html = etree.HTML(response.text) reply_dict = dict() # 获取回帖列表 reply_list = html.xpath( '//div[@class="read_t"]/table[@class="floot"]') for item in reply_list[::-1]: floor_data = item.xpath( './tr[1]/td[2]/div[2]/div/a[2]/text()')[0] print(floor_data) if floor_data == '楼主' or floor_data == '置顶': pass else: url_data = response.url floor_test = floor_data date_time_test = item.xpath( './tr[1]/td[2]/div[2]/p/text()')[0].replace( '发表于 ', '').strip() print(date_time_test) # 发表日期 date_data = date_time_test.split(' ')[0].strip() # 发表时间 time_data_test = date_time_test.split(' ')[1] lang = len(time_data_test.split(':')) if int(lang) == 3: time_data = time_data_test.strip() else: time_data = (time_data_test.split(':')[0] + ':' + time_data_test.split(':')[1] + ':' + '00').strip() if self.start_time <= date_data: reply_dict['platform'] = hua_long_xiang['platform'] reply_dict['date'] = date_data reply_dict['time'] = time_data reply_dict['author'] = item.xpath( './tr[1]/td[1]/div/div[1]/span[3]/text()')[0] reply_dict['author_url'] = item.xpath( './tr[1]/td[1]/div/div[2]/a/@href')[0] reply_dict['author_id'] = self.re_not_number( re.search( r"uid=.*?'", item.xpath( './tr[1]/td[2]/div[2]/div/a[1]/@onclick' )[0]).group()) reply_dict['post_client'] = hua_long_xiang[ 'post_client'] reply_dict['title'] = hua_long_xiang['title'] reply_dict['content'] = item.xpath( './tr[1]/td[2]/div[4]/div/div[2]/text()')[0] reply_dict['imageurl'] = '' reply_dict['audiourl'] = '' reply_dict['content_id'] = self.re_not_number( item.xpath( './tr[1]/td[2]/div[4]/div/div[2]/@id')[0]) reply_dict['brand'] = '' reply_dict['carseries'] = '' reply_dict['from'] = '' reply_dict['series_url'] = '' reply_dict['url'] = url_data reply_dict['is_topics'] = '否' reply_dict['floor'] = floor_test reply_dict['identification'] = '' reply_dict['favorite'] = '' reply_dict['signin_time'] = '' reply_dict['reply_no'] = '' reply_dict['views'] = '' reply_dict['likes'] = '' reply_dict['is_elite'] = '' reply_dict['topic_count'] = item.xpath( './tr[1]/td[1]/div/span/ul/li[1]/a/text()')[0] reply_dict['reply_count'] = '' reply_dict['pick_count'] = '' reply_dict['follows'] = '' 
reply_dict['topic_categroy'] = '' reply_dict['topic_type'] = '' reply_dict['reposts_count'] = '' reply_dict['insert_time'] = start_time reply_dict['update_time'] = time.strftime( '%Y-%m-%d %H:%M:%S') reply_dict['topic_id'] = hua_long_xiang['topic_id'] reply_dict['file_code'] = '187' # logger.log(31, '******************开始写入回帖数据**********************') # print(reply_dict) response.close() item = json.dumps(dict(reply_dict), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/forum/{}/187_{}_{}_hualongxiang_Nike.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.time_data), item, encoding='utf-8') if date_data < self.start_time: is_work = True if is_work: break except: print(url_topic, '请求回贴失败,重新发起请求') time.sleep(20) self.parse_topic_data(url_topic, hua_long_xiang) print(333333333333333333333, traceback.format_exc()) # 读取excel获取关键词 def parse_xlsx(self): # 设置路径 path = './快消采集关键词_v12_20200119.xlsx' # 打开execl workbook = xlrd.open_workbook(path) # 根据sheet索引或者名称获取sheet内容 Data_sheet = workbook.sheets()[0] # 通过索引获取 rowNum = Data_sheet.nrows # sheet行数 colNum = Data_sheet.ncols # sheet列数 # 获取所有单元格的内容 list = [] for i in range(rowNum): rowlist = [] for j in range(colNum): rowlist.append(Data_sheet.cell_value(i, j)) list.append(rowlist) for data in list[1::]: brand = data[0] # print(brand) yield { '关键词': brand, } def run(self): key_word_list = [] for item in self.parse_xlsx(): key_word_list.append(item) for item_data in key_word_list: self.parse_goods(item_data['关键词'])
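# ---------------------------------------------------------------------------
# Sketch (not part of the original code): clean_date() above turns relative
# Chinese timestamps ('昨天', '3小时前', '刚刚' ...) into absolute datetimes with a
# long if/elif chain. A table-driven version of the same idea; the names below
# are illustrative and, like the original, '月前' is approximated as 4 weeks.
import re
from datetime import datetime, timedelta

_UNITS = {
    '秒前': timedelta(seconds=1),
    '分钟前': timedelta(minutes=1),
    '小时前': timedelta(hours=1),
    '天前': timedelta(days=1),
    '周前': timedelta(weeks=1),
    '月前': timedelta(weeks=4),
}


def normalize_relative_date(text, now=None):
    """Return a 'Y-m-d H:M:S' string for a relative Chinese timestamp."""
    now = now or datetime.now()
    text = str(text)
    if '刚刚' in text:
        return now.strftime('%Y-%m-%d %H:%M:%S')
    if '昨天' in text:
        return (now - timedelta(days=1)).strftime('%Y-%m-%d %H:%M:%S')
    if '前天' in text:
        return (now - timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
    for suffix, unit in _UNITS.items():
        if text.endswith(suffix):
            count = int(re.sub(r'\D', '', text))
            return (now - count * unit).strftime('%Y-%m-%d %H:%M:%S')
    return text  # already an absolute date, leave it unchanged


# normalize_relative_date('3小时前')   -> three hours before `now`
# normalize_relative_date('昨天 21:30') -> yesterday's date, clock time taken from
#                                          `now`, matching the original clean_date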
class Spider(object): """ 网易体育新闻 """ def __init__(self, file_path, comment_path, need_time): self.headers_one = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36' } # 评论接口模板 self.comment_port_url = 'http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/{}/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset={}&callback=jsonp_1542355418897&_=1542355418898' # # get_now_time = time.time() - 86400 # get_now_time = time.time() - int(need_time) # time_local = time.localtime(float(get_now_time)) # # 转换成新的时间格式(2016-05-05 20:28:54) # dt = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" # end_t = time.time() # time_local = time.localtime(float(end_t)) # # 转换成新的时间格式(2016-05-05 20:28:54) # end_dt = time.strftime("%Y-%m-%d %H:%M", time_local) # "%Y-%m-%d %H:%M:%S" # # end_time = str(end_time).split(' ')[0] # logging.log(31, '爬取时间段:{}到{}'.format(dt, str(datetime.now()))) # # 定义开始时间 y-m-d 离现在时间远 # self.start_time = dt # # self.start_time = '2019-09-09 00:01' # # 定义结束时间 y-m-d 离现在时间近 # self.end_time = end_dt # # self.end_time = '2019-09-16 12:57' a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: # 对于凌晨 0 点的判断 # 时间判断部分 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] else: # 时间判断部分 date = datetime.now() - timedelta(days=0) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=0) # 昨天时间 yesterday = str(yesterday).split(' ')[0] # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time + ' 0:00' # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = yesterday + ' 23:59' # 标记爬虫工作 self.is_work = True self.file_name_time = self.get_file_name_time() self.file_path = file_path self.comment_apth = comment_path self.hdfsclient = HdfsClient(url='http://*****:*****@class="articleList"]/li') for li in li_list: title = li.xpath('.//a/text()')[0] news_url = li.xpath('.//a/@href')[0] try: self.get_news_info_page(news_url, '', '') except: try: self.get_news_info_page(news_url, '', '') except: logger.error(traceback.format_exc()) # 获取新闻详情页 def get_news_info_page(self, news_url, comment_count, page_list): logger.log(31, '文章url: ' + news_url) item = {} response = requests.get(news_url, headers=self.headers_one) status_code = response.status_code if status_code == 200: try: data = response.content.decode('gbk') except (UnicodeDecodeError, ): data = response.content.decode('utf-8') data = etree.HTML(data) news_id = news_url.split('/')[-1].split('.')[0] try: title = data.xpath('.//div[@id="epContentLeft"]/h1/text()')[0] except: title = data.xpath('.//h1/text()')[0] try: date_all = data.xpath( './/div[@class="post_time_source"]/text()')[0] date_all = re.findall('\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', date_all)[0] except: date_all = data.xpath( './/div[@class="headline"]/span/text()')[0] # 获取评论数 try: comment_response = requests.get('http://comment.tie.163.com/' + str(news_id) + '.html', headers=self.headers_one) # print('http://comment.tie.163.com/' + str(news_id) + '.html') # comment_data = comment_response.content.decode() count = re.search('"tcount":\d{0,10}', comment_response.text).group(0) count = count.split(":")[1] comment_id = news_id except AttributeError: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36', 'Host': 'comment.tie.163.com', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 'Upgrade-Insecure-Requests': '1', 'Accept-Language': 'zh-CN,zh;q=0.9', } comment_id = re.search('docId" : "(.*)?",', response.text).group(1) # print(comment_id) # print('http://comment.tie.163.com/' + str(comment_id) + '.html') comment_response = requests.get('http://comment.tie.163.com/' + str(comment_id) + '.html', headers=headers) count = re.search('"tcount":\d{0,10}', comment_response.text).group(0) count = count.split(":")[1] except: # print(traceback.format_exc()) count = '' # 网站 item['platform'] = '网易新闻' # 日期date # 评论部分做时间判断部分--------------- get_news_time = time.mktime( time.strptime(str(date_all.split(' ')[0]).strip(), "%Y-%m-%d")) # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d %H:%M")) if self.start_time != '': start_time = time.mktime( time.strptime(self.start_time.split(' ')[0], "%Y-%m-%d")) else: start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d")) if float(get_news_time) < float(start_time): print('时间不符合') elif float(start_time) <= float(get_news_time): date = date_all.strip().split(' ')[0] item['date'] = date news_time = date_all.strip().split(' ')[1] item['time'] = news_time item['title'] = title # 来源 try: source = data.xpath( './/div[@class="post_time_source"]/a/text()')[0] item['article_source'] = source # 文章来源 except: item['article_source'] = '' try: item['article_author'] = data.xpath( './/span[@class="ep-editor"]/text()')[0] except: item['article_author'] = '' # 正文内容 content = data.xpath( './/div[@id="endText"]/p/text() | .//div[@id="endText"]/p/a/text() |.//div[@class="overview"]//p/text()' ) images_url = data.xpath('.//div[@id="endText"]//img/@src') content = ''.join(content) content = content.replace('\n', '') content = content.replace(' ', '') item['content'] = content item['keyword'] = '' item['views'] = '' item['comments_count'] = count item['likes'] = '' item['clicks'] = '' item['article_url'] = news_url # 文章详情URL item['dislikes'] = '' # 踩人数 item['series_url'] = '' # 车系首页 item['list_url'] = page_list # 文章列表URL if 'buy' in page_list: news_type = '购车' elif 'nauto' in page_list: news_type = '新车' elif 'drive' in page_list: news_type = '试驾' elif 'buyers_guides' in page_list: news_type = '导购' elif 'auto_newenergy' in page_list: news_type = '新能源' elif 'news' in page_list: news_type = '行业' else: news_type = '' item['article_type_1st'] = news_type # 文章类型 item['article_type_2nd'] = '' # 文章类型 item['insert_time'] = str( datetime.now()).split('.')[0] # 初始爬取时间 item['update_time'] = str( datetime.now()).split('.')[0] # 最后爬取时间 content_id = news_url.split('/')[-1].split('.')[0] item['content_id'] = content_id item['topic_id'] = str(content_id) # 主贴id item['author_id'] = '' # 作者id item['content_id'] = str(content_id) item['file_code'] = '15' item['reposts_count'] = '' item['imageurl'] = images_url item['audiourl'] = [] # print(item) self.__write_news_jsonfile(item) # 调用爬取评论的函数 # http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/E0IBEEA10008856S/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset=0&callback=jsonp_1542355418897&_=1542355418898 self.is_get_comment = True self.comment_page_num = 30 self.get_comment_info( self.comment_port_url.format(comment_id, "0"), news_id, date, news_time, title, news_url) # with 
open('./../wangyi/json_file/{}/{}_news_id.json'.format(self.file_name_time.split(' ')[0],self.file_name_time.split(' ')[0]),'a') as f: # com_item = {} # com_item['url'] = self.comment_port_url.format(comment_id, "0") # com_item['news_id'] = news_id # com_item['date'] = date # com_item['news_time'] = news_time # com_item['title'] = title # com_item['news_url'] = news_url # f.write(str(com_item) + '\n') # 获取评论信息 def get_comment_info(self, url, news_id, source_date, source_time, source_title, source_url): # time.sleep(1) s = requests.session() s.keep_alive = False respnse = requests.get(url, headers=self.headers_one) status_code = respnse.status_code if status_code == 200: data = respnse.content.decode() try: data = re.findall(r'{"commentIds.*newListSize":\d{0,10}}', data)[0] data = json.loads(data) comment_data = data['comments'] comment_id = data['commentIds'] if comment_id: total_item = '' for comment_info in comment_data.items(): # print(comment_info) item = {} comment_info = comment_info[1] # 网站 item['platform'] = '网易新闻' # 日期时间 date_all = comment_info['createTime'] get_date = date_all[:-3] # 评论部分做时间判断部分--------------- logger.log(31, date_all) logger.log(31, get_date) get_news_time = time.mktime( time.strptime(str(get_date), "%Y-%m-%d %H:%M")) end_time = time.mktime( time.strptime(self.end_time, "%Y-%m-%d %H:%M")) if self.start_time != '': start_time = time.mktime( time.strptime(self.start_time, "%Y-%m-%d %H:%M")) else: start_time = time.mktime( time.strptime('2010-1-1', "%Y-%m-%d %H:%M")) if float(get_news_time) < float(start_time): self.is_get_comment = False # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取, break elif float(start_time) <= float( get_news_time) <= float(end_time): item['date'] = get_date comment_time = date_all.split(' ')[1] item['time'] = comment_time # 发帖作者 try: author = comment_info['user']['nickname'] except KeyError: author = comment_info['user']['location'] + '网友' item['author'] = author item['author_id'] = comment_info['user'][ 'userId'] # 用户id # 内容 content = comment_info['content'] item['content'] = content # 点赞数 item['likes'] = comment_info['vote'] # 原文发布日期时间 item['source_date'] = source_date item['source_time'] = source_time # 原文标题 item['title'] = source_title # 原文url item['source_url'] = source_url item['keyword'] = '' item['floor'] = '' item[ 'comment_url'] = 'http://comment.tie.163.com/' + str( news_id) + '.html' item['comments_count'] = '' item['views'] = '' item['dislikes'] = comment_info['against'] # 踩人数 item['insert_time'] = str( datetime.now()).split('.')[0] # 初始爬取时间 item['update_time'] = str( datetime.now()).split('.')[0] # 最后爬取时间 item['content_id'] = comment_info['commentId'] content_id = source_url.split('/')[-1].split( '.')[0] item['topic_id'] = str(content_id) # 主贴id item['content_id'] = comment_info[ 'commentId'] # 主贴id item['file_code'] = '29' item['reposts_count'] = '' item = json.dumps(dict(item), ensure_ascii=False) + '\n' total_item += item # print(item) self.__write_comment_jsonfile(total_item) if self.is_get_comment: self.comment_page_num += 30 # print(self.comment_page_num, '111111111111111111111111') self.get_comment_info( self.comment_port_url.format( news_id, str(self.comment_page_num)), news_id, source_date, source_time, source_title, source_url) else: logger.log(31, '评论爬取完毕') self.comment_page_num = 30 except: logger.error(traceback.format_exc()) def get_file_name_time(self): a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: num = 24 a = str(datetime.now() - timedelta(days=1)) # 
昨天时间 num = a.split(' ')[0] + ' ' + str(num) return num # 写入json文件 def __write_news_jsonfile(self, item): item = json.dumps(dict(item), ensure_ascii=False) + '\n' # with open('./../wangyi/json_file/{}/{}_wangyi_news.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f: # f.write(item.encode("utf-8")) self.hdfsclient.new_write('{}/{}/{}/15_{}_{}_wangyi_news.json'.format( self.file_path, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name, str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') def __write_comment_jsonfile(self, item): # with open('./../wangyi/json_file/{}/{}_wangyi_news_comment.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f: # f.write(item.encode("utf-8")) self.hdfsclient.new_write( '{}/{}/{}/29_{}_{}_wangyi_news_comment.json'.format( self.comment_apth, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name, str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item, encoding='utf-8') def run(self): # self.get_list_page('http://sports.163.com/special/0005rt/news_json.js?0.4744335570460496') # self.get_list_page_two( 'http://sports.163.com/special/0005rt/sportsgd.html') for i in range(2, 5): if len(str(i)) == 1: i = '0' + str(i) self.get_list_page_two( 'http://sports.163.com/special/0005rt/sportsgd_{}.html'.format( str(i)))
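# --- Hedged sketch (helper not present in the original code) ------------------
# Both the article and comment branches above convert '%Y-%m-%d %H:%M' strings to
# epoch seconds via time.mktime(time.strptime(...)) and compare them against
# self.start_time / self.end_time. Extracted into one function it looks roughly
# like this, assuming both bounds are set (the class also falls back to
# '2010-1-1' when the start time is empty).
import time

def in_time_window(stamp, start_time, end_time, fmt='%Y-%m-%d %H:%M'):
    """True if stamp lies inside [start_time, end_time]; all three are strings."""
    value = time.mktime(time.strptime(stamp, fmt))
    return (time.mktime(time.strptime(start_time, fmt))
            <= value <=
            time.mktime(time.strptime(end_time, fmt)))

# Usage sketch: comments arrive newest-first, so the first comment outside the
# window lets the crawler stop paging, mirroring the is_get_comment flag above.
#   if not in_time_window(get_date, self.start_time, self.end_time):
#       self.is_get_comment = False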
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, redis_example): # 时间部分 # 爬虫开始抓取的日期 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] # 爬虫结束的抓取日期 current_time = datetime.now() # 当前日期 current_day = str(current_time).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, current_day)) logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = current_day # 标记爬虫工作 self.is_break = False self.redis_example = redis_example self.pid = os.getpid() # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/ecommerce/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\n', '', ret1) ret3 = re.sub(r'\u3000', '', ret2) ret4 = re.sub(r'品牌:', '', ret3) ret5 = re.sub(r'\xa0', '', ret4) ret6 = re.sub(r'→_→', '', ret5) ret7 = re.sub(r'……', '', ret6) ret8 = re.sub(r'":', '', ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 过滤url里面的#detail def re_detail(self, data): try: message = str(data) ret1 = re.sub(r'#detail', '', message) return ret1 except: pass # 过滤品牌 def re_pin_pai(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'<li title.*?>', '', ret1) ret3 = re.sub(r'品牌: ', '', ret2) return ret3 except: pass # 解析请求得到的商品信息 def parse_goods_url(self, data): goods_dict = dict() goods_dict['平台'] = '淘宝' goods_dict['URL'] = data['URL'] goods_dict['商品名'] = data['商品名'] try: goods_dict['品牌'] = data['品牌'] except: goods_dict['品牌'] = '' goods_dict['价格'] = data['价格'] goods_dict['shop_name'] = data['shop_name'] goods_dict['关键词'] = data['关键词'] goods_dict['itemId'] = data['itemId'] goods_dict['sellerId'] = data['sellerId'] goods_dict['imageurl'] = data['商品图片'] goods_dict['audiourl'] = '' # logger.log(31, '************************正在抓取的商品是:%s................' 
% goods_dict) self.goods_data(goods_dict) # 抓取商品详情 def goods_data(self, goods_dict): try: id = goods_dict['itemId'] url = 'https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?jsv=2.5.7&appKey=12574478&sign=fdd00ab68c3566e514d61534041592d7&api=mtop.taobao.detail.getdetail&v=6.0&isSec=0&ecode=0&AntiFlood=true&AntiCreep=true&H5Request=true&ttid=2018%40taobao_h5_9.9.9&type=jsonp&dataType=jsonp&data=%7B%22spm%22%3A%22a230r.1.14.48.6c1d4af9UmOqpx%22%2C%22id%22%3A%22{}%22%2C%22ns%22%3A%221%22%2C%22abbucket%22%3A%226%22%2C%22itemNumId%22%3A%22{}%22%2C%22itemId%22%3A%22{}%22%2C%22exParams%22%3A%22%7B%5C%22spm%5C%22%3A%5C%22a230r.1.14.48.6c1d4af9UmOqpx%5C%22%2C%5C%22id%5C%22%3A%5C%22{}%5C%22%2C%5C%22ns%5C%22%3A%5C%221%5C%22%2C%5C%22abbucket%5C%22%3A%5C%226%5C%22%7D%22%2C%22detail_v%22%3A%228.0.0%22%2C%22utdid%22%3A%221%22%7D'.format( id, id, id, id) headers = {'User-Agent': random.choice(user_agent_list)} try: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) json_data = json.loads( json.loads(response.text, strict=False)['data']['apiStack'][0] ['value'])['item'] # 通过是否有apiStack判断商品是否下架 commentCount = json.loads( response.text)['data']['item']['commentCount'] favcount = json.loads(response.text)['data']['item']['favcount'] SellCount = json_data['vagueSellCount'] goods_dict['人气数'] = favcount goods_dict['评价人数'] = commentCount goods_dict['月销量'] = SellCount if int(self.re_not_number(goods_dict['评价人数'])) == 0: # logger.log(31, '-----------该商品没有评论数据--------------') pass else: pages_num = int(math.ceil(float(int(goods_dict['评价人数']) / 20))) response.close() self.goods_comments(goods_dict, pages_num) except: print(5555555555555555555555, traceback.format_exc()) # 解析商品评论 def goods_comments(self, goods_dict, pages_num): try: is_break = self.is_break # print(goods_dict) itemId = goods_dict['itemId'] sellerId = goods_dict['sellerId'] headers = { 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': 'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1', 'User-Agent': random.choice(user_agent_list) } if int(pages_num) >= 99: pages = 99 else: pages = pages_num # print('----------------商品评论总页数是: %s -----------------------' % pages_num) # 抓取商品评论链接(总共99页,从1开始) for i in 
range(1, int(pages) + 1): comment_url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&userNumId={}¤tPageNum={}&pageSize=20&rateType=&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0'.format( itemId, sellerId, i) # print(comment_url) # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10) try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) comment_data = response.text # print('开始抓取评论') # print(comment_data) comment = re.search( r'{"qnaDisabled":true,"watershed":.*"maxPage":.*}', comment_data) # print(comment.group()) items = json.loads(comment.group())['comments'] # print(items) goods_data = dict() # logger.log(31, '--------********开始写入商品数据********--------') for item in items: # if item['date'] != None: # time_test = item['date'].split(' ')[0].replace('年', '-').replace('月', '-').replace('日', '') + ' ' + item['date'].split(' ')[1] + ':00' date_data = item['date'].split(' ')[0].replace( '年', '-').replace('月', '-').replace('日', '') try: time_data = item['date'].split(' ')[1] + ':00' except: time_data = '' # print('评论时间', date_data, time_data) try: content = item['content'] except: content = '' # 追加评论 try: comments_2 = item['appendList'][0]['content'] except: comments_2 = '' # 判断评论时间是否在规定的抓取时间内 if self.start_time <= date_data.strip(): goods_data['platform'] = goods_dict['平台'] goods_data['date'] = date_data.strip() goods_data['time'] = time_data.strip() goods_data['keyword'] = goods_dict['关键词'] goods_data['name'] = goods_dict['商品名'] goods_data['url'] = goods_dict['URL'] goods_data['shop_name'] = goods_dict['shop_name'] goods_data['user_name'] = item['user']['nick'] goods_data['content'] = content + ';' + comments_2 goods_data['content_id'] = str(item['rateId']) goods_data['brand'] = goods_dict['品牌'] goods_data['price'] = goods_dict['价格'] goods_data['sales'] = goods_dict['月销量'] goods_data['focus_count'] = goods_dict['人气数'] goods_data['comment_num'] = goods_dict['评价人数'] goods_data['views'] = '' goods_data['likes'] = item['useful'] goods_data['comments_count'] = '' goods_data['author_id'] = '' goods_data['reposts_count'] = '' goods_data['topic_id'] = str(goods_dict['itemId']) # 判断size和type 颜色分类:黑色高帮  尺码:37 test_data = item['auction']['sku'] if '码' in test_data: goods_data['type'] = test_data.split( ':')[1].replace('尺码', '').replace( '  ', '').replace('鞋码', '').replace(';尺码', '') goods_data['size'] = test_data.split(':')[2] else: goods_data['type'] = '' goods_data['size'] = '' goods_data['imageurl'] = goods_dict['imageurl'] goods_data['audiourl'] = goods_dict['audiourl'] goods_data['file_code'] = '55' # print(goods_data) # items = json.dumps(dict(goods_data), ensure_ascii=False) + '\n' # with open('./bu_cai/55_{}_taobao_nike_{}_1.json'.format(time.strftime('%Y%m%d'), self.pid), 'ab') as f: # f.write(items.encode("utf-8")) item = json.dumps(dict(goods_data), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/ecommerce/{}/55_{}_TaoBao_nike{}.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.pid), item, encoding='utf-8') if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(66666666666666666666, 
traceback.format_exc())

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            try:
                redis_url_num = self.redis_example.llen('taobao_day_url')
                if str(redis_url_num) == '0':
                    print('*******Redis消息队列中url为空,程序等待中...进程 {} 等待中...*******'.format(str(os.getpid())))
                item = self.redis_example.brpop('taobao_day_url', timeout=600)
            finally:
                lock.release()
            if item is None:
                # brpop returns None once the 600 s timeout expires; indexing [1]
                # directly (as before) would raise TypeError, so skip instead.
                continue
            item1 = json.loads(item[1].decode())
            # print(item)
            self.parse_goods_url(item1)
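# --- Hedged sketch (not part of the original code) -----------------------------
# goods_comments above walks the Taobao feedRateList pages one by one. The real
# query parameter is '&currentPageNum=': it shows up as '¤tPageNum=' in the
# listing above only because '&curren' was HTML-entity-decoded into the currency
# sign; the same artifact affects the Tmall '&currentPage=' URL further down.
def taobao_comment_urls(item_id, seller_id, pages):
    template = ('https://rate.taobao.com/feedRateList.htm'
                '?auctionNumId={}&userNumId={}&currentPageNum={}&pageSize=20'
                '&rateType=&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0')
    # The crawler caps itself at 99 pages, mirroring the check in goods_comments.
    for page in range(1, min(int(pages), 99) + 1):
        yield template.format(item_id, seller_id, page)

# Usage sketch:
#   for comment_url in taobao_comment_urls(goods_dict['itemId'],
#                                          goods_dict['sellerId'], pages_num):
#       ...fetch and parse as above...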
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self): # 时间部分,按小时抓取 date_time = str(datetime.now() - timedelta(days=1)).split('.')[0] start_time_test = time.strftime('%Y-%m-%d 00:00:00') end_time = time.strftime('%Y-%m-%d %H:%M:%S') a = end_time.split(' ')[1].split(':')[0] if a == '00': start_time_data = date_time hours_name = '22_24' wen_jian_jia_date = str(datetime.now() - timedelta(days=1)).split('.')[0].split(' ')[0].replace('-', '') else: two_hours_ago = int(a) - 2 if len(str(two_hours_ago)) == 1: two_hour_ago = '0' + str(two_hours_ago) else: two_hour_ago = str(two_hours_ago) hours_name = str(two_hour_ago) + '_' + str(a) start_time_data = start_time_test wen_jian_jia_date = time.strftime('%Y%m%d') print('爬取时间段:{}到{}'.format(start_time_data, end_time)) logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = start_time_data # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = end_time self.pid = os.getpid() self.h2_name = hours_name self.date_time = wen_jian_jia_date # 标记爬虫工作 self.is_break = False self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs('/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(wen_jian_jia_date, hours_name)) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub('<a.*></a>', '', ret1) return ret2 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass def parse_url(self, data): # 创建一个字典接收数据 goods_dict = dict() goods_dict['平台'] = data['平台'] goods_dict['关键词'] = data['关键词'] goods_dict['商品名'] = data['商品名'] goods_dict['商品图片'] = data['商品图片'] goods_dict['URL'] = data['URL'] goods_dict['shop_name'] = data['shop_name'] goods_dict['goods_id'] = data['goods_id'] goods_dict['品牌'] = data['品牌'] goods_dict['月销量'] = data['月销量'] # logger.log(31, '--------********正在抓取的商品是:%s********--------' % goods_dict) self.parse_goods_price(goods_dict) # 解析商品价格信息 def parse_goods_price(self, goods_dict): try: goods_url = 'https://p.3.cn/prices/mgets?callback=jQuery6465631&source=jshop&skuids=J_{}'.format(goods_dict['goods_id']) headers = { 'content-type': 'application/json;charset=utf-8', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'cache-control': 'no-cache', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } try: time.sleep(0.2) response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) price_data = re.search(r'\[{".*"}]', response.text) goods_dict['价格'] = json.loads(price_data.group())[0]['p'] # print(goods_dict) self.parse_comment_num(goods_dict) except: print(22222222222222222222222, traceback.format_exc()) # 抓取商品评论数 def parse_comment_num(self, goods_dict): try: productId = goods_dict['goods_id'] referer_url = 
goods_dict['URL'] comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(productId) headers = { 'content-type': 'text/html;charset=GBK', 'authority': 'sclub.jd.com', 'method': 'GET', # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1', 'referer': referer_url, 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) comment_data = re.search(r'{"productAttr":null.*]}', response.text) comment_number = json.loads(comment_data.group())['productCommentSummary']['commentCount'] goods_dict['comment_num'] = json.loads(comment_data.group())['productCommentSummary']['commentCountStr'] # print(comment_number) if int(comment_number) == 0: logger.log(31, '************该商品没有评论数据*********') else: pages = int(math.ceil(float(int(comment_number) / 10))) self.goods_comments(goods_dict, pages) except: print(33333333333333333333333, traceback.format_exc()) # 解析商品评论 def goods_comments(self, goods_dict, pages): try: is_break = self.is_break # print(goods_dict) productId = goods_dict['goods_id'] referer_url = goods_dict['URL'] headers = { 'content-type': 'text/html;charset=GBK', 'authority': 'sclub.jd.com', 'method': 'GET', # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; 
_tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1', 'referer': referer_url, 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } if int(pages) >= 50: pages_num = 50 else: pages_num = pages # 抓取商品评论链接(总共50页,第一页从0开始) for i in range(0, int(pages_num)): comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(productId, i) # print(comment_url) try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) # print("正在抓取的页面是: %s" % comment_url) comments = response.text # print(comments) # {"productAttr":null,.*]} comment = re.search(r'{"productAttr":null.*]}', comments) # print(comment.group()) items = json.loads(comment.group())['comments'] if int(len(items)) == 0: break else: goods_comment_dict = dict() for item in items: date_data = item['creationTime'].split(' ', 1)[0] time_data = item['creationTime'].split(' ', 1)[1] # print(date, time) try: content = self.re_html(item['content']) except: content = '' # 追加评论 try: comments_2 = item['afterUserComment']['content'] except: comments_2 = '' # 判断评论时间是否在规定的抓取时间内 if self.start_time <= item['creationTime']: goods_comment_dict['platform'] = goods_dict['平台'] goods_comment_dict['date'] = date_data.strip() goods_comment_dict['time'] = time_data.strip() goods_comment_dict['keyword'] = goods_dict['关键词'] goods_comment_dict['name'] = goods_dict['商品名'] goods_comment_dict['imageurl'] = goods_dict['商品图片'] goods_comment_dict['audiourl'] = '' goods_comment_dict['url'] = goods_dict['URL'] goods_comment_dict['shop_name'] = goods_dict['shop_name'] goods_comment_dict['user_name'] = item['nickname'] goods_comment_dict['author_id'] = '' goods_comment_dict['content'] = content + ';' + comments_2 goods_comment_dict['content_id'] = str(item['id']) goods_comment_dict['brand'] = goods_dict['品牌'] goods_comment_dict['price'] = goods_dict['价格'] goods_comment_dict['sales'] = goods_dict['月销量'] goods_comment_dict['focus_count'] = '' 
goods_comment_dict['comment_num'] = goods_dict['comment_num'] goods_comment_dict['views'] = '' goods_comment_dict['likes'] = item['usefulVoteCount'] try: goods_comment_dict['comments_count'] = item['replyCount'] except: goods_comment_dict['comments_count'] = '' goods_comment_dict['reposts_count'] = '' goods_comment_dict['topic_id'] = str(goods_dict['goods_id']) try: goods_comment_dict['type'] = item['productColor'] except: goods_comment_dict['type'] = '' try: goods_comment_dict['size'] = item['productSize'] except: goods_comment_dict['size'] = '' goods_comment_dict['file_code'] = '51' # print('********--------开始写入商品数据--------********') # print(goods_comment_dict) item = json.dumps(dict(goods_comment_dict), ensure_ascii=False) + '\n' self.hdfsclient.new_write('/user/cspider_daily/nike_2h/ecommerce/{}/{}/51_{}_jingdong_nike{}.json'.format(self.date_time, self.h2_name, time.strftime('%Y%m%d'), self.pid), item, encoding='utf-8') if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(444444444444444444444444444, traceback.format_exc()) def run(self, data): self.parse_url(data)
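# --- Hedged sketch (refactor suggestion, not in the original code) -------------
# Every HTTP call in these spiders is wrapped in three nested try/except blocks
# that sleep briefly and retry with identical arguments. A small loop expresses
# the same behaviour; get_with_retry is a name introduced here for illustration.
import time
import requests

def get_with_retry(url, headers, proxies=None, retries=3, delay=0.2, timeout=30):
    last_error = None
    for _ in range(retries):
        try:
            time.sleep(delay)
            return requests.get(url=url, headers=headers, proxies=proxies,
                                allow_redirects=False, timeout=timeout)
        except Exception as error:  # mirrors the bare excepts used above
            last_error = error
    raise last_error

# Usage sketch:
#   response = get_with_retry(comment_url, headers, proxies=proxies)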
class TouTiaoSpider(object): """ 今日头条的爬虫,主要采集和汽车有关的新闻 """ def __init__(self): # 'cookie':'uuid="w:d0214807f672416fb7d3ee0431aa13a3"; UM_distinctid=1674ef3a9800-0bce565d4c8dc4-414f0120-15f900-1674ef3a981290; _ga=GA1.2.823209007.1543222670; _gid=GA1.2.547615301.1543222670; CNZZDATA1259612802=603836554-1543213069-%7C1543218469; __tasessionId=tpisw88851543281460530; csrftoken=d9a6dad7de6c1fbbf3ddd1a3de811481; tt_webid=6628070185327625741', # ':authority':'www.toutiao.com', # ':method':'GET', # ':path':'/api/pc/feed/?category=news_car&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1E56B7F8CD9B35&cp=5BFC39BB43B5DE1&_signature=pMmtcAAA.0TvpJ9rFvhWIKTJrW', # ':scheme':'https', # 'cache-control': 'max-age=0', # 'cookie': 'tt_webid=6628733243796178436; tt_webid=6628733243796178436; csrftoken=3a6f2dc0f315bd1fe957319a75bba4ed; uuid="w:2203d39caf3249c0bcda19ee5839b850"; UM_distinctid=1675827673a27a-0dd556679b3f63-3a3a5d0c-15f900-1675827673b22c; __tasessionId=qb2c0x9mb1543386267822; CNZZDATA1259612802=992935523-1543369669-%7C1543385869', # 'referer': 'https://www.toutiao.com/ch/news_car/', self.headers_one = { 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'cache-control': 'max-age=0', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36' } self.start_url = 'https://www.toutiao.com/api/pc/feed/' # 评论接口模板 self.commnet_port_url = '' date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] yesterday = datetime.now() - timedelta(days=1) # 昨天时间 yesterday = str(yesterday).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, yesterday)) logging.info('爬取时间段:{}到{}'.format(news_start_time, yesterday)) # 定义开始时间 y-m-d 离现在时间远 self.start_time = news_start_time # self.start_time = '2019-09-09' # 定义结束时间 y-m-d 离现在时间近 self.end_time = yesterday # self.end_time = '2019-09-16' print('爬取时间段:{}到{}'.format(self.start_time, self.end_time)) # 标记爬虫工作 self.is_work = True # 评论页数 self.comment_page_num = 1 # 去重列表 self.set_list = [] # 代理ip self.proxies = [ '112.245.235.249:4243', # '59.53.47.4:4249' ] # 搜集问答类网页的列表 self.questions_list = [] # 读取url列表 # with open('./../toutiao/new_url_file.json', 'r') as f: # self.url_list = f.readlines() self.file_name_time = self.get_file_name_time() try: os.mkdir('./../toutiao/json_file/{}'.format(self.file_name_time.split(' ')[0])) except: pass self.file_path = '/user/cspider_daily/nike_daily/article' self.comment_apth = '/user/cspider_daily/nike_daily/articlecomments' self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') hour = str(datetime.now()).split(' ')[-1].split(':')[0] if str(hour) != '00': two_hour_ago = int(hour) - 2 if len(str(two_hour_ago)) == 1: two_hour_ago = '0' + str(two_hour_ago) self.hour_name = str(two_hour_ago) + '_' + str(hour) else: self.hour_name = '22_24' self.hdfsclient.makedirs('{}/{}'.format(self.file_path, self.file_name_time.split(' ')[0].replace('-', ''))) # 创建每日文件夹 self.hdfsclient.makedirs('{}/{}'.format(self.comment_apth, self.file_name_time.split(' ')[0].replace('-', ''))) # 创建每日文件夹 self.time_time = str(time.time()).split('.')[0] # 代理服务器 proxyHost = "http-dyn.abuyun.com" proxyPort = "9020" # 代理隧道验证信息 # proxyUser = "******" # proxyUser = "******" proxyUser = "******" # proxyPass = "******" # proxyPass = "******" proxyPass = "******" 
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": proxyHost, "port": proxyPort, "user": proxyUser, "pass": proxyPass, } self.proxies = { # "http": proxyMeta, "https": proxyMeta } def get_news_page(self, url): user_agent = [ 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6', 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1', ] headers_one = { 'accept': 'textml,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'cookie': '__tasessionId=0k1ayrc511577344635809', 'sec-fetch-user': '******', 'cache-control': 'max-age=0', 'upgrade-insecure-requests': '1', 'user-agent': '{}'.format(random.choice(user_agent)) } item = {} response = requests.get(url, headers=headers_one, proxies=self.proxies, timeout=60) #, proxies={'https': ip} stutus_code = response.status_code if str(stutus_code) == '200': data_all = response.content.decode() try: data = re.search(r"articleInfo: {([\s\S]*time: '\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2})", data_all).group(1) data = '{' + data + "'}}" data = re.sub('\n', '', data) data = unescape(data) data = data.replace('"', '"').replace('=', '=') content = re.search('content: ([\s\S]*)groupId', data).group(1).strip()[1:][:-2] content = etree.HTML(content) text = content.xpath('.//p//text()') text_con = ''.join(text) # print(text_con) text_con = re.sub(r'class=.*?class', '', text_con) # print(text_con) text_con = re.sub(r'\\u003C.*?\\u003E', '', text_con).replace('.slice(6, -6', '') # print(text_con) date, create_time = re.search('(\d{4}-\d{1,2}-\d{1,2}) (\d{1,2}:\d{1,2}:\d{1,2})', data).group(1, 2) id_num = re.search("groupId: '(\d{1,50}).*itemId", data).group(1) # 新闻的标识id source = re.search("source: '(.*)time", data).group(1).strip()[:-2] # 来源 comment_count = re.search("commentCount: '(\d{0,10})[\s\S]*ban_comment", data_all).group(1) title = re.search("title: '([\s\S])*content", data).group(0).split("'")[1] item['platform'] = '今日头条' item['date'] = date item['time'] = create_time item['title'] = title.replace('"', '') item['article_source'] = '' # 文章来源 item['article_author'] = source # 文章作者 item['content'] = text_con item['comments_count'] = comment_count item['clicks'] = '' item['views'] = '' item['likes'] = '' item['keyword'] = '' item['article_url'] = url # 文章详情URL item['dislikes'] = '' # 踩人数 item['series_url'] = '' # 车系首页 item['list_url'] = 'https://www.toutiao.com/ch/news_car/' # 文章列表URL item['article_type_1st'] = '' # 文章类型 item['article_type_2nd'] = '' # 文章类型 item['insert_time'] = str(datetime.now()).split('.')[0] # 初始爬取时间 item['update_time'] = str(datetime.now()).split('.')[0] # 最后爬取时间 content_id = url.split('/')[-1] item['content_id'] = str(content_id) # 文章id item['topic_id'] = str(content_id) # 主贴id item['author_id'] = '' # 作者id item['file_code'] = '24' item['reposts_count'] = '' print(item) self.write_news_jsonfile(item) self.get_comment_info(url, title, date, create_time) except AttributeError: print('问答类网页', url) self.questions_list.append(url) print(self.questions_list) else: print('网页404错误', url) # 
获取评论 # http://lf.snssdk.com/article/v1/tab_comments/?count=50&item_id=6629460454148145678&group_id=6629460454148145678&offset=0 def get_comment_info(self, source_url, source_title, source_date, source_time, page_id="0"): user_agent = [ 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6', 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1', ] headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'max-age=0', # 'Connection': 'keep-alive', 'Host': 'lf.snssdk.com', 'Upgrade-Insecure-Requests': '1', 'User-Agent': '{}'.format(random.choice(user_agent)) } headers_two = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', # 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 'Cache-Control': 'no-cache', 'Cookie': 'csrftoken=4be00616a67933bdef5696b162e70937; tt_webid=6762029080495375880', # 'Connection': 'keep-alive', # 'Host': 'lf.snssdk.com', # 'Upgrade-Insecure-Requests': '1', 'Connection': 'keep-alive', 'User-Agent': 'PostmanRuntime/7.20.1' } url_id = source_url.split('/')[-1][1:] news_comment_url = 'https://www.toutiao.com/article/v2/tab_comments/?aid=24&app_name=toutiao-web&group_id={}&item_id={}&offset={}&count=50'.format(url_id, url_id, page_id) comment_url = 'http://lf.snssdk.com/article/v1/tab_comments/?count=50&item_id={}&group_id={}&offset={}'.format(url_id, url_id, page_id) print('评论爬取中......') print(comment_url) # ip = random.choice(self.proxies) try: response = requests.get(news_comment_url, headers=headers_two, proxies=self.proxies) # , proxies={'https': ip} datas = json.loads(response.content) print(datas) data_list = datas['data'] if data_list: total_item = '' for comment in data_list: print(1111111111111, comment) item = dict() item['platform'] = '今日头条' item['source_date'] = source_date item['source_time'] = source_time content = comment['comment']['text'] date_all = comment['comment']['create_time'] # #转换成localtime time_local = time.localtime(float(str(date_all))) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local) date = dt.split(' ')[0] comment_time = dt.split(' ')[1] item['date'] = date item['time'] = comment_time item['title'] = source_title author = comment['comment']['user_name'] item['author'] = author item['content'] = content item['source_url'] = source_url item['comment_url'] = source_url item['floor'] = '' item['views'] = '' item['comments_count'] = comment['comment']['reply_count'] item['keyword'] = '' item['likes'] = comment['comment']['digg_count'] item['author_id'] = comment['comment']['user_id'] # 用户id item['dislikes'] = '' # 踩人数 item['insert_time'] = str(datetime.now()).split('.')[0] # 初始爬取时间 item['update_time'] = str(datetime.now()).split('.')[0] # 最后爬取时间 item['content_id'] = comment['comment']['id'] content_id = source_url.split('/')[-1] item['topic_id'] = str(content_id) # 主贴id item['file_code'] = '38' item['reposts_count'] = '' item = json.dumps(dict(item), ensure_ascii=False) + '\n' total_item += item print('写入评论中......') 
self.write_comment_jsonfile(total_item) if len(data_list) == 50: page_id = int(page_id) + 50 print('爬取评论翻页信息.....') time.sleep(2) self.get_comment_info(source_url, source_title, source_date, source_time, page_id=str(page_id)) except requests.exceptions.ConnectionError: print('获取评论时发生链接错误,程序暂停100s后爬取') time.sleep(100) self.get_comment_info(source_url, source_title, source_date, source_time, page_id=str(page_id)) logging.error('获取评论时发生链接错误,程序暂停100s后爬取,get_comment error:{}'.format(traceback.format_exc())) # 写入json文件 def write_news_jsonfile(self, item): item = json.dumps(dict(item), ensure_ascii=False) + '\n' self.hdfsclient.new_write('{}/{}/24_{}_{}_toutiao_news.json'.format(self.file_path, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item,encoding='utf-8') def write_comment_jsonfile(self, item): # item = json.dumps(dict(item), ensure_ascii=False) + '\n' self.hdfsclient.new_write('{}/{}/38_{}_{}_toutiao_comment.json'.format(self.comment_apth, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item,encoding='utf-8') def get_file_name_time(self): a = str(datetime.now()) hour = a.split(' ')[-1].split(':')[0] num = int(hour) / 3 num = int(num) * 3 if num == 0: num = 24 a = str(datetime.now() - timedelta(days=1)) # 昨天时间 num = a.split(' ')[0] + ' ' + str(num) return num def run(self): set_list = [] logger.info('开始读取url文件,进行新闻爬取') for url in open('./../toutiao/json_file/{}/{}_comment_url.json'.format(self.file_name_time.split(' ')[0], self.file_name_time.split(' ')[0])): # for url in open('./../toutiao/json_file/{}/{}_comment_url.json'.format('2020-03-23', '2020-03-23')): if url in set_list: continue else: set_list.append(url) if url: logger.info('打开new_url_file.json文件,读取要爬取的url') url = url.strip() print('一个爬虫正在爬取网址{}'.format(url)) logger.info('一个爬虫正在爬取网址{}'.format(url)) try: self.get_news_page(url) except: print(traceback.format_exc()) try: self.get_news_page(url) except: print('错误') print('一个网址爬虫结束.....') logger.info('爬取完毕......')
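# --- Hedged sketch (not part of the original code) -----------------------------
# get_comment_info above pages through the tab_comments endpoint by recursing
# with offset += 50. The same walk as an iterative generator, assuming the
# endpoint keeps answering with {'data': [...]} as it does above; the headers,
# cookies and proxies shown in the class are omitted here for brevity.
import json
import time
import requests

def iter_toutiao_comments(group_id, page_size=50, pause=2):
    offset = 0
    while True:
        url = ('https://www.toutiao.com/article/v2/tab_comments/'
               '?aid=24&app_name=toutiao-web&group_id={}&item_id={}&offset={}&count={}'
               .format(group_id, group_id, offset, page_size))
        data_list = json.loads(requests.get(url, timeout=60).content).get('data') or []
        for comment in data_list:
            yield comment          # each entry carries comment['comment'][...]
        if len(data_list) < page_size:
            break                  # a short page means the thread is exhausted
        offset += page_size
        time.sleep(pause)          # same politeness delay as the recursive version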
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self, redis_example): # 时间部分 # 爬虫开始抓取的日期 date = datetime.now() - timedelta(days=1) news_start_time = str(date).split(' ')[0] # 爬虫结束的抓取日期 current_time = datetime.now() # 当前日期 current_day = str(current_time).split(' ')[0] print('爬取时间段:{}到{}'.format(news_start_time, current_day)) logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = news_start_time # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = current_day # 标记爬虫工作 self.is_break = False self.redis_example = redis_example self.pid = os.getpid() # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_daily/ecommerce/{}'.format( time.strftime('%Y%m%d'))) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\n', '', ret1) ret3 = re.sub(r'\u3000', '', ret2) ret4 = re.sub(r'品牌:', '', ret3) ret5 = re.sub(r'\xa0', '', ret4) ret6 = re.sub(r'→_→', '', ret5) ret7 = re.sub(r'……', '', ret6) ret8 = re.sub(r'":', '', ret7) return ret8 except: pass # 过滤月销量里面的非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 过滤url里面的#detail def re_detail(self, data): try: message = str(data) ret1 = re.sub(r'#detail', '', message) return ret1 except: pass # 解析请求得到的商品信息 def parse_goods_url(self, data): goods_dict = dict() goods_dict['平台'] = '天猫' goods_dict['URL'] = data['URL'] goods_dict['商品名'] = data['商品名'] goods_dict['价格'] = data['价格'] goods_dict['shop_name'] = data['shop_name'] goods_dict['关键词'] = data['关键词'] goods_dict['品牌'] = data['品牌'] goods_dict['itemId'] = data['itemId'] goods_dict['sellerId'] = data['sellerId'] goods_dict['imageurl'] = data['商品图片'] goods_dict['audiourl'] = '' # logger.log(31, '***************************正在抓取的商品是:%s.................' 
% goods_dict) self.parse_goods_details(goods_dict) # 解析商品详情信息 def parse_goods_details(self, goods_dict): try: url = 'https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?jsv=2.4.8&appKey=12574478&sign=88ccc8f2a1382304046c12960f2711c9&api=mtop.taobao.detail.getdetail&v=6.0&dataType=jsonp&ttid=2017%40taobao_h5_6.6.0&AntiCreep=true&type=jsonp&data=%7B%22itemNumId%22%3A%22{}%22%7D'.format( goods_dict['itemId']) headers = {'User-Agent': random.choice(user_agent_list)} try: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, timeout=30) except: try: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, timeout=30) except: time.sleep(0.3) response = requests.get(url=url, headers=headers, proxies=proxies, timeout=30) if 'apiStack' not in response.text: # print('******该商品已下架******') pass else: json_data = json.loads( json.loads( response.text, strict=False)['data']['apiStack'][0]['value'])['item'] commentCount = json.loads( response.text)['data']['item']['commentCount'] # 评价数 favcount = json.loads( response.text)['data']['item']['favcount'] # 收藏数 sellCount = json_data['sellCount'] # 月销量 spuId = json_data['spuId'] goods_dict['spuid'] = spuId goods_dict['月销量'] = sellCount goods_dict['人气数'] = favcount goods_dict['评价人数'] = commentCount if int(self.re_not_number(goods_dict['评价人数'])) == 0: # logger.log(31, '---------该商品没有评价数据-------') response.close() pass else: pages_num = int( math.ceil(float(int(goods_dict['评价人数']) / 20))) response.close() self.goods_comments(goods_dict, pages_num) except: print(5555555555555555555555, traceback.format_exc()) # 解析商品评论 def goods_comments(self, goods_dict, pages_num): try: is_break = self.is_break # print(goods_dict) itemId = goods_dict['itemId'] sellerId = goods_dict['sellerId'] spuId = goods_dict['spuid'] headers = { 'cookie': 't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB', 'pragma': 'no-cache', 'upgrade-insecure-requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': 'https://detail.tmall.com/item.htm?spm=a230r.1.14.16.26804e4ck29eWS&id=597034992998&ns=1&abbucket=1', 'User-Agent': random.choice(user_agent_list) } if int(pages_num) >= 99: pages = 99 else: pages = pages_num # logger.log(31, '-------------评论总页数是:%s --------------' % pages) # 抓取商品评论链接(总共99页,从1开始) for i in range(1, int(pages) + 1): comment_url = 'https://rate.tmall.com/list_detail_rate.htm?itemId={}&spuId={}&sellerId={}&order=1¤tPage={}'.format( itemId, spuId, sellerId, i) # print(comment_url) # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10) try: time.sleep(0.2) response = requests.get(url=comment_url, 
headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: time.sleep(0.2) response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) comment_data = response.text # logger.log(31, '开始抓取评论') # print(comment_data) comment = re.search( r'{"rateDetail":{"rateCount":{"total":.*"tags":\[]}}', comment_data) # print(comment.group()) items = json.loads(comment.group())['rateDetail']['rateList'] # print(items) goods_data = dict() logger.log(31, '--------********开始写入商品数据********--------') for item in items: date_data = item['rateDate'].split(' ', 1)[0] time_data = item['rateDate'].split(' ', 1)[1] # print('评论时间', date_data, time_data) try: content = item['appendComment']['content'] except: content = '' # 判断评论时间是否在规定的抓取时间内 if self.start_time <= date_data.strip(): goods_data['platform'] = goods_dict['平台'] goods_data['date'] = date_data.strip() goods_data['time'] = time_data.strip() goods_data['keyword'] = goods_dict['关键词'] goods_data['name'] = goods_dict['商品名'] goods_data['url'] = goods_dict['URL'] goods_data['shop_name'] = goods_dict['shop_name'] goods_data['user_name'] = item['displayUserNick'] goods_data['content'] = self.re_html( item['rateContent']) + ';' + str( self.re_html(content)) goods_data['content_id'] = str(item['id']) goods_data['brand'] = goods_dict['品牌'] goods_data['price'] = goods_dict['价格'] goods_data['sales'] = goods_dict['月销量'] goods_data['focus_count'] = goods_dict['人气数'] goods_data['comment_num'] = goods_dict['评价人数'] goods_data['views'] = '' goods_data['likes'] = '' goods_data['comments_count'] = '' goods_data['author_id'] = '' goods_data['reposts_count'] = '' goods_data['topic_id'] = str(goods_dict['itemId']) # 判断size和type test_data = item['auctionSku'] if '分类' in test_data: goods_data['type'] = test_data.split( ':')[1].replace(';尺码:', '').replace(';鞋码', '').replace( ';尺码', '') try: goods_data['size'] = test_data.split( ':')[2].split(';')[0] except: try: goods_data['size'] = test_data.split( ':')[2] except: goods_data['size'] = '' else: goods_data['type'] = '' goods_data['size'] = '' goods_data['imageurl'] = goods_dict['imageurl'] goods_data['audiourl'] = goods_dict['audiourl'] goods_data['file_code'] = '50' # print(goods_data) # item = json.dumps(dict(goods_data), ensure_ascii=False) + '\n' # with open('./json_data/{}_tmall_goods_data_{}.json'.format(time.strftime('%Y%m%d'), self.pid), 'ab') as f: # f.write(item.encode("utf-8")) item = json.dumps(dict(goods_data), ensure_ascii=False) + '\n' self.hdfsclient.new_write( '/user/cspider_daily/nike_daily/ecommerce/{}/50_{}_{}_Tmall_nike{}.json' .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'), self.time_data, self.pid), item, encoding='utf-8') if date_data.strip() < self.start_time: is_break = True if is_break: break except: print(7777777777777777777, traceback.format_exc()) def run(self, lock): for num in range(1000000): lock.acquire() redis_url_num = self.redis_example.llen('Tmall_day_url') if str(redis_url_num) == '0': print('*******Redis消息队列中url为空,程序等待中...进程 {} 等待中....******'. format(str(os.getpid()))) item = self.redis_example.brpop('Tmall_day_url', timeout=600)[1] lock.release() item1 = json.loads(item.decode()) # print('正在抓取商品:', item1) self.parse_goods_url(item1)
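# --- Hedged sketch (not part of the original code) -----------------------------
# The Tmall rate-list request that goods_comments pages through, isolated into one
# function. The real query parameter is '&currentPage=' (it appears as '¤tPage='
# in the listing above due to the same '&curren' HTML-entity decoding artifact);
# the JSON payload is embedded in a JSONP-style response and is pulled out with
# the same regex used above. fetch_tmall_rate_page is a name introduced here only
# for illustration.
import json
import re
import requests

def fetch_tmall_rate_page(item_id, spu_id, seller_id, page, headers, proxies=None):
    """Return the rateList entries of one comment page, or [] when none are found."""
    url = ('https://rate.tmall.com/list_detail_rate.htm'
           '?itemId={}&spuId={}&sellerId={}&order=1&currentPage={}'
           .format(item_id, spu_id, seller_id, page))
    response = requests.get(url=url, headers=headers, proxies=proxies,
                            allow_redirects=False, timeout=30)
    match = re.search(r'{"rateDetail":{"rateCount":{"total":.*"tags":\[]}}',
                      response.text)
    if match is None:
        return []
    return json.loads(match.group())['rateDetail']['rateList']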
class Spider(object): """ 这是一个爬虫模板 """ def __init__(self): # 时间部分,按小时抓取 date_time = str(datetime.now() - timedelta(days=1)).split('.')[0] start_time_test = time.strftime('%Y-%m-%d 00:00:00') end_time = time.strftime('%Y-%m-%d %H:%M:%S') a = end_time.split(' ')[1].split(':')[0] if a == '00': start_time_data = date_time hours_name = '22_24' wen_jian_jia_date = str(datetime.now() - timedelta( days=1)).split('.')[0].split(' ')[0].replace('-', '') else: two_hours_ago = int(a) - 2 if len(str(two_hours_ago)) == 1: two_hour_ago = '0' + str(two_hours_ago) else: two_hour_ago = str(two_hours_ago) hours_name = str(two_hour_ago) + '_' + str(a) start_time_data = start_time_test wen_jian_jia_date = time.strftime('%Y%m%d') print('爬取时间段:{}到{}'.format(start_time_data, end_time)) logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time)) # 定义开始时间 y-m-d 离现在时间远 news_start_time self.start_time = start_time_data # 定义结束时间 y-m-d 离现在时间近 yesterday self.end_time = end_time # 标记爬虫工作 self.is_break = False self.h2_name = hours_name self.date_time = wen_jian_jia_date # 链接hdfs self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******') self.hdfsclient.makedirs( '/user/cspider_daily/nike_2h/video/{}/{}'.format( wen_jian_jia_date, hours_name)) # 创建每日文件夹 self.hdfsclient.makedirs( '/user/cspider_daily/nike_2h/videocomments/{}/{}'.format( wen_jian_jia_date, hours_name)) # 创建每日文件夹 self.time_data = str(time.time()).split('.')[0] # 替换所有的HTML标签 def re_html(self, data): # 替换抓取数据中的html标签 .replace('[', '').replace(']', '').replace(',', '').replace("'", '') try: message = str(data) re_h = re.compile('</?\w+[^>]*>') # html标签 ret1 = re_h.sub('', message) ret2 = re.sub(r'\[', '', ret1) ret3 = re.sub(r'\]', '', ret2) ret4 = re.sub(r',', '', ret3) ret5 = re.sub(r"'", '', ret4) ret6 = re.sub(r' ', '', ret5) return ret6 except: pass # 过滤非数字 def re_not_number(self, data): try: message = str(data) ret1 = re.sub(r'\D', '', message) return ret1 except: pass # 匹配具体时间 def clean_date(self, x): now = datetime.now() if str(x).find('昨天') != -1: x = datetime.strftime(now + timedelta(days=-1), '%Y-%m-%d %H:%M:%S') elif str(x).find('前天') != -1: x = datetime.strftime(now + timedelta(days=-2), '%Y-%m-%d %H:%M:%S') elif str(x).find('天前') != -1: x = datetime.strftime( now + timedelta(days=-int(str(x).replace('天前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('小时前') != -1: x = datetime.strftime( now + timedelta(hours=-int(str(x).replace('小时前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('分钟前') != -1: x = datetime.strftime( now + timedelta(minutes=-int(str(x).replace('分钟前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('今天') != -1: x = str(x).replace('今天', now.strftime('%Y-%m-%d') + ' ') elif str(x).find('刚刚') != -1: x = now.strftime('%Y-%m-%d %H:%M:%S') elif str(x).find('秒前') != -1: x = now.strftime('%Y-%m-%d %H:%M:%S') elif str(x).find('月前') != -1: x = datetime.strftime( now + timedelta(weeks=-4 * int(str(x).replace('月前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('周前') != -1: x = datetime.strftime( now + timedelta(weeks=-int(str(x).replace('周前', ''))), '%Y-%m-%d %H:%M:%S') elif str(x).find('[') != -1: x = x.replace('[', '').replace(']', '') elif str(x).find('月') != -1: x = x.replace('月', '-').replace('日', '') return x # 根据关键词搜索请求得到商品第一页信息 def parse_page1(self, key_word): try: # 根据关键词,例如:洗发水,抓取商品信息 url = 'https://www.meipai.com/search/all?q={}'.format(key_word) # print(url) headers = { 'Content-Type': 'text/html; charset=utf-8', 'Cookie': 'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; 
pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565584700; searchStr=AJ%7C361%E5%BA%A6%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E8%80%90%E5%85%8B%7C', 'Host': 'www.meipai.com', 'Pragma': 'no-cache', # 'Referer': '{}'.format(url), 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } try: # time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: # time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: # time.sleep(0.1) response = requests.get(url=url, headers=headers, allow_redirects=False, timeout=30) # print(response.text) # 将响应转换成一个element对象 html = etree.HTML(response.text) # 获取视频列表信息 video_data_list = html.xpath('//ul[@id="mediasList"]/li') video_dict = dict() # logger.log(31, '***********关键词%s的*****第1页数据***********' % (key_word)) for node in video_data_list: video_dict['platform'] = '美拍' video_dict['title'] = node.xpath('./img/@alt')[0].replace( '\n', ' ') video_dict['keyword'] = key_word video_dict['url'] = 'https://www.meipai.com' + node.xpath( './div[1]/a/@href')[0] video_dict['imageurl'] = 'https:' + node.xpath('./img/@src')[0] video_dict['audiourl'] = video_dict['url'] video_dict['topic_id'] = str( self.re_not_number(node.xpath('./div[1]/a/@href')[0])) video_dict['source_author'] = node.xpath( './div[2]/p/a/@title')[0] video_dict['上传者url'] = 'https://www.meipai.com' + node.xpath( './div[2]/p/a/@href')[0] video_dict['author_id'] = str( self.re_not_number(node.xpath('./div[2]/p/a/@href')[0])) video_dict['categroy'] = '' # print(video_dict) response.close() self.parse_video_data(video_dict, key_word) except: print(111111111111111111111111, traceback.format_exc()) # 根据关键词搜索请求得到商品后1页信息 def parse_page2(self, key_word): try: for i in range(2, 3): # 根据关键词,例如:洗发水,抓取商品信息 url = 'https://www.meipai.com/search/mv?q={}&page={}&fromAll=1'.format( key_word, i) # logger.log(31, '***********关键词%s的******第%s页数据*********** + %s' % (key_word, i, url)) # print(url) headers = { 'Content-Type': 'text/html; charset=utf-8', 'Cookie': 'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565584700; searchStr=AJ%7C361%E5%BA%A6%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E8%80%90%E5%85%8B%7C', 'Host': 'www.meipai.com', 'Pragma': 'no-cache', # 'Referer': url, 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } try: # time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: # time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: # time.sleep(0.1) response = requests.get(url=url, headers=headers, allow_redirects=False, timeout=30) # 将响应转换成一个element对象 html = etree.HTML(response.text) # 判断是否有视频数据 test_data = html.xpath( '//div[@class="search-result-null 
break"]/p//text()') if test_data != []: break else: # 获取视频列表信息 video_data_list = html.xpath('//ul[@id="mediasList"]/li') video_dict = dict() for node in video_data_list: video_dict['platform'] = '美拍' video_dict['title'] = node.xpath( './img/@alt')[0].replace('\n', ' ').replace( '\r', ' ').replace('\u200b', '').replace( '\u200e', '').replace('\u200c', '') video_dict['keyword'] = key_word video_dict[ 'url'] = 'https://www.meipai.com' + node.xpath( './div[1]/a/@href')[0] video_dict['imageurl'] = 'https:' + node.xpath( './img/@src')[0] video_dict['audiourl'] = video_dict['url'] video_dict['topic_id'] = str( self.re_not_number( node.xpath('./div[1]/a/@href')[0])) video_dict['source_author'] = node.xpath( './div[2]/p/a/@title')[0] video_dict[ '上传者url'] = 'https://www.meipai.com' + node.xpath( './div[2]/p/a/@href')[0] video_dict['author_id'] = str( self.re_not_number( node.xpath('./div[2]/p/a/@href')[0])) video_dict['categroy'] = '' # print(video_dict) response.close() self.parse_video_data(video_dict, key_word) except: print(222222222222222222222222, traceback.format_exc()) # 进入视频页面,抓取数据信息 def parse_video_data(self, video_dict, key_word): try: url = video_dict['url'] headers = { 'Content-Type': 'text/html; charset=utf-8', 'Cookie': 'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565584700; searchStr=AJ%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E6%9D%8E%E5%AE%81%7C%E8%80%90%E5%85%8B%7C361%E5%BA%A6%7C', 'Host': 'www.meipai.com', 'Pragma': 'no-cache', # 'Referer': 'https://www.meipai.com/search/all?q={}'.format(key_word), 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' } # logger.log(31, '视频链接是: %s' % url) try: # time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: try: # time.sleep(0.1) response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30) except: # time.sleep(0.1) response = requests.get(url=url, headers=headers, allow_redirects=False, timeout=30) # 将响应转换成一个element对象 html = etree.HTML(response.text) # 发布日期时间数据 date_time_data = html.xpath( '//div[@class="detail-time pa"]/strong/text()')[0] data = self.clean_date(date_time_data) # print(data) date_data = data.split(' ')[0] time_data = data.split(' ')[1] if len(date_data.split('-')) == 3: video_dict['date'] = date_data.strip() else: video_dict['date'] = (time.strftime('%Y') + '-' + date_data).strip() if len(time_data.split(':')) == 3: video_dict['time'] = time_data.strip() else: video_dict['time'] = (time_data + ':00').strip() test_date_time = video_dict['date'] + ' ' + video_dict['time'] # print(test_date_time) if self.start_time <= test_date_time <= self.end_time: # 视频描述 content = html.xpath( '//div[@class="detail-info pr"]/h1//text()') # print('99999999999999999999999999999999999999999999999999999') # print(content) try: video_dict['description'] = self.re_html(content).replace( '\\u200d', '').replace('\\n', '') except: video_dict['description'] = '' video_dict['content_id'] = video_dict['topic_id'] # print(video_dict['description']) video_dict['clicks'] = '' # 播放量 try: play_data = html.xpath( '//div[@class="detail-location"]/text()')[1].replace( 
    # Fetch the author's follower count, then write the post record to HDFS
    def parse_followers_count(self, video_dict):
        try:
            url = video_dict['上传者url']
            # print(url)
            headers = {
                'Content-Type': 'text/html; charset=utf-8',
                'Cookie': 'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; searchStr=AJ%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E6%9D%8E%E5%AE%81%7C%E8%80%90%E5%85%8B%7C361%E5%BA%A6%7C; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565590100',
                'Host': 'www.meipai.com',
                'Pragma': 'no-cache',
                # 'Referer': url,
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                # time.sleep(0.1)
                response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
            except:
                try:
                    # time.sleep(0.1)
                    response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                except:
                    # time.sleep(0.1)
                    response = requests.get(url=url, headers=headers, allow_redirects=False, timeout=30)
            # Parse the response into an lxml element tree
            html = etree.HTML(response.text)
            # Follower count
            followers_count = html.xpath('//div[@class="user-num"]/a[4]/span[1]//text()')
            video_dict['followers_count'] = self.re_html(followers_count).replace(' ', '')
            video_dict['file_code'] = '165'
            video_data = video_dict.pop('上传者url')  # drop the uploader url before writing
            # logger.log(31, '----- start writing the video post data -----')
            # print(video_dict)
            response.close()
            # self.write_topic_jsonfile(video_dict)
            item = json.dumps(dict(video_dict), ensure_ascii=False) + '\n'
            self.hdfsclient.new_write(
                '/user/cspider_daily/nike_2h/video/{}/{}/165_{}_{}_MeiPai_Nike.json'.format(
                    self.date_time, self.h2_name, time.strftime('%Y%m%d'), self.time_data),
                item, encoding='utf-8')
            if int(video_dict['comments_count']) == 0:
                logger.log(31, 'This post has no video comments.')
            else:
                pages = int(math.ceil(float(int(video_dict['comments_count']) / 10)))
                # logger.log(31, '~~~~~ video reply count: %s , total reply pages: %s ~~~~~' % (video_dict['comments_count'], pages))
                self.parse_comment_data(video_dict, pages)
        except:
            print(4444444444444444444, traceback.format_exc())
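    # Sketch only (not called by the original code): both HDFS write sites
    # serialise a dict to a single JSON line and append it through the
    # project's HdfsClient wrapper; the shared part could look like this.
    # `hdfs_path` is whatever target path the caller builds, and `new_write`
    # is assumed to behave exactly as in the calls above and below.
    def _write_json_line(self, hdfs_path, record):
        line = json.dumps(dict(record), ensure_ascii=False) + '\n'
        self.hdfsclient.new_write(hdfs_path, line, encoding='utf-8')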
    # Fetch the video's comment/reply data
    def parse_comment_data(self, video_dict, pages):
        try:
            is_break = self.is_break
            headers = {
                'Content-Type': 'application/json; charset=utf-8',
                'Cookie': 'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; searchStr=AJ%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E6%9D%8E%E5%AE%81%7C%E8%80%90%E5%85%8B%7C361%E5%BA%A6%7C; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565590100',
                'Host': 'www.meipai.com',
                'Pragma': 'no-cache',
                # 'Referer': video_dict['url'],
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest'
            }
            for i in range(1, int(pages) + 1):
                url = 'https://www.meipai.com/medias/comments_timeline?page={}&count=10&id={}'.format(
                    i, video_dict['topic_id'])
                # print(url)
                try:
                    # time.sleep(0.1)
                    response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                except:
                    try:
                        # time.sleep(0.1)
                        response = requests.get(url=url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                    except:
                        # time.sleep(0.1)
                        response = requests.get(url=url, headers=headers, allow_redirects=False, timeout=30)
                comments_data = json.loads(response.text)
                for item in comments_data:
                    video_comment = dict()  # build a fresh dict per comment
                    date_time_data = item['created_at']
                    # print(date_time_data, self.clean_date(date_time_data))
                    date_data = self.clean_date(date_time_data).split(' ')[0]
                    time_data = self.clean_date(date_time_data).split(' ')[1]
                    if len(date_data.split('-')) == 3:
                        date_data_test = date_data.strip()
                    else:
                        date_data_test = (time.strftime('%Y') + '-' + date_data).strip()
                    if len(time_data.split(':')) == 3:
                        time_data_test = time_data.strip()
                    else:
                        time_data_test = (time_data + ':00').strip()
                    date_time_test = date_data_test + ' ' + time_data_test
                    if self.start_time <= date_time_test <= self.end_time:
                        video_comment['platform'] = video_dict['platform']
                        video_comment['source_date'] = video_dict['date']
                        video_comment['source_time'] = video_dict['time']
                        video_comment['date'] = date_data_test
                        video_comment['time'] = time_data_test
                        video_comment['title'] = video_dict['title']
                        video_comment['author'] = item['user']['screen_name']
                        video_comment['author_id'] = str(item['user']['id'])
                        video_comment['content'] = item['content_origin']
                        video_comment['content_id'] = str(item['id'])
                        video_comment['floor'] = ''
                        video_comment['keyword'] = video_dict['keyword']
                        video_comment['comment_url'] = url
                        video_comment['source_url'] = video_dict['url']
                        video_comment['comments_count'] = ''
                        video_comment['likes'] = ''
                        video_comment['views'] = ''
                        video_comment['reposts_count'] = ''
                        video_comment['topic_id'] = video_dict['topic_id']
                        video_comment['imageurl'] = ''
                        video_comment['audiourl'] = ''
                        video_comment['file_code'] = '166'
                        # logger.log(31, '----- start writing the video reply data -----')
                        # print(video_comment)
                        # self.write_comment_jsonfile(video_comment)
                        response.close()
                        # use a separate name so the loop variable `item` is not clobbered
                        record = json.dumps(dict(video_comment), ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_2h/videocomments/{}/{}/166_{}_{}_MeiPai_nike.json'.format(
                                self.date_time, self.h2_name, time.strftime('%Y%m%d'), self.time_data),
                            record, encoding='utf-8')
                    # stop paging once a comment falls before the start of the window
                    # (compare the full timestamp against the window start)
                    if date_time_test < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(5555555555555555555555, traceback.format_exc())

    # Read the keyword list from the Excel workbook
    def parse_xlsx(self):
        # Path to the keyword workbook
        path = './快消采集关键词_v12_20200119.xlsx'
        # Open the workbook
        workbook = xlrd.open_workbook(path)
        # Get the sheet content by index (or by name)
        Data_sheet = workbook.sheets()[0]  # by index
        rowNum = Data_sheet.nrows  # number of rows
        colNum = Data_sheet.ncols  # number of columns
        # Read every cell (renamed from `list` to avoid shadowing the builtin)
        rows = []
        for i in range(rowNum):
            rowlist = []
            for j in range(colNum):
                rowlist.append(Data_sheet.cell_value(i, j))
            rows.append(rowlist)
        for data in rows[1:]:
            brand = data[0]
            # print(brand)
            yield {
                '关键词': brand,
            }

    def run(self):
        key_word_list = []
        for item in self.parse_xlsx():
            # print(item)
            key_word_list.append(item)
        for item_data1 in key_word_list:
            print(item_data1['关键词'])
            self.parse_page1(item_data1['关键词'])
            self.parse_page2(item_data1['关键词'])
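

# Usage sketch (an assumption, not part of the original file): when the module
# is run directly, build the spider and walk the keyword list read by
# parse_xlsx(); Spider() takes no arguments in this variant.
if __name__ == '__main__':
    spider = Spider()
    spider.run()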