def get_user_id(self):
    mysql_command = MySQLCommand()
    mysql_command.connectdb()
    mysql_command.cursor.execute("select userId, playlistCount from user")
    user_list = mysql_command.cursor.fetchall()
    for userinfo in user_list:
        user_id = userinfo['userId']
        playlistCount = userinfo['playlistCount']
        if playlistCount is not None:
            # Guard against empty strings so int() below cannot fail.
            playlistCount = playlistCount.strip() or 0
        else:
            playlistCount = 0
        if len(user_id) > 0:
            self.music_task.put(user_id)
            self.list_task.put([user_id, int(playlistCount)])
            time.sleep(2)
def save_user_info(self):
    mysql_command = MySQLCommand()
    mysql_command.connectdb()
    while True:
        result = self.user_queue.get()
        # print('Crawled user result: ', result)
        mysql_command.insert_user(result)
def save_sql(self):
    mysql_command = MySQLCommand()
    mysql_command.connectdb()
    while True:
        ids = self.id_queue.get()
        print('数据为:\n', ids)
        mysql_command.update_list(ids)
def save_music_list(self):
    mysql_command = MySQLCommand()
    mysql_command.connectdb()
    while True:
        result = self.list_queue.get()
        # print('Crawled playlist result: ', result)
        mysql_command.insert_list(result)
def __init__(self):
    # Genre category ids of the artist catalogue pages.
    self.list1 = [
        1001, 1002, 1003, 2001, 2002, 2003, 6001, 6002, 6003,
        7001, 7002, 7003, 4001, 4002, 4003
    ]
    # Values of the `initial` query parameter.
    self.list2 = [
        -1, 0, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
        79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90
    ]
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': '_iuqxldmzr_=32; _ntes_nnid=0e6e1606eb78758c48c3fc823c6c57dd,1527314455632; '
                  '_ntes_nuid=0e6e1606eb78758c48c3fc823c6c57dd; __utmc=94650624; __utmz=94650624.1527314456.1.1.'
                  'utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); WM_TID=blBrSVohtue8%2B6VgDkxOkJ2G0VyAgyOY;'
                  ' JSESSIONID-WYYY=Du06y%5Csx0ddxxx8n6G6Dwk97Dhy2vuMzYDhQY8D%2BmW3vlbshKsMRxS%2BJYEnvCCh%5CKY'
                  'x2hJ5xhmAy8W%5CT%2BKqwjWnTDaOzhlQj19AuJwMttOIh5T%5C05uByqO%2FWM%2F1ZS9sqjslE2AC8YD7h7Tt0Shufi'
                  '2d077U9tlBepCx048eEImRkXDkr%3A1527321477141; __utma=94650624.1687343966.1527314456.1527314456'
                  '.1527319890.2; __utmb=94650624.3.10.1527319890',
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/66.0.3359.181 Safari/537.36'
    }
    self.mysqlCommand = MySQLCommand()
    self.mysqlCommand.connectdb()
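# The two lists above drive the artist catalogue crawl: each genre id in list1 is
# paired with each `initial` value in list2 to build one catalogue page URL, exactly
# as SingerSpider.spider_main does further down. A small illustrative sketch with a
# handful of values (65-90 presumably select the initial letters A-Z, with -1 and 0
# covering the "hot" and "other" tabs):
def _demo_catalogue_urls():
    list1 = [1001, 1002]   # a couple of the genre category ids above
    list2 = [-1, 0, 65]    # a couple of the `initial` values above
    return ['http://music.163.com/discover/artist/cat?id=%s&initial=%s' % (i, j)
            for i in list1 for j in list2]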
class MusicSpider(object): def __init__(self): self.file_path = '../data/' # 用户信息保存位置 self.headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Cookie': 'JSESSIONID-WYYY=ByCd%2F1zHaA6%5CBqA%2BY6sxOkSFXycajAx3XuQyySu2buAYehwzXeZkRb1wscB8vUIg83pUvkMHO1SmtGIO3pKyySb%5CoxUpy9CUWWEo0hjRRszV%2FkqPsH%2B5PykExoVq9zQCZuwyQz4tQqCrvotiqb%5CO%5CA8cpWAqAQraI5NsvM5VY5KenvqS%3A1578052539036; _iuqxldmzr_=32; _ntes_nnid=6773350955c533de38f1625624ebe4f4,1578050739108; _ntes_nuid=6773350955c533de38f1625624ebe4f4; WM_NI=3NHJAjwsUDaG8r2TMyn128jA6fBbyickbyK%2FnunpTznOsK4Xk5AhevMS3EvW6tQsbNoSelxCjgnNNqWFyUEP%2B1e8SaaQ51OcjIxmvagcdyPMlC%2B%2BTwteRAImrcPzeEINM0U%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eed2d14a9596ae94f067a88e8ba2d14a929a9aaabb21bab2aba9c240b19bfdb7db2af0fea7c3b92af19288abc462b5ad9ba5e44dfcaefeb5d073aeeffed9e94bf6ba8e83fc63a1b5ae9aca25aeaba291d772ae91bdacb754a9eb8f89e87e8f8dfda6f55df6ac9f94e146ad8dab8dfb49aab9a2afcd7b959ab7b6c85ce9efabd9d26ba38ffbd2ce69aa97b88ef56ba5bdac9ad347b09de5ccd77db8bb9ea2cc67b2bda09be84f8b9283d1d837e2a3; WM_TID=mCNsKkYK71tBAQBBRFNtqjmPHS4pFjUG', 'Host': 'music.163.com', 'Referer': 'http://music.163.com/', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/66.0.3359.181 Safari/537.36' } self.data_list = pd.read_csv('../data/csv_file/music_spider.csv') self.num = 0 # 从第0首歌曲开始爬取 self.music = False self.cookie_path = '../data/cookie.txt' @staticmethod def _generate_random_strs(length): string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" i = 0 # 控制次数参数i # 初始化随机字符串 random_strs = "" while i < length: e = random.random() * len(string) # 向下取整 e = math.floor(e) random_strs = random_strs + list(string)[e] i = i + 1 return random_strs # AES加密 @staticmethod def _aes_encrypt(msg, key): padding = 16 - len(msg) % 16 # 如果不是16的倍数则进行填充(padding) msg = msg + padding * chr(padding) # 这里使用padding对应的单字符进行填充 iv = '0102030405060708' # 用来加密或者解密的初始向量(必须是16位) cipher = AES.new(key, AES.MODE_CBC, iv) encrypted_bytes = cipher.encrypt(msg) # 加密后得到的是bytes类型的数据 encode_strs = base64.b64encode(encrypted_bytes) # 使用Base64进行编码,返回byte字符串 enc_text = encode_strs.decode('utf-8') # 对byte字符串按utf-8进行解码 return enc_text # RSA加密 @staticmethod def _rsa_encrypt(random_strs, key, f): # 随机字符串逆序排列 string = random_strs[::-1] # 将随机字符串转换成byte类型数据 text = bytes(string, 'utf-8') seckey = int(codecs.encode(text, encoding='hex'), 16) ** int(key, 16) % int(f, 16) return format(seckey, 'x').zfill(256) # 获取参数 def get_params(self, id_msg, comment): # msg也可以写成msg = {"offset":"页面偏移量=(页数-1) * 20", "limit":"20"},offset和limit这两个参数必须有(js) # limit最大值为100,当设为100时,获取第二页时,默认前一页是20个评论,也就是说第二页最新评论有80个,有20个是第一页显示的 # msg = '{"rid":"R_SO_4_1302938992","offset":"0","total":"True","limit":"100","csrf_token":""}' # offset = (page-1) * 20 # msg = '{offset":' + str(offset) + ',"limit":"20"}' # msg = '{"rid":"R_SO_4_1302938992","offset":' + str(offset) + ',"total":"True","limit":"20","csrf_token":""}' key = '0CoJUm6Qyw8W8jud' if comment: offset = (id_msg - 1) * 20 # offset和limit是必选参数,其他参数是可选的,其他参数不影响data数据的生成 msg = '{"offset":' + str(offset) + ',"total":"True","limit":"20","csrf_token":""}' else: msg = '{id: ' + id_msg + ', lv: -1, tv: -1}' f = 
'00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7' e = '010001' enc_text = self._aes_encrypt(msg, key) # 生成长度为16的随机字符串 i = self._generate_random_strs(16) # 两次AES加密之后得到params的值 enc_text = self._aes_encrypt(enc_text, i) # RSA加密之后得到encSecKey的值 enc_seckey = self._rsa_encrypt(i, e, f) return enc_text, enc_seckey # 数据正则处理 def re_value(self, value): value = re.sub(r'\r|\n|\\|\'|\{|\}|\"', ' ', value) return value def check_headers(self): cookie_list = [] with open(self.cookie_path, 'r') as fp: for i in fp.readlines(): i = json.loads(i) cookie_list.append(i) self.headers['Cookie'] = random.choice(cookie_list)['cookie'] # 获取评论总数 def page_spider(self, music_id): url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_' + music_id + '?csrf_token=' page = 1 params, encSecKey = self.get_params(page, True) data = {'params': params, 'encSecKey': encSecKey} self.headers['Referer'] = 'https://music.163.com/song?id=%s' % music_id repeat = 0 while repeat < 8: try: if repeat > 5: self.check_headers() r = requests.post(url, headers=self.headers, data=data) time.sleep(repeat) r.encoding = "utf-8" if r.status_code == 200: # 返回json格式的数据 result = r.json() if 'total' in result.keys(): total = result['total'] return total else: return 0 else: repeat += 1 except Exception as e: print('ID为%s的歌曲评论总数获取失败, 原因是%s' % (music_id, e)) repeat += 1 def get_lynic(self, song_id): # params的长度为108,不要拿浏览器控制面板中的数据进行测试,那里的params长度为128,不符合 params, encSecKey = self.get_params(song_id, False) data = {'params': params, 'encSecKey': encSecKey} url = 'https://music.163.com/weapi/song/lyric?csrf_token=' repeat = 1 while repeat < 16: try: if repeat > 8: self.check_headers() r = requests.post(url, headers=self.headers, data=data) time.sleep(repeat) song = r.json() if 'uncollected' in song.keys() or 'lrc' in song.keys() or 'nolyric' in song.keys(): break else: if 'sgc' in song.keys(): if song['sgc']: break repeat += 1 print('第%d次获取ID为%s的歌曲歌词失败,请求太快!' 
% (repeat, song_id)) except Exception as e: print('第%d次获取ID为%s的歌曲歌词失败,原因%s' % (repeat, song_id, e)) repeat += 1 try: song_lynic = song['lrc']['lyric'] song_lynic = re.sub(r'\[(.*?)\]', '', song_lynic) song_lynic = re.sub(r'\n', ',', song_lynic) song_lynic = self.re_value(song_lynic) except Exception: # print(song_id) # print(song) song_lynic = '' try: id = song['lyricUser']['userid'] uptime = song['lyricUser']['uptime'] lynic_user = json.dumps({'user_id': id, 'uptime': uptime}) except Exception: lynic_user = json.dumps({'user_id': '', 'uptime': ''}) result = {'song_lynic': song_lynic, 'lynic_user': lynic_user} # print(result) return result # 获取歌曲详情 def get_music_info(self, music): music_dict = {} # lynic_result = self.get_lynic(music) lynic_result = {'song_lynic': '', 'lynic_user': ''} m_id = music music_dict['music_id'] = m_id simple_music = [] contain_list = [] url = 'https://music.163.com/song?id=%s' % m_id repeat = 0 while repeat < 5: try: response = requests.get(url, headers=self.headers) time.sleep(repeat) response = response.text soup = BeautifulSoup(response, 'html5lib') try: title = soup.find_all('div', attrs={'class': 'tit'})[0] title = title.find('em').text music_dict['music_name'] = title break except Exception as e: music_dict['music_name'] = '' print('未找到ID为%s歌曲的歌名,原因是%s' % (m_id, e)) repeat += 1 except Exception as e: print('第%d次获取ID为%s歌曲详情失败!原因是%s ' % (repeat, m_id, e)) repeat += 1 break try: for index, info in enumerate(soup.find_all('p', attrs={'class': 'des s-fc4'})): try: singer_id = info.find_all('span')[0].find_all('a')[0]['href'].replace('/artist?id=', '').strip() music_dict['singer_id'] = singer_id except Exception: try: album_id = info.find_all('a')[0]['href'].replace('/album?id=', '').strip() music_dict['album_id'] = album_id except: if index == 0: music_dict['singer_id'] = '' else: music_dict['album_id'] = '' except Exception as e: music_dict['singer_id'] = '' music_dict['album_id'] = '' print('ID为%s的歌曲的歌手和专辑信息获取失败,使用默认空值!失败原因是%s' % (m_id, e)) try: music_list = soup.find_all('ul', attrs={'class': 'm-rctlist f-cb'})[0] for info in music_list.find_all('li'): try: playlist = re.findall(r'playlist\?id=(.*?)" title', str(info))[0] creator_id = re.findall(r'/user/home\?id=(.*?)" title', str(info))[0] contain_list.append({'list': playlist, 'creator': creator_id}) except: print('歌单ID和创建者ID爬取异常!此信息为:', str(info)) except Exception as e: print('获取包含此歌的歌单信息失败!失败歌曲为: %s, 原因是%s' % (m_id, e)) music_dict['contain_list'] = json.dumps(contain_list) try: simple_m = soup.find_all('ul', attrs={'class': 'm-sglist f-cb'})[0] for music in simple_m.find_all('li', attrs={'class': 'f-cb'}): try: song_id = re.findall(r'/song\?id=(.*?)" title', str(music))[0] try: singer_id = re.findall(r'/artist\?id=(.*?)">', str(music))[0] except: try: singer_id = re.findall(r'title="(.*?)"><span', str(music))[0] except: singer_id = '' simple_music.append({'song': song_id, 'singer': singer_id}) except: print('歌曲ID和歌手ID爬取异常!此信息为:', str(music)) except Exception as e: print('获取于此歌相似的歌曲失败!失败歌曲为: %s, 原因是%s' % (m_id, e)) comment_num = self.page_spider(m_id) music_dict['comment_num'] = str(comment_num) music_dict['simple_music'] = json.dumps(simple_music) music_dict['song_lynic'] = lynic_result['song_lynic'] music_dict['lynic_user'] = lynic_result['lynic_user'] return music_dict # 重连数据库 def conn_music(self): self.mysqlMusic = MySQLCommand() self.mysqlMusic.connectdb() self.music = True # 保存歌曲信息 def save_music(self, num, result): try: self.mysqlMusic.insert_music(result) print(result) print('--->第%d首歌曲爬取完成<---' % 
num) except: self.music = False print('数据库异常,重新连接数据库...') # 从csv文件下发任务 def get_list_id(self): if self.music is False: self.conn_music() time.sleep(1) data = self.data_list['music_id'] num = 0 for task in data.values: task = str(task) print('爬取ID为 %s 的歌曲...' % task) if num >= self.num: result = self.get_music_info(task) while True: if self.music is False: self.conn_music() time.sleep(1) else: self.save_music(num, result) break num += 1
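# get_params above builds the params/encSecKey pair that the weapi endpoints expect:
# the JSON message is AES-CBC encrypted twice (first with a fixed key, then with a
# random 16-character key), and the random key is RSA-encrypted into encSecKey.
# A minimal standalone sketch of the same scheme, assuming pycryptodome supplies
# Crypto.Cipher.AES; the key, IV, exponent and modulus are the constants already
# used in this file.
def _demo_weapi_params(msg='{"offset":0,"total":"True","limit":"20","csrf_token":""}'):
    import base64
    import codecs
    import random
    from Crypto.Cipher import AES

    def aes_cbc_b64(text, key):
        pad = 16 - len(text) % 16                 # PKCS#7-style padding
        text = text + pad * chr(pad)
        cipher = AES.new(key.encode('utf-8'), AES.MODE_CBC, b'0102030405060708')
        return base64.b64encode(cipher.encrypt(text.encode('utf-8'))).decode('utf-8')

    def rsa_hex(text, e, n):
        data = bytes(text[::-1], 'utf-8')         # reverse the nonce, then textbook RSA
        return format(int(codecs.encode(data, encoding='hex'), 16) ** int(e, 16) % int(n, 16), 'x').zfill(256)

    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    nonce = ''.join(random.choice(alphabet) for _ in range(16))
    modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
    params = aes_cbc_b64(aes_cbc_b64(msg, '0CoJUm6Qyw8W8jud'), nonce)
    enc_sec_key = rsa_hex(nonce, '010001', modulus)
    return {'params': params, 'encSecKey': enc_sec_key}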
def save_user_info(self):
    mysql_command = MySQLCommand()
    mysql_command.connectdb()
    while True:
        result = self.user_result_queue.get()
        mysql_command.insert_user(result)
def user_conn(self):
    self.mysqlUser = MySQLCommand()
    self.mysqlUser.connectdb()
    self.conn_user = True


def result_conn(self):
    self.mysqlResult = MySQLCommand()
    self.mysqlResult.connectdb()
    self.conn_result = True


def task_conn(self):
    self.mysqlCommand = MySQLCommand()
    self.mysqlCommand.connectdb()
    self.conn_task = True


def conn_list(self):
    self.mysqlList = MySQLCommand()
    self.mysqlList.connectdb()
    self.list = True
def list_id(self):
    mysql_command = MySQLCommand()
    mysql_command.connectdb()
    mysql_command.cursor.execute("select id, userLikeId from music_list")
    list_ids = mysql_command.cursor.fetchall()
    for row in list_ids:
        userids = row.get('userLikeId')
        list_id = row.get('id')
        if userids is not None and len(userids) > 10:
            print('ID为%s的歌单已经更新' % list_id)
            continue
        if len(list_id) > 0:
            replace = 1
            print('正在爬取ID为%s 的歌单' % list_id)
            pid = list_id.strip()
            url = 'https://music.163.com/playlist?id=%s' % pid
            print('爬取的歌单url为:%s' % url)
            while replace < 10:
                msg = 0
                try:
                    headers = self.headers
                    res = requests.get(url, headers=headers)
                    time.sleep(replace + 5)
                    soup = BeautifulSoup(res.text, 'html5lib')
                    try:
                        music = soup.find('ul', attrs={'class': 'f-hide'})
                        music_id = music.find_all('li')
                    except Exception as e:
                        music_id = []
                        msg += 1
                        print('ID为%s的歌单没有歌曲!原因是%s' % (list_id, e))
                    try:
                        user = soup.find('ul', attrs={'class': 'm-piclist f-cb'})
                        user_id = user.find_all('li')
                    except Exception as e:
                        user_id = []
                        msg += 1
                        print('ID为%s的歌单没有喜欢的用户!原因是%s' % (list_id, e))
                    try:
                        simple_list = soup.findAll('div', attrs={'class': 'info'})
                    except Exception as e:
                        simple_list = []
                        msg += 1
                        print('ID为%s的歌单没有相关推荐或热门歌单!原因是%s' % (list_id, e))
                    if msg < 2:
                        try:
                            self.extract_id(list_id, music_id, user_id, simple_list)
                            break
                        except Exception as e:
                            print('失败!原因是%r' % e)
                            replace += 1
                    else:
                        replace += 2
                except Exception as e:
                    print('重试! %r' % e)
                    replace += 1
                    time.sleep(2)
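# list_id above depends on the playlist page keeping its current markup: track ids sit
# in a hidden <ul class="f-hide"> element and the subscribers in <ul class="m-piclist f-cb">.
# A tiny extraction sketch against made-up markup (the ids and titles below are
# placeholders, not real data):
def _demo_playlist_extract():
    from bs4 import BeautifulSoup
    html = ('<ul class="f-hide">'
            '<li><a href="/song?id=1001">song-a</a></li>'
            '<li><a href="/song?id=1002">song-b</a></li>'
            '</ul>')
    soup = BeautifulSoup(html, 'html5lib')
    songs = soup.find('ul', attrs={'class': 'f-hide'}).find_all('a')
    return [a['href'].replace('/song?id=', '') for a in songs]   # ['1001', '1002']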
def conn_singer(self):
    self.mysqlSinger = MySQLCommand()
    self.mysqlSinger.connectdb()
    self.singer = True


def conn_list_comm(self):
    self.mysqlLcomm = MySQLCommand()
    self.mysqlLcomm.connectdb()
    self.list_comm = True


def conn_comm(self):
    self.mysqlComment = MySQLCommand()
    self.mysqlComment.connectdb()
    self.comm = True


def conn_user(self):
    self.mysqlUser = MySQLCommand()
    self.mysqlUser.connectdb()
    self.user = True


def conn_music(self):
    self.mysqlMusic = MySQLCommand()
    self.mysqlMusic.connectdb()
    self.music = True
class SingerSpider(object):
    def __init__(self):
        # Genre category ids of the artist catalogue pages.
        self.list1 = [
            1001, 1002, 1003, 2001, 2002, 2003, 6001, 6002, 6003,
            7001, 7002, 7003, 4001, 4002, 4003
        ]
        # Values of the `initial` query parameter.
        self.list2 = [
            -1, 0, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
            79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90
        ]
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': '_iuqxldmzr_=32; _ntes_nnid=0e6e1606eb78758c48c3fc823c6c57dd,1527314455632; '
                      '_ntes_nuid=0e6e1606eb78758c48c3fc823c6c57dd; __utmc=94650624; __utmz=94650624.1527314456.1.1.'
                      'utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); WM_TID=blBrSVohtue8%2B6VgDkxOkJ2G0VyAgyOY;'
                      ' JSESSIONID-WYYY=Du06y%5Csx0ddxxx8n6G6Dwk97Dhy2vuMzYDhQY8D%2BmW3vlbshKsMRxS%2BJYEnvCCh%5CKY'
                      'x2hJ5xhmAy8W%5CT%2BKqwjWnTDaOzhlQj19AuJwMttOIh5T%5C05uByqO%2FWM%2F1ZS9sqjslE2AC8YD7h7Tt0Shufi'
                      '2d077U9tlBepCx048eEImRkXDkr%3A1527321477141; __utma=94650624.1687343966.1527314456.1527314456'
                      '.1527319890.2; __utmb=94650624.3.10.1527319890',
            'Host': 'music.163.com',
            'Referer': 'http://music.163.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/66.0.3359.181 Safari/537.36'
        }
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()

    # Fetch one artist's homepage id and top-song dictionary
    def get_singer_info(self, artist_id):
        song_dict = dict()  # the artist's hot songs: {song_id: song_name}
        try:
            url = 'https://music.163.com/artist?id=' + artist_id
            r = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(r.text, 'html5lib')
            try:
                singer_homepage = soup.find('a', attrs={'class': 'btn-rz f-tid'})
                singer_homepage = singer_homepage['href'].replace('/user/home?id=', '').strip()
            except:
                singer_homepage = ''
            try:
                song_list = str(soup.find_all('ul', attrs={'class': 'f-hide'}))
                song_list = BeautifulSoup(song_list, 'html5lib')
                song_list = song_list.find_all('a')
            except:
                song_list = []
            for song in song_list:
                song_name = song.string
                song_id = song['href'].replace('/song?id=', '').strip()
                song_dict[song_id] = song_name
            song_dict = str(song_dict)
            song_dict = json.dumps(song_dict)
            return singer_homepage, song_dict
        except:
            return '', json.dumps({})

    # Crawl every artist listed on one catalogue page
    def get_all_singer(self, url):
        r = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(r.text, 'html5lib')
        for artist in soup.find_all('a', attrs={'class': 'nm nm-icn f-thide s-fc0'}):
            artist_name = artist.string
            artist_id = artist['href'].replace('/artist?id=', '').strip()
            singer_homepage, song_dict = self.get_singer_info(artist_id)
            print(artist_id, artist_name, singer_homepage)
            try:
                self.mysqlCommand.insert_singer(artist_id, artist_name, singer_homepage, song_dict)
            except Exception as msg:
                print(msg)

    # Spider entry point: walk every genre id crossed with every initial value
    def spider_main(self):
        print('开始爬取歌手信息...')
        for index, i in enumerate(self.list1):
            for j in self.list2:
                url = 'http://music.163.com/discover/artist/cat?id=' + str(i) + '&initial=' + str(j)
                self.get_all_singer(url)
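# A minimal driver sketch for SingerSpider, assuming MySQLCommand.insert_singer and
# the singer table are set up as used by get_all_singer above:
def _demo_run_singer_spider():
    spider = SingerSpider()
    spider.spider_main()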
class ListCommSpider(): def __init__(self): self.headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Cookie': 'WM_TID=36fj4OhQ7NdU9DhsEbdKFbVmy9tNk1KM; _iuqxldmzr_=32; _ntes_nnid=26fc3120577a92f179a3743269d8d0d9,1536048184013; _ntes_nuid=26fc3120577a92f179a3743269d8d0d9; __utmc=94650624; __utmz=94650624.1536199016.26.8.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); WM_NI=2Uy%2FbtqzhAuF6WR544z5u96yPa%2BfNHlrtTBCGhkg7oAHeZje7SJiXAoA5YNCbyP6gcJ5NYTs5IAJHQBjiFt561sfsS5Xg%2BvZx1OW9mPzJ49pU7Voono9gXq9H0RpP5HTclE%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eed5cb8085b2ab83ee7b87ac8c87cb60f78da2dac5439b9ca4b1d621f3e900b4b82af0fea7c3b92af28bb7d0e180b3a6a8a2f84ef6899ed6b740baebbbdab57394bfe587cd44b0aebcb5c14985b8a588b6658398abbbe96ff58d868adb4bad9ffbbacd49a2a7a0d7e6698aeb82bad779f7978fabcb5b82b6a7a7f73ff6efbd87f259f788a9ccf552bcef81b8bc6794a686d5bc7c97e99a90ee66ade7a9b9f4338cf09e91d33f8c8cad8dc837e2a3; JSESSIONID-WYYY=G%5CSvabx1X1F0JTg8HK5Z%2BIATVQdgwh77oo%2BDOXuG2CpwvoKPnNTKOGH91AkCHVdm0t6XKQEEnAFP%2BQ35cF49Y%2BAviwQKVN04%2B6ZbeKc2tNOeeC5vfTZ4Cme%2BwZVk7zGkwHJbfjgp1J9Y30o1fMKHOE5rxyhwQw%2B%5CDH6Md%5CpJZAAh2xkZ%3A1536204296617; __utma=94650624.1052021654.1536048185.1536199016.1536203113.27; __utmb=94650624.12.10.1536203113', 'Host': 'music.163.com', 'Referer': 'http://music.163.com/', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/66.0.3359.181 Safari/537.36'} self.host_path = '../data/host.txt' self.cookie_path = '../data/cookie.txt' self.ip_queue = Queue() self.save_queue = Queue() # 结果队列 self.task_queue = Queue() # 任务队列 self.save_user_queue = Queue() # 评论人队列 self.conn_task = False self.conn_result = False self.conn_user = False self.prosiex_start = True # 是否启动代理IP爬取线程 self.num = 0 # 从第0个歌单开始爬取 self.listid = pd.read_csv('/Users/apple/PycharmProjects/WYY_sprider/demo/musicList.csv', dtype={'list_id': str, 'user_id': str}) # 重连数据库 def task_conn(self): self.mysqlCommand = MySQLCommand() self.mysqlCommand.connectdb() self.conn_task = True def result_conn(self): self.mysqlResult = MySQLCommand() self.mysqlResult.connectdb() self.conn_result = True def user_conn(self): self.mysqlUser = MySQLCommand() self.mysqlUser.connectdb() self.conn_user = True # 生成16个随机字符 def generate_random_strs(self, length): string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" # 控制次数参数i i = 0 # 初始化随机字符串 random_strs = "" while i < length: e = random.random() * len(string) # 向下取整 e = math.floor(e) random_strs = random_strs + list(string)[e] i = i + 1 return random_strs # AES加密 def AESencrypt(self, msg, key): # 如果不是16的倍数则进行填充(paddiing) padding = 16 - len(msg) % 16 # 这里使用padding对应的单字符进行填充 msg = msg + padding * chr(padding) # 用来加密或者解密的初始向量(必须是16位) iv = '0102030405060708' cipher = AES.new(key, AES.MODE_CBC, iv) # 加密后得到的是bytes类型的数据 encryptedbytes = cipher.encrypt(msg) # 使用Base64进行编码,返回byte字符串 encodestrs = base64.b64encode(encryptedbytes) # 对byte字符串按utf-8进行解码 enctext = encodestrs.decode('utf-8') return enctext # RSA加密 def RSAencrypt(self, randomstrs, key, f): # 随机字符串逆序排列 string = randomstrs[::-1] # 将随机字符串转换成byte类型数据 text = bytes(string, 'utf-8') seckey = int(codecs.encode(text, encoding='hex'), 16) ** int(key, 16) % int(f, 16) return format(seckey, 'x').zfill(256) # 获取参数 def get_params(self, page): # msg也可以写成msg = {"offset":"页面偏移量=(页数-1) * 20", 
"limit":"20"},offset和limit这两个参数必须有(js) # limit最大值为100,当设为100时,获取第二页时,默认前一页是20个评论,也就是说第二页最新评论有80个,有20个是第一页显示的 # msg = '{"rid":"R_SO_4_1302938992","offset":"0","total":"True","limit":"100","csrf_token":""}' # 偏移量 offset = (page - 1) * 20 # offset和limit是必选参数,其他参数是可选的,其他参数不影响data数据的生成 msg = '{"offset":' + str(offset) + ',"total":"True","limit":"20","csrf_token":""}' key = '0CoJUm6Qyw8W8jud' f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7' e = '010001' enctext = self.AESencrypt(msg, key) # 生成长度为16的随机字符串 i = self.generate_random_strs(16) # 两次AES加密之后得到params的值 encText = self.AESencrypt(enctext, i) # RSA加密之后得到encSecKey的值 encSecKey = self.RSAencrypt(i, e, f) return encText, encSecKey def check_headers(self): cookie_list = [] with open(self.cookie_path, 'r') as fp: for i in fp.readlines(): i = json.loads(i) cookie_list.append(i) self.headers['Cookie'] = random.choice(cookie_list)['cookie'] # 检查代理IP是否可用 def check_ip(self, proxies): try: header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/64.0.3282.186 Safari/537.36'} ip = '://' + proxies['ip'] + ':' + proxies['port'] proxies = {'https': 'https' + ip} url = 'https://www.ipip.net/' r = requests.get(url, headers=header, proxies=proxies, timeout=5) r.raise_for_status() except: return False else: print(proxies, '检查通过!') return True # 生成IP代理 def ip_proxies(self): api = 'http://www.xicidaili.com/wn/{}' header = { 'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTZlOTVjNGQ1MmUxMDlmNzhlNjkwMDU3MDUxMTQ4YTUwBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUpRcU9ySVRNcmlOTytuNm9ZWm53RUFDYzhzTnZCbGlNa0ZIaHJzancvZEU9BjsARg%3D%3D--742b1937a06cc747483cd594752ef2ae80fc4d91; Hm_lvt_0cf76c77469e965d2957f0553e6ecf59=1577952296; Hm_lpvt_0cf76c77469e965d2957f0553e6ecf59=1578016572', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/' '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 'Host': 'www.xicidaili.com', 'Connection': 'keep-alive', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Cache-Control': 'no-cache'} fp = open(self.host_path, 'a+', encoding=('utf-8')) self.ip_pool = [] for i in range(20): api = api.format(1) respones = requests.get(url=api, headers=header) time.sleep(3) soup = BeautifulSoup(respones.text, 'html.parser') container = soup.find_all(name='tr', attrs={'class': 'odd'}) for tag in container: try: con_soup = BeautifulSoup(str(tag), 'html.parser') td_list = con_soup.find_all('td') ip = str(td_list[1])[4:-5] port = str(td_list[2])[4:-5] _type = td_list[5].text IPport = {'ip': ip, 'port': port, 'type': _type.lower()} if self.check_ip(IPport): IPport = json.dumps(IPport) self.ip_pool.append(IPport) fp.write(IPport) fp.write('\n') self.ip_queue.put(IPport) except Exception as e: print('No IP!') if self.prosiex_start is False: break fp.close() # 从host.txt中读取代理 def ip_txt(self): print('IP代理爬取不够,从host.txt中添加...') with open(self.host_path, 'r') as fp: ip_port = fp.readlines() for i in ip_port: self.ip_pool.append(i) self.ip_queue.put(i) def get_comments_json(self, url, data): repeat = 0 while repeat < 4: try: r = requests.post(url, headers=self.headers, data=data) 
time.sleep(repeat+2) r.encoding = "utf-8" if r.status_code == 200: # 返回json格式的数据 result = r.json() if 'total' in result.keys(): total = result['total'] repeat = 0 self.ip_pool = [] return result, total elif 'code' in result.keys(): if result['code'] == -460: if repeat < 3: self.check_headers() else: if len(self.ip_pool) < 10: Thread(target=self.ip_proxies, args=()).start() if len(self.ip_pool) < 10: self.ip_txt() result, total = self.ip_spider(url, data) if result is None: self.prosiex_start = False for i in range(90000): print('\r IP可能被封,代理IP不可用!需要等待' + str(90000 - i) + '秒...', sep=' ', end='', flush=True) time.sleep(1) self.prosiex_start = True else: self.prosiex_start = True return result, total repeat += 1 except: time.sleep(1) repeat += 1 print("第%d次爬取url为%s 的页面失败!正重新尝试..." % (repeat, url)) return None, None # 使用代理爬取 def ip_spider(self, url, data): repeat = 0 while repeat < 50: proxies = self.ip_queue.get() proxies = json.loads(proxies) ip = '://' + proxies['ip'] + ':' + proxies['port'] proxies = {'https': 'https' + ip} print('使用的代理IP为:', proxies) try: r = requests.post(url, headers=self.headers, data=data, proxies=proxies) time.sleep(2) try: r.encoding = 'utf-8' result = r.json() except Exception as e: print('错误:', e) return r, None if 'code' in result.keys(): if result['code'] == -460: repeat += 1 print('%r的IP代理不可用, 访问URL为%s的网页失败!原因是%s, 重试第%d次' % (proxies, url, result, repeat + 1)) if 'total' in result.keys(): total = result['total'] print('result: ', result) return result, total except Exception as e: print('IP代理为%r, 访问URL为%s的网页失败!原因是%s, 重试第%d次' % (proxies, url, e, repeat+1)) repeat += 1 print('返回的是none') return None, None # 数据正则处理 def re_value(self, value): value = re.sub(r'\r|\n|\\|\'|\{|\}|\"', ' ', value) return value # 获取热门评论 def hot_comments(self, html, list_id, pages, total, creater_id): try: print("正在获取歌单{}的热门评论,总共有{}页{}条评论!".format(list_id, pages, total)) if 'hotComments' in html: for item in html['hotComments']: # 提取发表热门评论的用户名 user = item['user'] if item['content'] is not None: comment = self.re_value(item['content']) else: comment = '' # 写入文件 hot_comment = {'hot_comment': '1', 'user_id': str(user['userId']), 'comment': comment, 'likedCount': str(item['likedCount']), 'time': str(item['time']), 'list_id': list_id, 'creater_id': creater_id} self.save_user_queue.put(str(user['userId'])) # 回复评论 reply_comment = [] if len(item['beReplied']) != 0: for reply in item['beReplied']: # 提取发表回复评论的用户名 reply_user = reply['user'] if reply['content'] is not None: content = self.re_value(reply['content']) else: content = '' reply_comment.append({'user_id': reply_user['userId'], 'content': content}) self.save_user_queue.put(str(reply_user['userId'])) hot_comment['reply'] = str(reply_comment) self.save_queue.put(hot_comment) except Exception as e: print('获取歌单{}的评论失败,原因是{}'.format(list_id, e)) return False # 获取普通评论 def comments(self, html, list_id, i, pages, total, creater_id): # try: print("正在获取歌单{}的第{}页评论,总共有{}页{}条评论!".format(list_id, i, pages, total)) # 全部评论 for item in html['comments']: # 提取发表评论的用户名 user = item['user'] if item['content'] is not None: comment = self.re_value(item['content']) else: comment = '' comment = {'hot_comment': '0', 'user_id': str(user['userId']), 'comment': comment, 'likedCount': str(item['likedCount']), 'time': str(item['time']), 'list_id': list_id, 'creater_id': creater_id} self.save_user_queue.put(str(user['userId'])) # 回复评论 reply_comment = [] if len(item['beReplied']) != 0: for reply in item['beReplied']: # 提取发表回复评论的用户名 reply_user = reply['user'] if 
reply['content'] is not None: content = self.re_value(reply['content']) else: content = '' reply_comment.append({'user_id': reply_user['userId'], 'content': content}) self.save_user_queue.put(str(reply_user['userId'])) comment['reply'] = str(reply_comment) self.save_queue.put(comment) return True # except Exception as e: # print('获取歌单{}的第{}页评论失败,原因是{}'.format(list_id, i, e)) # return False def page_spider(self): while True: list_id, creater_id = self.task_queue.get() print('开始爬取ID为%s歌单的所有评论!!!!!' % list_id) url1 = 'https://music.163.com/playlist?id=' + list_id url = 'https://music.163.com/weapi/v1/resource/comments/A_PL_0_' + list_id + '?csrf_token=' page = 1 params, encSecKey = self.get_params(page) data = {'params': params, 'encSecKey': encSecKey} # 获取第一页评论 try: html, total = self.get_comments_json(url, data) # 评论总数 if html is None: continue if 'comments' in html.keys(): if html['comments'] is None: try: requests.get(url1, headers=self.headers) except: pass html, total = self.get_comments_json(url, data) if html is None: continue except Exception as e: print('此歌单: %s, 评论爬取失败!原因:%s' % (list_id, e)) continue # 总页数 pages = math.ceil(total / 20) try: self.hot_comments(html, list_id, pages, total, creater_id) except Exception as e: print('此歌单: %s, 热门评论爬取失败!原因:%s' % (list_id, e)) try: self.comments(html, list_id, page, pages, total, creater_id) except Exception as e: print('此歌单: %s, 第一页普通评论爬取失败!原因:%s' % (list_id, e)) # 开始获取歌曲的全部评论 page = 2 reverse = False # 若请求的评论结果为空,则从最后评论页向前爬取 while True: if page == 0: break params, encSecKey = self.get_params(page) data = {'params': params, 'encSecKey': encSecKey} html, total = self.get_comments_json(url, data) # 从后向前已经把可请求的评论页请求完成,则跳出循环 if reverse is True and len(html['comments']) == 0: break if len(html['comments']) == 0: reverse = True page = pages print('开始倒序爬取!') continue # 从第二页开始获取评论 try: self.comments(html, list_id, page, pages, total, creater_id) except Exception as e: print('此歌单: %s, 第%d页普通评论爬取失败!原因:%s' % (list_id, page, e)) print('重新爬取!') if 'total' in str(e): for i in range(90000): print('\r IP可能被封,需要等待' + str(90000 - i) + '秒...', sep=' ', end='', flush=True) time.sleep(1) elif 'comments' in str(e): for i in range(10000): print('\r IP可能被封,需要等待' + str(10000 - i) + '秒...', sep=' ', end='', flush=True) time.sleep(1) else: continue if reverse is False: page += 1 else: page -= 1 # 如果爬取完成,则跳出循环 if page > pages: break # 连接wyy_spider数据库 def conn_data(self): while True: print('连接到mysql服务器...') try: conn = pymysql.connect( host='localhost', user='******', passwd='0321', port=3306, db='wyy_spider', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor ) cursor = conn.cursor() print('wyy_spider连接上了!') return conn, cursor except: print('wyy_spider连接失败!') time.sleep(2) # 从musicList.csv中获取任务 def sql_task(self): conn, cursor = self.conn_data() data = self.listid.loc[:, ['list_id', 'user_id']] num = 0 for listId, userId in zip(data['list_id'], data['user_id']): sql = "select user_id from list_comment where list_id=%s limit 1" % listId cursor.execute(sql) music_ids = cursor.fetchall() if len(music_ids) == 0: print('开始爬取ID为 %s 的歌单评论...' 
% listId) num += 1 else: num += 1 print('===' * 10, 'id为%s的歌单爬取完成, 第%d个歌单' % (listId, num), '===' * 10) continue if num >= self.num: list_id = listId.strip() creater_id = userId.strip() self.task_queue.put([list_id, creater_id]) time.sleep(15) print('===' * 10, 'id为%s的歌单爬取完成, 第%d个歌单' % (listId, num), '===' * 10) # 评论保存至数据库 def save_result(self): while True: comment = self.save_queue.get() if self.conn_result is False: self.result_conn() try: self.mysqlResult.insert_list_comm(comment) except: self.conn_result = False # 评论人保存至数据库 def save_user(self): while True: comment_user = self.save_user_queue.get() if self.conn_user is False: self.user_conn() try: self.mysqlUser.insert_co_user(comment_user) except: self.conn_user = False def spider_main(self): # Thread(target=self.page_spider, args=()).start() # Thread(target=self.page_spider, args=()).start() # Thread(target=self.page_spider, args=()).start() Thread(target=self.page_spider, args=()).start() Thread(target=self.save_result, args=()).start() Thread(target=self.save_user, args=()).start() self.sql_task()
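# check_headers above rotates through a cookie pool read from ../data/cookie.txt, one
# JSON object per line with a "cookie" field. A sketch of producing such a file
# (the cookie value is a placeholder, not a real session):
def _demo_write_cookie_pool(path='../data/cookie.txt'):
    import json
    with open(path, 'a') as fp:
        fp.write(json.dumps({'cookie': 'JSESSIONID-WYYY=placeholder; _iuqxldmzr_=32'}) + '\n')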
class DataToCSV(): def __init__(self): self.music_queue = mp.Queue() self.user_queue = mp.Queue() self.list_queue = mp.Queue() self.singer_queue = mp.Queue() self.comment_queue = mp.Queue() self.music = False self.user = False self.list = False self.comm = False self.singer = False self.list_comm = False # 重连数据库 def conn_music(self): self.mysqlMusic = MySQLCommand() self.mysqlMusic.connectdb() self.music = True def conn_list(self): self.mysqlList = MySQLCommand() self.mysqlList.connectdb() self.list = True def conn_user(self): self.mysqlUser = MySQLCommand() self.mysqlUser.connectdb() self.user = True def conn_comm(self): self.mysqlComment = MySQLCommand() self.mysqlComment.connectdb() self.comm = True def conn_list_comm(self): self.mysqlLcomm = MySQLCommand() self.mysqlLcomm.connectdb() self.list_comm = True def conn_singer(self): self.mysqlSinger = MySQLCommand() self.mysqlSinger.connectdb() self.singer = True # def sql_music(self): # while True: # if self.music is False: # self.conn_music() # self.music = True # try: # self.mysqlMusic.cursor.execute("select * from music") # music_ids = self.mysqlMusic.cursor.fetchall() # break # except: # self.music = False # with open("music.csv", "w") as csvfile: # writer = csv.writer(csvfile) # # 先写入columns_name # writer.writerow(["歌曲ID", "歌名", "歌手ID", "专辑ID", "包含这首歌的歌单ID", "相似歌曲ID", "歌词", "歌词贡献者ID", "评论数"]) # # 写入多行用writerows # for id in music_ids: # # music_id = id.get('music_id') # singer_id = id.get('singer_id') # music_name = id.get('music_name') # album_id = id.get('album_id') # contain_list = id.get('contain_list') # simple_music = id.get('simple_music') # song_lynic = id.get('song_lynic') # lynic_user = id.get('lynic_user') # if song_lynic == '' or song_lynic is None: # if random.randint(1, 10) >= 3: # continue # comment_num = id.get('comment_num') # writer.writerow([music_id, music_name, singer_id, album_id, contain_list, simple_music, song_lynic, lynic_user, comment_num]) def sql_list(self): if self.list is False: self.conn_list() self.list = True self.mysqlList.cursor.execute("select * from music_list") list_ids = self.mysqlList.cursor.fetchall() num = 0 with open("music_list.csv", "w") as csvfile: writer = csv.writer(csvfile) # 先写入columns_name writer.writerow([ "list_id", "list_name", "list_user_id", "tags", "e_tags", "create_time", "update_time", "authority", "music_count", "play_count", "special_category", "subscription_count", "cloud_count", "music_count_update_time", "music_update_time", "is_quality", "list_describe", "like_user", "music_ids", "hot_list" ]) # 写入多行用writerows for id in list_ids: if id is None: continue list_id = id.get('id') musicId = id.get('musicId') if musicId is None: continue list_name = id.get('name') user_id = id.get('userId') createTime = id.get('createTime') updateTime = id.get('updateTime') description = id.get('description') trackCount = id.get('trackCount') authority = id.get('authority') playCount = id.get('playCount') specialType = id.get('specialType') expertTags = id.get('expertTags') tags = id.get('tags') if len(str(tags).strip()) < 5: continue num += 1 subscribedCount = id.get('subscribedCount') cloudTrackCount = id.get('cloudTrackCount') trackUpdateTime = id.get('trackUpdateTime') trackNumberUpdateTime = id.get('trackNumverUpdateTime') highQuality = id.get('highQuality') userLikeId = id.get('userLikeId') hotlist = id.get('hotlist') writer.writerow([ list_id, list_name, user_id, tags, expertTags, createTime, updateTime, authority, trackCount, playCount, specialType, subscribedCount, cloudTrackCount, 
trackNumberUpdateTime, trackUpdateTime, highQuality, description, userLikeId, musicId, hotlist ]) print(num) # # def sql_singer(self): # if self.singer is False: # self.conn_singer() # self.singer = True # self.mysqlSinger.cursor.execute("select * from singer") # singer_ids = self.mysqlSinger.cursor.fetchall() # with open("singer.csv", "w") as csvfile: # writer = csv.writer(csvfile) # # 先写入columns_name # writer.writerow(["歌手ID", "歌手名", "歌手主页ID", "歌手主页中的歌曲"]) # # 写入多行用writerows # for id in singer_ids: # artist_id = id.get('artist_id') # artist_name = id.get('artist_name') # homepage_id = id.get('homepage_id') # top50 = id.get('top50_song_dict') # if top50 == '' or top50 is None: # if random.randint(1, 10) >= 3: # continue # writer.writerow([artist_id, artist_name, homepage_id, top50]) # # def sql_user(self): # if self.user is False: # self.conn_user() # self.user = True # self.mysqlUser.cursor.execute("select * from user limit 0, 600000") # user_ids = self.mysqlUser.cursor.fetchall() # with open("user.csv", "w") as csvfile: # writer = csv.writer(csvfile) # # 先写入columns_name # writer.writerow(['用户ID', '用户昵称', '性别', '省份', '城市', '生日', '描述信息', '详细描述信息', '专家标签', '个性签名', # '用户类型', 'vip类型', '关注量', '粉丝量', '动态量', '创建的歌单数', '用户创建时间', '所有歌单ID', '本周听过', '以前听过', '听过的歌曲数']) # # 写入多行用writerows # for id in user_ids: # user_id = id.get('userId') # nickname = id.get('nickname') # province = id.get('province') # city = id.get('city') # birthday = id.get('birthday') # detailDescription = id.get('detailDescription') # description = id.get('description') # expertTags = id.get('expertTags') # signature = id.get('signature') # userType = id.get('userType') # vipType = id.get('vipType') # list_id = id.get('list_id') # eventCount = id.get('eventCount') # followeds = id.get('followeds') # follows = id.get('follows') # gender = id.get('gender') # playlistCount = id.get('playlistCount') # time = id.get('time') # week_music = id.get('week_music') # all_music = id.get('all_music') # listen_num = id.get('listen_num') # if province == '' or province is None: # if random.randint(1, 10) >= 3: # continue # if week_music == '' or week_music is None: # if random.randint(1, 10) >= 9: # continue # writer.writerow([user_id, nickname, gender, province, city, birthday, description, detailDescription, expertTags, # signature, userType, vipType, follows, followeds, eventCount, playlistCount, time, list_id, week_music, all_music, listen_num]) # # def sql_comments(self): # if self.comm is False: # self.conn_comm() # self.comm = True # self.mysqlComment.cursor.execute("select * from comments limit 100000, 600000") # comment_ids = self.mysqlComment.cursor.fetchall() # with open("comment.csv", "w") as csvfile: # writer = csv.writer(csvfile) # # 先写入columns_name # writer.writerow(['歌曲ID', '用户ID', '歌手ID', '评论时间', '是否为热门评论', '点赞量', '评论', '回复']) # # 写入多行用writerows # for id in comment_ids: # # music_id = id.get('music_id') # user_id = id.get('user_id') # hot_comment = id.get('hot_comment') # comment = id.get('comment') # likedCount = id.get('likedCount') # time = id.get('time') # singer_id = id.get('singer_id') # reply = id.get('reply') # writer.writerow([music_id, user_id, singer_id, time, hot_comment, likedCount, comment, reply]) # # def sql_list_comments(self): # if self.list_comm is False: # self.conn_list_comm() # self.list_comm = True # self.mysqlLcomm.cursor.execute("select * from list_comment limit 0, 600000") # comment_ids = self.mysqlLcomm.cursor.fetchall() # with open("list_comment.csv", 'w') as csvfile: # writer = csv.writer(csvfile) 
# writer.writerow(['歌单ID', '创建者ID', '评论者ID', '评论时间', '是否为热门评论', '点赞量', '评论', '回复']) # for id in comment_ids: # list_id = id.get('list_id') # user_id = id.get('user_id') # hot_comment = id.get('hot_comment') # likedCount = id.get('likedCount') # time = id.get('time') # comment = id.get('comment') # creater_id = id.get('creater_id') # reply = id.get('reply') # writer.writerow([list_id, creater_id, user_id, time, hot_comment, likedCount, comment, reply]) def execute_main(self): # Thread(target=self.sql_singer, args=()).start() # Thread(target=self.sql_comments, args=()).start() Thread(target=self.sql_list, args=()).start()
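# execute_main above only starts the sql_list export thread; the other exporters stay
# commented out. A quick read-back sketch for the CSV it writes, assuming pandas
# (already used elsewhere in this project):
def _demo_check_export(path='music_list.csv'):
    import pandas as pd
    df = pd.read_csv(path)
    return df[['list_id', 'list_name', 'play_count', 'tags']].head()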
def save_sql(self):
    mysql_command = MySQLCommand()
    mysql_command.connectdb()
    while True:
        ids = self.id_queue.get()
        mysql_command.insert_list(ids)
class CommSpider(object): def __init__(self): self.headers = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Host': 'music.163.com', 'Origin': 'http://music.163.com', 'Cookie': '_iuqxldmzr_=32; _ntes_nnid=0e6e1606eb78758c48c3fc823c6c57dd,1527314455632; ' '_ntes_nuid=0e6e1606eb78758c48c3fc823c6c57dd; __utmc=94650624; __utmz=94650624.1527314456.1.1.' 'utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); WM_TID=blBrSVohtue8%2B6VgDkxOkJ2G0VyAgyOY;' ' JSESSIONID-WYYY=Du06y%5Csx0ddxxx8n6G6Dwk97Dhy2vuMzYDhQY8D%2BmW3vlbshKsMRxS%2BJYEnvCCh%5CKY' 'x2hJ5xhmAy8W%5CT%2BKqwjWnTDaOzhlQj19AuJwMttOIh5T%5C05uByqO%2FWM%2F1ZS9sqjslE2AC8YD7h7Tt0Shufi' '2d077U9tlBepCx048eEImRkXDkr%3A1527321477141; __utma=94650624.1687343966.1527314456.1527314456' '.1527319890.2; __utmb=94650624.3.10.1527319890', 'Referer': 'http://music.163.com/', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 'Content-Type': 'application/x-www-form-urlencoded' } self.host_path = '../data/host.txt' self.cookie_path = '../data/cookie.txt' self.ip_queue = Queue() self.save_queue = Queue() # 结果队列 self.task_queue = Queue() # 任务队列 self.save_user_queue = Queue() # 评论人队列 self.ip_pool = [] # ip代理池 self.conn_task = False self.conn_result = False self.conn_user = False self.prosiex_start = True # 是否启动代理IP爬取线程 # 重连数据库 def task_conn(self): self.mysqlCommand = MySQLCommand() self.mysqlCommand.connectdb() self.conn_task = True time.sleep(1) def result_conn(self): self.mysqlResult = MySQLCommand() self.mysqlResult.connectdb() self.conn_result = True time.sleep(1) def user_conn(self): self.mysqlUser = MySQLCommand() self.mysqlUser.connectdb() self.conn_user = True time.sleep(1) def check_headers(self): cookie_list = [] with open(self.cookie_path, 'r') as fp: for i in fp.readlines(): i = json.loads(i) cookie_list.append(i) self.headers['Cookie'] = random.choice(cookie_list)['cookie'] # 检查代理IP是否可用 def check_ip(self, proxies): try: header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/64.0.3282.186 Safari/537.36' } ip = '://' + proxies['ip'] + ':' + proxies['port'] proxies = {'https': 'https' + ip} url = 'https://www.ipip.net/' r = requests.get(url, headers=header, proxies=proxies, timeout=5) r.raise_for_status() except: return False else: print(proxies, '检查通过!') return True # 生成IP代理 def ip_proxies(self): api = 'http://www.xicidaili.com/wn/{}' header = { 'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTZlOTVjNGQ1MmUxMDlmNzhlNjkwMDU3MDUxMTQ4YTUwBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUpRcU9ySVRNcmlOTytuNm9ZWm53RUFDYzhzTnZCbGlNa0ZIaHJzancvZEU9BjsARg%3D%3D--742b1937a06cc747483cd594752ef2ae80fc4d91; Hm_lvt_0cf76c77469e965d2957f0553e6ecf59=1577952296; Hm_lpvt_0cf76c77469e965d2957f0553e6ecf59=1578016572', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/' '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 'Host': 'www.xicidaili.com', 'Connection': 'keep-alive', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Cache-Control': 'no-cache' } fp = open(self.host_path, 'a+', encoding=('utf-8')) self.ip_pool = [] for i in range(20): api = api.format(1) respones = requests.get(url=api, headers=header) time.sleep(3) 
soup = BeautifulSoup(respones.text, 'html.parser') container = soup.find_all(name='tr', attrs={'class': 'odd'}) for tag in container: try: con_soup = BeautifulSoup(str(tag), 'html.parser') td_list = con_soup.find_all('td') ip = str(td_list[1])[4:-5] port = str(td_list[2])[4:-5] _type = td_list[5].text IPport = {'ip': ip, 'port': port, 'type': _type.lower()} if self.check_ip(IPport): IPport = json.dumps(IPport) self.ip_pool.append(IPport) fp.write(IPport) fp.write('\n') self.ip_queue.put(IPport) except Exception as e: print('No IP!') if self.prosiex_start is False: break fp.close() # 从host.txt中读取代理 def ip_txt(self): print('IP代理爬取不够,从host.txt中添加...') with open(self.host_path, 'r') as fp: ip_port = fp.readlines() for i in ip_port: self.ip_pool.append(i) self.ip_queue.put(i) # 生成16个随机字符 def generate_random_strs(self, length): string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" # 控制次数参数i i = 0 # 初始化随机字符串 random_strs = "" while i < length: e = random.random() * len(string) # 向下取整 e = math.floor(e) random_strs = random_strs + list(string)[e] i = i + 1 return random_strs # AES加密 def AESencrypt(self, msg, key): # 如果不是16的倍数则进行填充(paddiing) padding = 16 - len(msg) % 16 # 这里使用padding对应的单字符进行填充 msg = msg + padding * chr(padding) # 用来加密或者解密的初始向量(必须是16位) iv = '0102030405060708' cipher = AES.new(key, AES.MODE_CBC, iv) # 加密后得到的是bytes类型的数据 encryptedbytes = cipher.encrypt(msg) # 使用Base64进行编码,返回byte字符串 encodestrs = base64.b64encode(encryptedbytes) # 对byte字符串按utf-8进行解码 enctext = encodestrs.decode('utf-8') return enctext # RSA加密 def RSAencrypt(self, randomstrs, key, f): # 随机字符串逆序排列 string = randomstrs[::-1] # 将随机字符串转换成byte类型数据 text = bytes(string, 'utf-8') seckey = int(codecs.encode(text, encoding='hex'), 16)**int( key, 16) % int(f, 16) return format(seckey, 'x').zfill(256) # 获取参数 def get_params(self, page): # msg也可以写成msg = {"offset":"页面偏移量=(页数-1) * 20", "limit":"20"},offset和limit这两个参数必须有(js) # limit最大值为100,当设为100时,获取第二页时,默认前一页是20个评论,也就是说第二页最新评论有80个,有20个是第一页显示的 # msg = '{"rid":"R_SO_4_1302938992","offset":"0","total":"True","limit":"100","csrf_token":""}' # 偏移量 offset = (page - 1) * 20 # offset和limit是必选参数,其他参数是可选的,其他参数不影响data数据的生成 msg = '{"offset":' + str( offset) + ',"total":"True","limit":"20","csrf_token":""}' key = '0CoJUm6Qyw8W8jud' f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7' e = '010001' enctext = self.AESencrypt(msg, key) # 生成长度为16的随机字符串 i = self.generate_random_strs(16) # 两次AES加密之后得到params的值 encText = self.AESencrypt(enctext, i) # RSA加密之后得到encSecKey的值 encSecKey = self.RSAencrypt(i, e, f) return encText, encSecKey # 使用代理爬取 def ip_spider(self, url, data): repeat = 0 while repeat < 50: proxies = self.ip_queue.get() proxies = json.loads(proxies) ip = '://' + proxies['ip'] + ':' + proxies['port'] proxies = {'https': 'https' + ip} print('使用的代理IP为:', proxies) try: r = requests.post(url, headers=self.headers, data=data, proxies=proxies) time.sleep(2) try: r.encoding = 'utf-8' result = r.json() except Exception as e: print('错误:', e) return r, None if 'code' in result.keys(): if result['code'] == -460: repeat += 1 print('%r的IP代理不可用, 访问URL为%s的网页失败!原因是%s, 重试第%d次' % (proxies, url, result, repeat + 1)) if 'total' in result.keys(): total = result['total'] print('result: ', result) return result, total except Exception as e: print('IP代理为%r, 访问URL为%s的网页失败!原因是%s, 重试第%d次' % (proxies, 
url, e, repeat + 1)) repeat += 1 print('返回的是none') return None, None def get_comments_json(self, url, data): repeat = 0 while repeat < 4: try: r = requests.post(url, headers=self.headers, data=data) time.sleep(repeat + 2) r.encoding = "utf-8" if r.status_code == 200: # 返回json格式的数据 result = r.json() if 'total' in result.keys(): total = result['total'] repeat = 0 self.ip_pool = [] return result, total elif 'code' in result.keys(): if result['code'] == -460: if repeat < 3: self.check_headers() else: if len(self.ip_pool) < 10: Thread(target=self.ip_proxies, args=()).start() if len(self.ip_pool) < 10: self.ip_txt() result, total = self.ip_spider(url, data) if result is None: self.prosiex_start = False for i in range(90000): print('\r IP可能被封,代理IP不可用!需要等待' + str(90000 - i) + '秒...', sep=' ', end='', flush=True) time.sleep(1) self.prosiex_start = True else: self.prosiex_start = True return result, total repeat += 1 except: time.sleep(1) repeat += 1 print("第%d次爬取url为%s 的页面失败!正重新尝试..." % (repeat, url)) return None, None # 数据正则处理 def re_value(self, value): value = re.sub(r'\r|\n|\\|\'|\{|\}|\"', ' ', value) return value # 获取热门评论 def hot_comments(self, html, song_id, pages, total, singer_id): print("正在获取歌曲{}的热门评论,总共有{}页{}条评论!".format(song_id, pages, total)) if 'hotComments' in html: for item in html['hotComments']: # 提取发表热门评论的用户名 user = item['user'] if item['content'] is not None: comment = self.re_value(item['content']) else: comment = '' # 写入文件 hot_comment = { 'hot_comment': '1', 'user_id': str(user['userId']).strip(), 'comment': comment, 'likedCount': str(item['likedCount']), 'time': str(item['time']), 'music_id': song_id, 'singer_id': singer_id } self.save_user_queue.put(str(user['userId']).strip()) # 回复评论 reply_comment = [] if len(item['beReplied']) != 0: for reply in item['beReplied']: # 提取发表回复评论的用户名 reply_user = reply['user'] if reply['content'] is not None: content = self.re_value(reply['content']) else: content = '' reply_comment.append({ 'user_id': str(reply_user['userId']).strip(), 'content': content }) self.save_user_queue.put( str(reply_user['userId']).strip()) hot_comment['reply'] = str(reply_comment) self.save_queue.put(hot_comment) # 获取普通评论 def comments(self, html, song_id, i, pages, total, singer_id): print("正在获取歌曲{}的第{}页评论,总共有{}页{}条评论!".format(song_id, i, pages, total)) # 全部评论 for item in html['comments']: # 提取发表评论的用户名 user = item['user'] if item['content'] is not None: comment = self.re_value(item['content']) else: comment = '' comment = { 'hot_comment': '0', 'user_id': str(user['userId']).strip(), 'comment': comment, 'likedCount': str(item['likedCount']), 'time': str(item['time']), 'music_id': song_id, 'singer_id': singer_id } self.save_user_queue.put(str(user['userId'])) # 回复评论 reply_comment = [] if len(item['beReplied']) != 0: for reply in item['beReplied']: # 提取发表回复评论的用户名 reply_user = reply['user'] if reply['content'] is not None: content = self.re_value(reply['content']) else: content = '' reply_comment.append({ 'user_id': str(reply_user['userId']).strip(), 'content': content }) self.save_user_queue.put(str(reply_user['userId'])) comment['reply'] = str(reply_comment) self.save_queue.put(comment) return True def page_spider(self): while True: songid, singer_id = self.task_queue.get() print('开始爬取ID为%s歌曲的所有评论!!!!!' 
% songid) url1 = 'https://music.163.com/song?id=' + songid url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_' + songid + '?csrf_token=' page = 1 params, encSecKey = self.get_params(page) data = {'params': params, 'encSecKey': encSecKey} self.headers[ 'Referer'] = 'https://music.163.com/song?id=%s' % songid # 获取第一页评论 try: html, total = self.get_comments_json(url, data) # 评论总数 if html is None: continue if 'comments' in html.keys(): if html['comments'] is None: try: requests.get(url1, headers=self.headers) time.sleep(2) except: pass html, total = self.get_comments_json(url, data) if html is None: continue except Exception as e: print('此歌曲: %s, 评论爬取失败!原因:%s' % (songid, e)) if 'total' in str(e): for i in range(90000): print('\r IP可能被封,需要等待' + str(90000 - i) + '秒...', sep=' ', end='', flush=True) time.sleep(1) else: continue continue # 总页数 pages = math.ceil(total / 20) try: self.hot_comments(html, songid, pages, total, singer_id) except Exception as e: print('此歌曲: %s, 热门评论爬取失败!原因:%s' % (songid, e)) try: self.comments(html, songid, page, pages, total, singer_id) except Exception as e: print('此歌曲: %s, 第一页普通评论爬取失败!原因:%s' % (songid, e)) # 开始获取歌曲的全部评论 page = 2 reverse = False # 若请求的评论结果为空,则从最后评论页向前爬取 while True: if page == 0: break params, encSecKey = self.get_params(page) data = {'params': params, 'encSecKey': encSecKey} html, total = self.get_comments_json(url, data) # 从后向前已经把可请求的评论页请求完成,则跳出循环 if reverse is True and len(html['comments']) == 0: break # 从第二页到后可请求的评论已请求完,则从后向前请求 if len(html['comments']) == 0: reverse = True page = pages continue try: self.comments(html, songid, page, pages, total, singer_id) except Exception as e: print('此歌曲: %s, 第%d页普通评论爬取失败!原因:%s' % (songid, page, e)) print('重新爬取!') if 'total' in str(e): for i in range(90000): print('\r IP可能被封,需要等待' + str(90000 - i) + '秒...', sep=' ', end='', flush=True) time.sleep(1) elif 'comments' in str(e): for i in range(10000): print('\r IP可能被封,需要等待' + str(10000 - i) + '秒...', sep=' ', end='', flush=True) time.sleep(1) else: continue if reverse is False: page += 1 else: page -= 1 # 如果爬取完成,则跳出循环 if page > pages: break print('==' * 20, '%s====歌====曲====爬====取====完====成' % songid, '==' * 20) # 连接wyy_spider数据库 def conn_data(self): while True: print('连接到mysql服务器...') try: conn = pymysql.connect(host='localhost', user='******', passwd='0321', port=3306, db='wyy_spider', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) cursor = conn.cursor() print('wyy_spider连接上了!') return conn, cursor except: print('wyy_spider连接失败!') time.sleep(2) # 从数据库获取任务 def sql_task(self): conn, cursor = self.conn_data() cursor.execute("select music_id, singer_id from music limit 20,100") music_ids = cursor.fetchall() for id in music_ids: if id is None: continue try: music_id = id.get('music_id').strip() singer_id = id.get('singer_id').strip() except: continue self.task_queue.put([music_id, singer_id]) # 评论保存至数据库 def save_result(self): while True: comment = self.save_queue.get() if self.conn_result is False: self.result_conn() try: self.mysqlResult.insert_comments(comment) except: self.conn_result = False # 评论人保存至数据库 def save_user(self): while True: comment_user = self.save_user_queue.get() if self.conn_user is False: self.user_conn() try: self.mysqlUser.insert_co_user(comment_user) except: self.conn_user = False def spider_main(self): # Thread(target=self.page_spider, args=()).start() # Thread(target=self.page_spider, args=()).start() # Thread(target=self.page_spider, args=()).start() Thread(target=self.page_spider, args=()).start() 
Thread(target=self.save_result, args=()).start() Thread(target=self.save_user, args=()).start() self.sql_task()
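# A minimal entry-point sketch for CommSpider, assuming the music table already holds
# the music_id/singer_id rows that sql_task hands to page_spider:
if __name__ == '__main__':
    comm_spider = CommSpider()
    comm_spider.spider_main()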