def save_music_by_api(self, album_id):
    """Fetch an album's songs via the NetEase album API and persist them.

    Skips albums already recorded in the redis de-dup cache; on a
    successful (code == 200) response, marks the album as crawled and
    inserts each of its songs into the database. One failed insert is
    logged and skipped so the rest of the album still gets saved.
    """
    url = "http://music.163.com/api/album/" + str(album_id)
    # Skip albums already crawled (redis de-dup cache).
    check = redis_util.checkIfRequest(redis_util.musicPrefix, str(album_id))
    if check:
        print("url:", url, "has request. pass")
        time.sleep(1)
        return
    # Rotate the User-Agent to reduce the chance of being rate-limited.
    self.headers["User-Agent"] = random.choice(agents)
    r = requests.get(url, headers=self.headers)
    # Parse the JSON API response.
    album_json = json.loads(r.text)
    if album_json['code'] == 200:
        # Mark as crawled only after a successful API response.
        redis_util.saveUrl(redis_util.musicPrefix, str(album_id))
    else:
        print(url, " request error :", album_json)
        return
    for item in album_json.get('album').get('songs'):
        music_id = item['id']
        music_name = item['name']
        try:
            sql.insert_music(music_id, music_name, album_id)
        except Exception as e:
            # Log and continue: one bad row must not abort the whole album.
            print(music_id, music_name, album_id, ' insert db error: ', str(e))
            # traceback.print_exc()
    time.sleep(1)
def get_record(user_id):
    """Scrape a NetEase user's all-time listening ranking.

    Opens the user's song-rank page in a headless Chrome, switches to the
    "all time" tab, parses every (song, artist) entry, inserts each into
    the database, and returns the parsed BeautifulSoup page.
    """
    url_recd = 'https://music.163.com/#/user/songs/rank?id=' + user_id
    option = webdriver.ChromeOptions()
    option.headless = True  # do not pop up a visible browser window
    driver = webdriver.Chrome(chrome_options=option)
    song = {}
    # try/finally guarantees the browser is released even if any step of
    # the navigation or parsing below raises (the original leaked the
    # chromedriver process in that case).
    try:
        driver.get(url_recd)
        # The ranking is rendered inside the g_iframe frame; switch into it.
        driver.switch_to.frame('g_iframe')
        driver.implicitly_wait(10)
        # On first load a "search mv" tooltip covers the #songsall button,
        # so maximize and wait for the page to settle before clicking.
        driver.maximize_window()
        time.sleep(0.5)
        # Switch to "all time"; comment this .click() line out to keep the
        # default "last week" ranking instead.
        driver.find_element_by_id("songsall").click()
        driver.implicitly_wait(10)
        time.sleep(0.5)  # the tab switch still needs a moment to render
        # The page is now static; snapshot it and parse with bs4.
        htmlrec = driver.page_source
        pagerec = BeautifulSoup(htmlrec, 'html.parser')
        allrec = pagerec.find(class_="g-wrap p-prf").find(
            class_="m-record").find(class_="j-flag").find_all('li')
        try:
            for i in allrec:
                # Artist text; strip the '-' separators between names.
                nickname = i.find(class_="s-fc8").text.replace('-', '')
                music = i.find(class_="txt").find('a')
                music_name = music.getText()
                music_id = music['href'].replace('/song?id=', '')
                song[music_name] = music_id
                try:
                    sql.insert_music(user_id, music_id, music_name, nickname)
                except Exception as e:
                    # Log and continue: one bad row must not abort the scrape.
                    print(' insert db error: ', str(e))
                    # traceback.print_exc()
        except Exception as e:
            # Unexpected parse failure; report and fall through to cleanup.
            print('unknown error: ', str(e))
    finally:
        driver.close()  # always release the browser
    return pagerec
def save_music_by_api(self, playlist_id):
    """Fetch a playlist's tracks via the NetEase playlist API and persist them.

    Retries the HTTP request per the connection settings, logs and bails
    out on proxy or API failure, and inserts every track into the database
    under the shared connection lock.
    """
    url = "http://music.163.com/api/playlist/detail?id=" + str(playlist_id)
    # Rotate the User-Agent to reduce the chance of being rate-limited.
    self.headers["User-Agent"] = random.choice(agents)

    @retrying.retry(
        stop_max_attempt_number=settings.connect["max_retries"],
        wait_fixed=settings.connect["interval"])
    def get():
        return requests.get(url,
                            headers=self.headers,
                            proxies=proxy.proxy,
                            timeout=settings.connect["timeout"])

    try:
        r = get()
    except Exception as e:
        # All retries exhausted: the proxy connection failed.
        logger.critical("代理连接失败" + str(e))
        return
    # Parse the JSON API response.
    playlist_json = json.loads(r.text)
    if playlist_json['code'] != 200:
        logger.error("{} request error :{}".format(url, playlist_json))
        return
    for item in playlist_json.get('result').get('tracks'):
        music_id = item['id']
        music_name = item['name']
        # Acquire the lock *before* entering the try: if acquire() itself
        # raised inside the try, the finally would release a lock that was
        # never held.
        sql.conn_lock.acquire()
        try:
            sql.insert_music(music_id, music_name, playlist_id)
        except Exception as e:
            # Log and continue: one bad row must not abort the playlist.
            logger.debug(' insert db error: ' + str(e))
            # traceback.print_exc()
        finally:
            sql.conn_lock.release()
def save_music_by_api(self, album_id):
    """Fetch an album's songs via the NetEase album API and persist them.

    Retries the HTTP request per the connection settings, logs and bails
    out on proxy or API failure, and inserts every song into the database
    under the shared connection lock. Redis de-dup was abandoned here.
    """
    url = "http://music.163.com/api/album/" + str(album_id)
    # Rotate the User-Agent to reduce the chance of being rate-limited.
    self.headers["User-Agent"] = random.choice(agents)

    @retrying.retry(stop_max_attempt_number=settings.connect["max_retries"],
                    wait_fixed=settings.connect["interval"])
    def get():
        return requests.get(url,
                            headers=self.headers,
                            proxies=proxy.proxy,
                            timeout=settings.connect["timeout"])

    try:
        r = get()
    except Exception as e:
        # All retries exhausted: the proxy connection failed.
        logger.critical("代理连接失败" + str(e))
        return
    # Parse the JSON API response.
    album_json = json.loads(r.text)
    if album_json['code'] == 200:
        # Redis de-dup cache was deliberately abandoned for this path.
        # redis_util.saveUrl(redis_util.albumPrefix, str(album_id))
        pass
    else:
        logger.error("{} api请求错误: {}".format(url, album_json))
        return
    for item in album_json.get('album').get('songs'):
        music_id = item['id']
        music_name = item['name']
        # Acquire the lock *before* entering the try: if acquire() itself
        # raised inside the try, the finally would release a lock that was
        # never held.
        sql.conn_lock.acquire()
        try:
            sql.insert_music(music_id, music_name, album_id)
        except Exception as e:
            # Log and continue: one bad row must not abort the album.
            logger.debug(' insert db error: ' + str(e))
            # traceback.print_exc()
        finally:
            sql.conn_lock.release()
def save_music(self, album_id):
    """Scrape an album's web page and persist all of its songs.

    Fetches https://music.163.com/album?id=<album_id>, parses the hidden
    song list (<ul class="f-hide">), and inserts each song into the
    database. One failed insert is logged and skipped.
    """
    params = {'id': album_id}
    # Rotate the User-Agent to reduce the chance of being rate-limited.
    self.headers["User-Agent"] = random.choice(agents)
    r = requests.get('https://music.163.com/album',
                     headers=self.headers,
                     params=params)
    # Parse the page HTML.
    soup = BeautifulSoup(r.content.decode(), 'html.parser')
    song_list = soup.body.find('ul', attrs={'class': 'f-hide'})
    # The song list is absent on error pages (e.g. anti-crawler block);
    # the original crashed with AttributeError here.
    if song_list is None:
        return
    musics = song_list.find_all('li')  # all songs of the album
    if not musics:
        return
    for music in musics:
        link = music.find('a')
        music_id = link['href'].replace('/song?id=', '')
        music_name = link.getText()
        try:
            sql.insert_music(music_id, music_name, album_id)
        except Exception as e:
            # The original did `music + '...'` (bs4 Tag + str), which raised
            # TypeError inside the handler and masked the real db error;
            # pass the tag as a separate print argument instead.
            print(link, ' insert db error: ', str(e))
    time.sleep(5)
def save_music(self, album_id):
    """Scrape an album's web page and persist its songs, with redis de-dup.

    Skips albums whose URL is already in the redis de-dup cache; otherwise
    fetches the page, marks the URL as crawled, parses the hidden song
    list (<ul class="f-hide">), and inserts each song into the database.
    """
    params = {'id': album_id}
    # Rotate the User-Agent to reduce the chance of being rate-limited.
    self.headers["User-Agent"] = random.choice(agents)
    url = 'https://music.163.com/album?id=' + str(album_id)
    # Skip albums already crawled (redis de-dup cache).
    check = redis_util.checkIfRequest(redis_util.musicPrefix, url)
    if check:
        print("url:", url, "has request. pass")
        time.sleep(1)
        return
    r = requests.get('https://music.163.com/album',
                     headers=self.headers,
                     params=params)
    # Parse the page HTML.
    soup = BeautifulSoup(r.content.decode(), 'html.parser')
    # Mark the URL as crawled (recorded before parsing, as before).
    redis_util.saveUrl(redis_util.musicPrefix, url)
    song_list = soup.body.find('ul', attrs={'class': 'f-hide'})
    # The song list is absent on error pages (e.g. anti-crawler block);
    # the original crashed with AttributeError here.
    if song_list is None:
        return
    musics = song_list.find_all('li')  # all songs of the album
    if not musics:
        return
    for music in musics:
        link = music.find('a')
        music_id = link['href'].replace('/song?id=', '')
        music_name = link.getText()
        try:
            sql.insert_music(music_id, music_name, album_id)
        except Exception as e:
            # Log and continue: one bad row must not abort the album.
            print(link, ' insert db error: ', str(e))
            # traceback.print_exc()
    time.sleep(1)