コード例 #1
0
 def save_music_by_api(self, album_id):
     """Fetch an album from the music.163.com album API and store its songs.

     The album id is first checked with ``redis_util.checkIfRequest``; when
     that reports a hit, the request is skipped. After a response with
     ``code == 200`` the id is recorded with ``redis_util.saveUrl`` and each
     song is passed to ``sql.insert_music``. A failed insert is printed and
     does not stop the remaining songs from being processed.

     Args:
         album_id: Album identifier; converted with ``str()`` for both the
             URL and the redis key.

     Returns:
         None.
     """
     url = "http://music.163.com/api/album/" + str(album_id)
     # Ask redis whether this album has already been crawled.
     if redis_util.checkIfRequest(redis_util.musicPrefix, str(album_id)):
         print("url:", url, "has request. pass")
         time.sleep(1)
         return
     self.headers["User-Agent"] = random.choice(agents)
     r = requests.get(url, headers=self.headers)
     # A body that is not JSON is reported like any other failed request
     # instead of aborting with an unhandled decode error.
     try:
         album_json = json.loads(r.text)
     except ValueError as e:
         print(url, " request error :", str(e))
         return
     if not isinstance(album_json, dict) or album_json.get('code') != 200:
         print(url, " request error :", album_json)
         return
     # Store the dedup marker in redis only after a successful response.
     redis_util.saveUrl(redis_util.musicPrefix, str(album_id))
     # 'album' or 'songs' may be missing or null; treat that as "no songs".
     songs = (album_json.get('album') or {}).get('songs') or []
     for item in songs:
         music_id = item['id']
         music_name = item['name']
         try:
             sql.insert_music(music_id, music_name, album_id)
         except Exception as e:
             # Log the failure and keep going with the next song.
             print(music_id, music_name, album_id, ' inset db error: ',
                   str(e))
             time.sleep(1)
コード例 #2
0
def get_record(user_id):
    """Scrape a user's listening-ranking page and store every listed song.

    Opens ``https://music.163.com/#/user/songs/rank?id=<user_id>`` in a
    headless Chrome, switches into the ``g_iframe`` frame, clicks the
    ``songsall`` element (all-time ranking), then parses the rendered page
    with BeautifulSoup. For each ``<li>`` of the ranking list it calls
    ``sql.insert_music(user_id, music_id, music_name, nickname)``.

    Args:
        user_id: User identifier; converted with ``str()`` when building
            the URL.

    Returns:
        The ``BeautifulSoup`` object built from the rendered page source.
        It is returned even when the ranking list could not be located.
    """
    url_recd = ('https://music.163.com/#/user/songs/rank?id='
                + str(user_id))
    option = webdriver.ChromeOptions()
    option.headless = True  # do not pop up a browser window
    driver = webdriver.Chrome(chrome_options=option)
    try:
        driver.get(url_recd)
        # The page content lives inside the 'g_iframe' frame.
        driver.switch_to.frame('g_iframe')
        driver.implicitly_wait(10)
        # On first load a "MV search is now supported" hint can cover the
        # 'songsall' button, so maximize and wait briefly before clicking.
        driver.maximize_window()
        time.sleep(0.5)
        # Switch from "last week" to "all time". Remove this click to keep
        # the default (last week) ranking.
        driver.find_element_by_id("songsall").click()
        driver.implicitly_wait(10)
        time.sleep(0.5)  # a short forced wait is still needed for loading
        htmlrec = driver.page_source
    finally:
        # Only the captured HTML is needed from here on. Always shut the
        # browser down, even when navigation or the click failed, so that
        # no browser/driver process is left behind.
        driver.quit()

    pagerec = BeautifulSoup(htmlrec, 'html.parser')
    try:  # keep one unexpected page layout from aborting the caller
        # Locating the list is inside the try as well: any of these
        # containers may be absent, making ``find`` return None.
        allrec = pagerec.find(class_="g-wrap p-prf").find(
            class_="m-record").find(class_="j-flag").find_all('li')
        for i in allrec:
            # Artist text, with the '-' that joins it to the title removed.
            nickname = i.find(class_="s-fc8").text.replace('-', '')
            music = i.find(class_="txt").find('a')
            music_name = music.getText()
            music_id = music['href'].replace('/song?id=', '')
            try:
                sql.insert_music(user_id, music_id, music_name, nickname)
            except Exception as e:
                # Log the failure and keep going with the next entry.
                print(' inset db error: ', str(e))
    except Exception as e:
        print('unknown error: ', str(e))
    return pagerec
コード例 #3
0
    def save_music_by_api(self, playlist_id):
        """Fetch a playlist from the music.163.com API and store its tracks.

        The request goes through ``proxy.proxy`` and is retried by
        ``retrying.retry`` according to ``settings.connect``. Each track of
        a ``code == 200`` response is passed to ``sql.insert_music`` while
        ``sql.conn_lock`` is held. A failed insert is logged at debug level
        and does not stop the remaining tracks from being processed.

        Args:
            playlist_id: Playlist identifier; converted with ``str()`` for
                the URL.

        Returns:
            None.
        """
        url = "http://music.163.com/api/playlist/detail?id=" + str(playlist_id)
        self.headers["User-Agent"] = random.choice(agents)

        @retrying.retry(
            stop_max_attempt_number=settings.connect["max_retries"],
            wait_fixed=settings.connect["interval"])
        def get():
            return requests.get(url,
                                headers=self.headers,
                                proxies=proxy.proxy,
                                timeout=settings.connect["timeout"])

        try:
            r = get()
        except Exception as e:
            logger.critical("代理连接失败" + str(e))
            return
        # A body that is not JSON is logged as a request error instead of
        # raising out of the crawler.
        try:
            playlist_json = json.loads(r.text)
        except ValueError as e:
            logger.error("{} request error :{}".format(url, e))
            return
        if (not isinstance(playlist_json, dict)
                or playlist_json.get('code') != 200):
            logger.error("{} request error :{}".format(url, playlist_json))
            return
        # 'result' or 'tracks' may be missing or null; treat as no tracks.
        tracks = (playlist_json.get('result') or {}).get('tracks') or []
        for item in tracks:
            music_id = item['id']
            music_name = item['name']
            # Acquire before entering the try so that ``finally`` never
            # releases a lock this call did not obtain.
            sql.conn_lock.acquire()
            try:
                sql.insert_music(music_id, music_name, playlist_id)
            except Exception as e:
                logger.debug(' insert db error: ' + str(e))
            finally:
                sql.conn_lock.release()
コード例 #4
0
    def save_music_by_api(self, album_id):
        """Fetch an album from the music.163.com API and store its songs.

        The request goes through ``proxy.proxy`` and is retried by
        ``retrying.retry`` according to ``settings.connect``. Each song of
        a ``code == 200`` response is passed to ``sql.insert_music`` while
        ``sql.conn_lock`` is held. A failed insert is logged at debug level
        and does not stop the remaining songs from being processed.

        Args:
            album_id: Album identifier; converted with ``str()`` for the
                URL.

        Returns:
            None.
        """
        url = "http://music.163.com/api/album/" + str(album_id)
        self.headers["User-Agent"] = random.choice(agents)

        @retrying.retry(stop_max_attempt_number=settings.connect["max_retries"],
                        wait_fixed=settings.connect["interval"])
        def get():
            return requests.get(url,
                                headers=self.headers,
                                proxies=proxy.proxy,
                                timeout=settings.connect["timeout"])

        try:
            r = get()
        except Exception as e:
            logger.critical("代理连接失败" + str(e))
            return
        # A body that is not JSON is logged as an API error instead of
        # raising out of the crawler.
        try:
            album_json = json.loads(r.text)
        except ValueError as e:
            logger.error("{} api请求错误: {}".format(url, e))
            return
        # The redis dedup cache that used to be written here was abandoned.
        if not isinstance(album_json, dict) or album_json.get('code') != 200:
            logger.error("{} api请求错误: {}".format(url, album_json))
            return
        # 'album' or 'songs' may be missing or null; treat as no songs.
        songs = (album_json.get('album') or {}).get('songs') or []
        for item in songs:
            music_id = item['id']
            music_name = item['name']
            # Acquire before entering the try so that ``finally`` never
            # releases a lock this call did not obtain.
            sql.conn_lock.acquire()
            try:
                sql.insert_music(music_id, music_name, album_id)
            except Exception as e:
                logger.debug(' insert db error: ' + str(e))
            finally:
                sql.conn_lock.release()
コード例 #5
0
    def save_music(self, album_id):
        """Scrape an album page on music.163.com and store its songs.

        Requests ``https://music.163.com/album?id=<album_id>``, reads the
        ``<ul class="f-hide">`` list and passes every song link to
        ``sql.insert_music(music_id, music_name, album_id)``. A failed
        insert is printed and does not stop the remaining songs from being
        processed.

        Args:
            album_id: Album identifier, sent as the ``id`` query parameter.

        Returns:
            None.
        """
        params = {'id': album_id}
        self.headers["User-Agent"] = random.choice(agents)
        r = requests.get('https://music.163.com/album',
                         headers=self.headers,
                         params=params)

        soup = BeautifulSoup(r.content.decode(), 'html.parser')
        body = soup.body
        # ``find`` returns None when the page has no song list (or no body
        # at all); report it and stop instead of failing on ``.find_all``.
        song_list = None
        if body is not None:
            song_list = body.find('ul', attrs={'class': 'f-hide'})
        if song_list is None:
            print('album', album_id, 'has no song list. pass')
            return
        for entry in song_list.find_all('li'):
            link = entry.find('a')
            if link is None:
                # A list item without a link carries no song id.
                continue
            music_id = link['href'].replace('/song?id=', '')
            music_name = link.getText()
            try:
                sql.insert_music(music_id, music_name, album_id)
            except Exception as e:
                # ``link`` is a bs4 Tag, which cannot be added to a str;
                # convert it explicitly so the handler itself cannot raise.
                print(str(link) + ' inset db error: ' + str(e))
                time.sleep(5)
コード例 #6
0
    def save_music(self, album_id):
        """Scrape an album page on music.163.com and store its songs once.

        The album URL is first checked with ``redis_util.checkIfRequest``;
        when that reports a hit, the request is skipped. Otherwise the page
        is fetched, its ``<ul class="f-hide">`` list is read, the URL is
        recorded with ``redis_util.saveUrl`` and every song link is passed
        to ``sql.insert_music(music_id, music_name, album_id)``. A failed
        insert is printed and does not stop the remaining songs from being
        processed.

        Args:
            album_id: Album identifier, sent as the ``id`` query parameter
                and used in the redis key URL.

        Returns:
            None.
        """
        params = {'id': album_id}
        self.headers["User-Agent"] = random.choice(agents)
        url = 'https://music.163.com/album?id=' + str(album_id)
        # Ask redis whether this album page has already been crawled.
        if redis_util.checkIfRequest(redis_util.musicPrefix, url):
            print("url:", url, "has request. pass")
            time.sleep(1)
            return
        r = requests.get('https://music.163.com/album',
                         headers=self.headers,
                         params=params)

        soup = BeautifulSoup(r.content.decode(), 'html.parser')
        body = soup.body
        # ``find`` returns None when the page has no song list (or no body
        # at all). Do not mark such a page as crawled, so that it can be
        # retried on a later run.
        song_list = None
        if body is not None:
            song_list = body.find('ul', attrs={'class': 'f-hide'})
        if song_list is None:
            print("url:", url, "has no song list. pass")
            return
        # Store the dedup marker in redis now that the page was usable.
        redis_util.saveUrl(redis_util.musicPrefix, url)
        for entry in song_list.find_all('li'):
            link = entry.find('a')
            if link is None:
                # A list item without a link carries no song id.
                continue
            music_id = link['href'].replace('/song?id=', '')
            music_name = link.getText()
            try:
                sql.insert_music(music_id, music_name, album_id)
            except Exception as e:
                # Log the failure and keep going with the next song.
                print(link, ' inset db error: ', str(e))
                time.sleep(1)