Beispiel #1
0
class wangyiyun():
    """Crawl the singer pages listed in ``table_singer`` and store their songs.

    Each page is rendered in a headless Firefox session, the markup of the
    ``contentFrame`` iframe is parsed with lxml, and every song found is
    written through ``MySQLCommand.insert_musicData``.
    """

    def __init__(self):
        """Start the headless browser and open the database connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        # Not read or written by any method visible in this class.
        self.music = {}

    @staticmethod
    def _clean_text(text):
        """Return *text* with non-breaking spaces replaced by plain spaces.

        The original pattern only matched the literal four characters
        ``\\xa0``; that form is still replaced, and the real U+00A0
        character (what lxml actually yields) is now handled as well.
        """
        return text.replace('\xa0', ' ').replace('\\xa0', ' ')

    def run(self):
        """Visit the selected singer pages and insert every song found.

        Rows come from ``select url, singer_name from table_singer`` and are
        read with ``.get`` (so the cursor must return mapping-like rows).
        Only rows at even positions that have a URL are processed. Song
        title, link and album are paired positionally; if the scraped lists
        differ in length the surplus entries are ignored instead of raising
        ``IndexError``. A failed insert is reported and the crawl continues.
        """
        self.mysqlCommand.cursor.execute(
            "select url ,singer_name from table_singer")
        rows = self.mysqlCommand.cursor.fetchall()

        for position, row in enumerate(rows):
            page_url = row.get('url')
            if page_url is None or position % 2 != 0:
                continue

            self.driver.get(page_url)
            time.sleep(4)  # give the page time to render
            self.driver.switch_to.frame(
                self.driver.find_element_by_name('contentFrame'))
            time.sleep(1)
            html = etree.HTML(self.driver.page_source)
            time.sleep(1)

            song_names = html.xpath(
                "//div[@class='j-flag']//div[@class='ttc']/span[@class='txt']/a/b/@title"
            )
            song_links = html.xpath(
                "//div[@class='j-flag']//div[@class='ttc']/span[@class='txt']/a/@href"
            )
            album_names = html.xpath("//div[@class='text']/a/@title")
            singer = row.get('singer_name')

            for title, href, album in zip(song_names, song_links,
                                          album_names):
                song_n = self._clean_text(title)
                song_u = 'https://music.163.com' + href
                albums = self._clean_text(album)
                print(song_n, '+', song_u, '+', albums, '+', singer)
                try:
                    self.mysqlCommand.insert_musicData(
                        song_n, song_u, albums, singer)
                except Exception as e:
                    # Keep crawling, but do not hide why the row was lost.
                    print(e)
                print('==' * 20)
Beispiel #2
0
class wangyiyun():
    """Crawl the playlists listed in ``song_list`` and store songs and singers.

    Each playlist page is rendered in a headless Firefox session; the markup
    of the ``contentFrame`` iframe is scanned with regular expressions and
    lxml, and the results are written through ``MySQLCommand``.
    """

    def __init__(self):
        """Start the headless browser and open the database connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        # Not read or written by any method visible in this class.
        self.music = {}

    @staticmethod
    def _clean_title(title):
        """Return *title* with ``&nbsp;`` entities replaced by spaces."""
        return title.replace('&nbsp;', ' ')

    def run(self):
        """Visit the selected playlist pages and insert their songs/singers.

        Rows come from ``select list_url from song_list`` and are read with
        ``.get``. Only every third row (position 0, 3, 6, ...) that has a URL
        is processed. The five scraped lists are paired positionally; if
        their lengths differ the surplus entries are ignored instead of
        raising ``IndexError``. Insert errors are printed and skipped.
        """
        self.mysqlCommand.cursor.execute("select list_url from song_list")
        rows = self.mysqlCommand.cursor.fetchall()

        for position, row in enumerate(rows):
            list_url = row.get('list_url')
            if list_url is None or position % 3 != 0:
                continue

            self.driver.get(list_url)
            time.sleep(4)  # give the page time to render
            self.driver.switch_to.frame(
                self.driver.find_element_by_name('contentFrame'))
            time.sleep(1)
            source = self.driver.page_source
            html = etree.HTML(source)
            time.sleep(1)

            song_names = re.findall(r'"><b title="(.*?)">', source,
                                    re.DOTALL)
            song_links = re.findall(
                r'<div class="ttc"><span class="txt"><a href="(.*?)"><b',
                source, re.DOTALL)
            albums = html.xpath("//div[@class='text']/a/@title")
            singers = html.xpath("//div[@class='text']/@title")
            singer_links = html.xpath("//div[@class='text']/span/a/@href")

            for title, href, album, singer, singer_href in zip(
                    song_names, song_links, albums, singers, singer_links):
                song_n = self._clean_title(title)
                song_u = 'https://music.163.com' + href
                singerurl = 'https://music.163.com' + singer_href
                print(singer, singerurl)
                try:
                    self.mysqlCommand.insert_musicData(
                        song_n, song_u, album, singer)
                    self.mysqlCommand.insert_singer(singer, singerurl)
                    print('==' * 20)
                except Exception as e:
                    print(e)
Beispiel #3
0
class wangyiyun():
    """Crawl a fixed set of NetEase Cloud Music top lists into MySQL.

    For every chart URL the chart itself is recorded with
    ``MySQLCommand.insert_list`` and each listed song with
    ``MySQLCommand.insert_musicData``.
    """

    def __init__(self):
        """Start the headless browser, set the chart URLs, open the DB."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        self.url = [
            'https://music.163.com/#/discover/toplist?id=19723756',
            'https://music.163.com/#/discover/toplist?id=3779629',
            'https://music.163.com/#/discover/toplist?id=2884035',
            'https://music.163.com/#/discover/toplist?id=3778678',
            'https://music.163.com/#/discover/toplist?id=991319590',
            'https://music.163.com/#/discover/toplist?id=71384707',
            'https://music.163.com/#/discover/toplist?id=1978921795',
            'https://music.163.com/#/discover/toplist?id=2250011882',
            'https://music.163.com/#/discover/toplist?id=2617766278',
            'https://music.163.com/#/discover/toplist?id=71385702',
            'https://music.163.com/#/discover/toplist?id=745956260',
            'https://music.163.com/#/discover/toplist?id=10520166',
            'https://music.163.com/#/discover/toplist?id=2023401535',
            'https://music.163.com/#/discover/toplist?id=2006508653',
            'https://music.163.com/#/discover/toplist?id=180106',
            'https://music.163.com/#/discover/toplist?id=60198',
            'https://music.163.com/#/discover/toplist?id=3812895',
            'https://music.163.com/#/discover/toplist?id=27135204',
            'https://music.163.com/#/discover/toplist?id=21845217',
            'https://music.163.com/#/discover/toplist?id=11641012',
            'https://music.163.com/#/discover/toplist?id=60131',
            'https://music.163.com/#/discover/toplist?id=120001',
            'https://music.163.com/#/discover/toplist?id=112463',
            'https://music.163.com/#/discover/toplist?id=10169002',
            'https://music.163.com/#/discover/toplist?id=2809513713',
            'https://music.163.com/#/discover/toplist?id=2809577409',
        ]
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()

    @staticmethod
    def _strip_quote_and_paren(text):
        """Return *text* without any ``'`` or ``)`` characters.

        Used for the second insert attempt after the first one failed.
        NOTE(review): presumably these characters break the SQL built by
        ``insert_musicData`` -- confirm against MySQLCommand.
        """
        return ''.join(ch for ch in text if ch not in "')")

    def run(self):
        """Crawl every chart in ``self.url`` and close the DB when done.

        The chart name and play count are joined into plain strings before
        being passed to ``insert_list`` (the XPath results are lists). Song
        link, title and singer are paired positionally; surplus entries of
        the longer lists are ignored. A song whose insert fails is retried
        once with quotes/closing parentheses removed from its title. The
        database connection is closed even if the crawl aborts with an error.
        """
        try:
            for s_url in self.url:
                self._crawl_chart(s_url)
        finally:
            self.mysqlCommand.closeMysql()

    def _crawl_chart(self, s_url):
        """Load one chart page and store the chart plus all of its songs."""
        self.driver.get(s_url)
        time.sleep(4)  # give the page time to render
        self.driver.switch_to.frame(
            self.driver.find_element_by_name("contentFrame"))
        source = self.driver.page_source
        html = etree.HTML(source)
        list_name = ''.join(html.xpath("//div[@class='hd f-cb']/h2/text()"))
        play_num = ''.join(html.xpath(
            "//div[@class='more s-fc3']/strong[@class='s-fc6']/text()"))
        creator = '网易云'
        creator_url = '无'
        try:
            self.mysqlCommand.insert_list(s_url, list_name, creator,
                                          creator_url, play_num)
        except Exception as e:
            print('列表错误' + list_name, play_num, creator_url, creator)
            print(e)

        links = re.findall(r'<span class="txt"><a href="(.*?)"><b', source,
                           re.DOTALL)
        titles = re.findall(r'><b title="(.*?)">', source, re.DOTALL)
        singers = re.findall(r'div class="text" title="(.*?)"><span',
                             source, re.DOTALL)
        album = '网易云排行榜'
        for href, title, singeri in zip(links, titles, singers):
            urli = 'https://music.163.com' + href
            song_namei = title.replace('&nbsp;', ' ')
            try:
                self.mysqlCommand.insert_musicData(song_namei, urli, album,
                                                   singeri)
            except Exception:
                # Second attempt with the problematic characters removed.
                song_nameii = self._strip_quote_and_paren(song_namei)
                try:
                    self.mysqlCommand.insert_musicData(
                        song_nameii, urli, album, singeri)
                except Exception as e:
                    print('歌曲错误 ' + song_nameii, '歌手 ' + singeri,
                          '原名 ' + song_namei)
                    print(e)
Beispiel #4
0
class wangyiyun():
    """Crawl commenter profile pages listed in ``message`` into MySQL.

    Each profile page is rendered in a headless Firefox session, the markup
    of the ``contentFrame`` iframe is parsed, and the extracted profile is
    written through ``MySQLCommand.insert_userData``.
    """

    def __init__(self):
        """Start the headless browser and open the database connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        # Not used by any method visible in this class.
        self.message = {}
        # Holds the fields of the most recently parsed profile.
        self.user = {}

    def run(self):
        """Visit the selected profile URLs and parse each page.

        Rows come from ``select url, song_name from message`` and are read
        with ``.get``. Only rows whose position satisfies
        ``position % 4 == 1`` and that have a URL are processed.
        """
        self.mysqlCommand.cursor.execute("select url,song_name from message")
        rows = self.mysqlCommand.cursor.fetchall()
        for position, row in enumerate(rows):
            preson_url = row.get('url')
            if preson_url is None or position % 4 != 1:
                continue
            self.driver.get(preson_url)
            time.sleep(4)  # give the page time to render
            song_name = row.get('song_name')
            self.driver.switch_to.frame(
                self.driver.find_element_by_name("contentFrame"))
            source = self.driver.page_source
            self.parse_preson_page(source, song_name, preson_url)

    @staticmethod
    def _clean_introduction(parts):
        """Build the introduction text from the scraped text nodes.

        Returns ``'无信息'`` when *parts* is empty; otherwise the nodes are
        concatenated, the ``个人介绍:`` label is removed and newlines become
        spaces.
        """
        text = "".join(parts) if parts else '无信息'
        return text.replace('个人介绍:', '').replace('\n', ' ')

    # Parse a commenter's profile page.
    def parse_preson_page(self, source, song_name, preson_url):
        """Extract one user profile from *source* and insert it.

        Args:
            source: HTML of the profile frame.
            song_name: Accepted for the caller's convenience; not used here.
            preson_url: URL of the profile page, stored with the record.

        Raises:
            SystemExit: when fewer than the three expected profile links
                (dynamic / focus / fans) are present. The original code
                stopped the crawl on an empty list via the site-provided
                ``exit()``; the same stop now also covers partial lists,
                which previously failed with ``IndexError``.
        """
        html = etree.HTML(source)
        person_name = "".join(
            html.xpath("//span[@class='tit f-ff2 s-fc0 f-thide']/text()"))
        ids = [
            "https://music.163.com" + href
            for href in html.xpath("//ul[@class='data s-fc3 f-cb']/li/a/@href")
        ]
        if len(ids) < 3:
            print(ids, preson_url)
            raise SystemExit

        introduce = self._clean_introduction(
            html.xpath("//div[@class='inf s-fc3 f-brk']/text()"))

        district = "".join(
            re.findall(r'<div class="inf s-fc3".*?所在地区:(.*?)</span>',
                       source, re.DOTALL)) or '无信息'

        age = "".join(html.xpath("//span[@class='sep']/span/text()")) or '无信息'

        self.user['name'] = person_name
        self.user['introduction'] = introduce
        self.user['region'] = district
        self.user['age'] = age
        self.user['dynamic'] = ids[0]
        self.user['focus'] = ids[1]
        self.user['fans'] = ids[2]
        self.user['url'] = preson_url

        try:
            # Insert the record.
            self.mysqlCommand.insert_userData(self.user)
        except Exception as e:
            # Report why the insert failed and carry on.
            print("插入用户数据失败", str(e))
Beispiel #5
0
class wangyiyun():
    """Crawl the comment pages of songs listed in ``table_music`` into MySQL.

    Each song page is rendered in a headless Firefox session; the comment
    list inside the ``contentFrame`` iframe is parsed page by page and each
    comment is written through ``MySQLCommand.insert_messageData``.
    """

    def __init__(self):
        """Start the headless browser and open the database connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        # Holds the fields of the comment currently being inserted.
        self.message = {}
        # Not used by any method visible in this class.
        self.user = {}

    def run(self):
        """Visit the selected song URLs and crawl all of their comment pages.

        Rows come from ``select url from table_music`` and are read with
        ``.get``. Only rows whose position satisfies ``position % 4 == 1``
        and that have a URL are processed.
        """
        self.mysqlCommand.cursor.execute("select url from table_music")
        rows = self.mysqlCommand.cursor.fetchall()
        for position, row in enumerate(rows):
            song_url = row.get('url')
            if song_url is None or position % 4 != 1:
                continue
            self.driver.get(song_url)
            time.sleep(4)  # give the page time to render
            self.driver.switch_to.frame(
                self.driver.find_element(By.NAME, 'contentFrame'))
            # Scroll to the bottom so the pager is rendered.
            self._scroll_to_bottom()
            time.sleep(1)
            self._crawl_comment_pages()

    def _scroll_to_bottom(self):
        """Scroll the current document down so lazy content is rendered."""
        self.driver.execute_script(
            "var q=document.documentElement.scrollTop=10000")

    def _crawl_comment_pages(self):
        """Parse the current comment page and follow "next" until the end.

        The pager ``div`` carries a generated ``auto-<id> u-page`` class; the
        id is read once from the page source. The loop ends when the last
        pager link is disabled, when no pager exists, or when paging fails.
        It previously retried forever on failure, re-inserting the same page
        each time and swallowing ``KeyboardInterrupt``.
        """
        markers = re.findall(r'<div class="auto-(.*?) u-page">',
                             self.driver.page_source, re.DOTALL)
        j_flag = markers[0] if markers else ''
        pager_class = "auto-" + j_flag + " u-page"
        pager_xpath = "//div[@class='" + pager_class + "']/a[last()]"
        page_no = 1
        while True:
            self.parse_detail_page(self.driver.page_source)
            if not j_flag:
                # No pager on the page: there is nothing to click through.
                print('本首歌爬取完成!')
                break
            try:
                time.sleep(5)
                self._scroll_to_bottom()
                # ``until`` returns the located element itself.
                next_btn = WebDriverWait(
                    driver=self.driver, timeout=10).until(
                        EC.presence_of_element_located(
                            (By.XPATH, pager_xpath)))
                print('爬取第%d页成功!' % page_no)
                if "js-disabled" in next_btn.get_attribute("class"):
                    print('本首歌爬取完成!')
                    break
                next_btn.click()
                page_no += 1
                time.sleep(2)
            except Exception:
                if pager_class in self.driver.page_source:
                    print('有btn')
                print('爬取第%d页失败!' % page_no)
                print("==" * 20)
                print(j_flag)
                break

    @staticmethod
    def _build_comment(parts):
        """Turn the captured HTML fragments of one comment into stored text.

        *parts* is the pair captured for a comment: the comment body and the
        trailing fragment (a quoted reply and/or the ``|回复`` action text).
        ``<br />`` becomes a space, remaining tags are dropped, a non-empty
        reply is introduced by ``'\\n 评论回复'``, one trailing ``|回复`` is
        removed as a suffix (``rstrip`` would also eat legitimate trailing
        ``回``/``复``/``|`` characters), and the result is wrapped in ``""``.
        """
        pieces = []
        for position, fragment in enumerate(parts):
            text = re.sub('<br />', ' ', fragment)
            text = re.sub('<(.*?)>', '', text)
            if position == 1 and text != '|回复':
                text = '\n 评论回复' + text
            pieces.append(text)
        comment = ''.join(pieces)
        if comment.endswith('|回复'):
            comment = comment[:-len('|回复')]
        return '""' + comment + '""'

    # Scrape the comments of one song page.
    def parse_detail_page(self, source):
        """Extract all comments from *source* and insert them.

        Commenter name, comment text, time, like count and profile link are
        paired positionally; if the scraped lists differ in length the
        surplus entries are ignored instead of raising ``IndexError``. A
        failed insert is reported and the remaining comments are still
        processed. Finally the song's total comment count is stored with
        ``insert_musicnum``.
        """
        html = etree.HTML(source)
        preson_id = html.xpath(
            "//div[@class='cnt f-brk']/a[@class='s-fc7']/@href")
        song_name = "".join(
            html.xpath("//div[@class='tit']/em[@class='f-ff2']/text()"))

        # Like counts: an empty counter is stored as '0'.
        points_tags = re.findall(
            r'<i class="zan u-icn2 u-icn2-12">(.*?)</a>', source, re.DOTALL)
        points = [
            re.sub('</i>', '0', re.sub('</i> ', '', tag))
            for tag in points_tags
        ]

        names = html.xpath("//div[@class='cnt f-brk']/a[1]/text()")
        comment_tags = re.findall(
            r'<div class="cnt f-brk">.*?</a>(.*?)</div>.*?</a>(.*?)</div>',
            source, re.DOTALL)
        comments = [self._build_comment(parts) for parts in comment_tags]

        times = [
            stamp.replace(' ', '')
            for stamp in html.xpath("//div[@class='time s-fc4']/text()")
        ]

        for name, comment, stamp, point, href in zip(names, comments, times,
                                                     points, preson_id):
            self.message['song_name'] = song_name
            self.message['name'] = name
            self.message['comments'] = comment
            self.message['time'] = stamp
            self.message['point'] = point
            self.message['url'] = "https://music.163.com" + href
            try:
                self.mysqlCommand.insert_messageData(self.message)
            except Exception as e:
                print("插入评论数据失败", str(e))

        comment_sum = ''.join(
            re.findall(r'<span class="j-flag">(.*?)</span>', source,
                       re.DOTALL))
        try:
            self.mysqlCommand.insert_musicnum(song_name, comment_sum)
        except Exception:
            print(song_name, comment_sum, '评论数插入失败!')
Beispiel #6
0
class wangyiyun():
    """Crawl the paginated "hot playlists" listing into MySQL.

    The listing is rendered in a headless Firefox session; every playlist
    tile on every page is written through ``MySQLCommand.insert_list``.
    """

    def __init__(self):
        """Start the headless browser, set the start URL, open the DB."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        self.url = [
            'https://music.163.com/#/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8',
        ]
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()

    def run(self):
        """Open every start URL and crawl all of its result pages."""
        for s_url in self.url:
            self.driver.get(s_url)
            time.sleep(5)  # give the page time to render
            self.driver.switch_to.frame(
                self.driver.find_element_by_name("contentFrame"))
            self.parse_list_page()

    @staticmethod
    def _absolute_url(path):
        """Return *path* prefixed with the site origin."""
        return "https://music.163.com" + path

    def _store_playlists(self, list_url, list_name, creator, creator_url,
                         play_num):
        """Insert one row per playlist tile scraped from the current page.

        The five lists are paired positionally; surplus entries of longer
        lists are ignored. Each row is inserted independently so one failing
        row no longer discards the rest of the page.
        """
        for href, name, author, author_href, plays in zip(
                list_url, list_name, creator, creator_url, play_num):
            playlist_url = self._absolute_url(href)
            print(author_href)
            creator_s = self._absolute_url(author_href)
            print(playlist_url, name, author, creator_s, plays)
            try:
                # Insert the record.
                self.mysqlCommand.insert_list(playlist_url, name, author,
                                              creator_s, plays)
            except Exception as e:
                # Report why the insert failed and carry on.
                print("插入歌单数据失败", str(e))

    # Collect the playlists of every result page.
    def parse_list_page(self):
        """Scrape the current page, store it, and advance until finished.

        When the "next" button is disabled for the first time the
        ``u-btn-hot`` link is clicked and crawling continues; the second time
        the database connection is closed and the loop ends. Any unexpected
        error also closes the connection and ends the loop; the original
        kept looping forever against the closed connection. Rows of the last
        page are inserted once, not twice as before.
        """
        k = 1  # 1-based page counter, used for progress messages
        h = 0  # how many times the disabled "next" button has been seen
        while True:
            try:
                source = self.driver.page_source
                time.sleep(2)
                html = etree.HTML(source)
                list_url = html.xpath(
                    "//li/div[@class='u-cover u-cover-1']/a/@href")
                creator_url = html.xpath("//li//p[last()]/a/@href")
                play_num = re.findall(r'<span class="nb">(.*?)</span>', source,
                                      re.DOTALL)
                creator = html.xpath("//li/p[last()]/a/text()")
                list_name = html.xpath("//li/p[@class='dec']/a/text()")

                # Scroll down so the pager is rendered.
                self.driver.execute_script(
                    "var q=document.documentElement.scrollTop=10000")

                self._store_playlists(list_url, list_name, creator,
                                      creator_url, play_num)

                next_btn = self.driver.find_element_by_xpath(
                    "//div[@class='u-page']/a[last()]")
                print('爬取第%d页成功!' % k)
                if "zbtn znxt js-disabled" in next_btn.get_attribute("class"):
                    h += 1
                    print('本首歌爬取完成!')
                    if h == 2:
                        self.mysqlCommand.closeMysql()
                        break
                    next_list = self.driver.find_element_by_xpath(
                        "//div[@class='u-btn f-fr u-btn-hot d-flag']/a[last()]"
                    )
                    next_list.click()
                else:
                    next_btn.click()
                    time.sleep(2)
                k += 1
                time.sleep(2)
            except Exception:
                print('爬取第%d页失败!' % k)
                self.mysqlCommand.closeMysql()
                print("==" * 20)
                break