# Characters removed from an artist profile line before it is split on '|'.
_ARTIST_INFO_STRIP_TABLE = str.maketrans('', '', ' \r\n\t\xa0')


def crawling_Artist(um):
    """Crawl one artist page and upsert the parsed artist into the database.

    Args:
        um: URL helper exposing ``END_POINT`` (used as the artist id),
            ``NODE`` and ``URL`` (the page that is fetched).

    Returns:
        None. When the page has no ``div.artist_info`` block, or no artist
        name can be located in it, nothing is written to the database.
    """
    artistVO = Artist_VO()
    artistVO.Artist_ID = um.END_POINT
    artistVO.Artist_Node = '/'.join([um.NODE, str(um.END_POINT)])
    artistVO.Group = False

    html = crawler.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')
    tag_artist_info = bs.find('div', attrs={'class': 'artist_info'})
    if tag_artist_info is None:
        return

    singer = tag_artist_info.find('a', attrs={'class': 'song_name'})
    if singer is not None:
        artistVO.Artist_Name = singer.get_text()
    else:
        # Fallback markup: the name sits in <li class="top_left"><p>...</p>.
        # Each lookup is checked so a missing element no longer raises
        # AttributeError on None.
        top_left = tag_artist_info.find('li', attrs={'class': 'top_left'})
        name_tag = top_left.find('p') if top_left is not None else None
        if name_tag is None:
            print("artist name not found:", artistVO.Artist_Node)
            return
        artistVO.Artist_Name = name_tag.get_text().strip()
        print("############# strip 결과 #############\n",
              artistVO.Artist_Name,
              "\n############# strip 결과 #############\n")

    # Each <span class="right"> holds '|'-separated profile attributes.
    for tag in tag_artist_info.find_all('span', attrs={'class': 'right'}):
        compact = tag.get_text().strip().translate(_ARTIST_INFO_STRIP_TABLE)
        for text in compact.split('|'):
            if text in ('남성', '여성', '혼성'):
                artistVO.Gender = text
            elif text == '그룹':
                artistVO.Group = True

    db_session.merge(artistVO)
    db_session.commit()

    print(artistVO)
# Example #2
# 0
def collecting_music(start: int = 10740,
                     stop: int = 10000000,
                     retry_delay: float = 60) -> None:
    """Crawl every album page whose id lies in ``range(start, stop)``.

    A single ``UrlMaker`` is reused and re-pointed at each album id. When
    ``crawling_album`` raises, the error is printed, the crawler waits
    ``retry_delay`` seconds and the same id is attempted once more; an
    exception raised by that second attempt propagates to the caller.

    Args:
        start: First album id to crawl (inclusive).
        stop: Album id at which to stop (exclusive).
        retry_delay: Seconds to wait before the single retry.
    """
    um = UrlMaker()
    for album_id in range(start, stop):
        um.set_param(node=URL_Node.ALBUM, end_point=album_id)
        try:
            crawling_album(um)
        except Exception as e:
            # Show the error before waiting so it is visible immediately.
            print(e)
            sleep(retry_delay)
            # Rebinding the loop variable would not repeat this id, so the
            # retry is issued inline instead.
            crawling_album(um)

        # Periodic commit, also reached after a retried id.
        if album_id % 5 == 0:
            db_session.commit()
def _parse_release_date(text):
    """Convert a dotted date string such as ``'2010.05.17'`` to a datetime.

    Components that are missing, non-numeric or out of range fall back to 1,
    so ``'2010'`` becomes 2010-01-01 and ``'2010.02.31'`` becomes 2010-02-01.
    """
    fields = []
    for part in text.split('.')[:3]:
        try:
            fields.append(int(part))
        except ValueError:
            fields.append(0)  # 0 is out of range for every component
    fields += [0] * (3 - len(fields))

    year = fields[0] if 0 < fields[0] <= datetime.max.year else 1
    month = fields[1] if 0 < fields[1] < 13 else 1
    try:
        # Let datetime validate the day against the actual month length.
        return datetime(year, month, fields[2])
    except ValueError:
        return datetime(year, month, 1)


def crawling_album(um=None):
    """Crawl one album page and upsert the parsed album into the database.

    Args:
        um: URL helper exposing ``END_POINT`` (used as the album id),
            ``NODE`` and ``URL``. A fresh ``UrlMaker()`` is created when
            omitted, instead of sharing one instance built at definition time.

    Returns:
        None. Nothing is written when the page has no ``div.album_info``.

    If the first merge/commit raises ``InternalError``, the description is
    reduced to a whitelist of characters and the write is retried; if that
    fails too, the album is stored with ``Description`` set to ``None``.
    """
    if um is None:
        um = UrlMaker()

    albumVO = Album_VO()
    albumVO.Album_ID = um.END_POINT
    albumVO.Album_Node = '/'.join([um.NODE, str(um.END_POINT)])

    html = cw.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')
    tag_Album_info = bs.find('div', attrs={'class': 'album_info'})
    if tag_Album_info is None:
        return

    # Resolve the title through locals so a Tag object is never left on the
    # model when the inner <p> is missing.
    title_li = tag_Album_info.find('li', attrs={'class': 'top_left'})
    title_p = title_li.find('p') if title_li is not None else None
    albumVO.Album_Title = (title_p.get_text().strip()
                           if title_p is not None else None)

    album_info = tag_Album_info.find('div', {'class': 'a_info_cont'})
    if album_info is not None:
        dl = album_info.find('dl')
        dd = dl.find('dd') if dl is not None else None
        rows = dd.find_all('p') if dd is not None else []

        for row in rows:
            # attrs must be a dict; a set literal would also match elements
            # whose class is literally "class".
            left_span = row.find('span', attrs={'class': 'left'})
            right_span = row.find('span', attrs={'class': 'right'})
            if left_span is None or right_span is None:
                continue
            label = left_span.get_text().strip()

            if label == '아티스트':
                artist_link = right_span.find('a')
                if artist_link is not None:
                    # The artist id is the last path segment of the link.
                    albumVO.Singer_ID = int(
                        artist_link['href'].strip().rsplit('/', 1)[1])
                else:
                    # No link: fall back to the 'Various Artists' row, if any.
                    various = Artist_VO.query.filter_by(
                        Artist_Name='Various Artists').first()
                    if various is not None:
                        albumVO.Singer_ID = various.Artist_ID
            if label == '발매일':
                albumVO.Release_Date = _parse_release_date(
                    right_span.get_text())
            if label == '기획사' or label == '레이블':
                albumVO.Agency = right_span.get_text().strip()
            if label == '유통사':
                albumVO.Distributor = right_span.get_text().strip()

        slider = album_info.find('div', attrs={'class': 'text_slider'})
        if slider is not None:
            paragraphs = slider.find_all('p')
            if paragraphs:
                # The last <p> is taken as the description text.
                desc = str(paragraphs[-1].get_text())
                # Only keep descriptions that fit the column's byte length.
                if (len(desc.encode('utf-8'))
                        <= Album_VO.Description.type.length):
                    albumVO.Description = preprocessing_string(desc)
            print("앨범 설명 : ", albumVO.Description)

    try:
        db_session.merge(albumVO)
        db_session.commit()
    except InternalError:
        db_session.rollback()
        try:
            import re
            # Whitelist: keyboard punctuation, ASCII letters and digits,
            # Hangul jamo and syllables; every other run becomes a space.
            pattern = re.compile(
                r'[^ ~`!@#$%^&*()_\-+={\[}\]:<.>/?\'\"\n\ta-zA-Z0-9\u3131-\u3163\uac00-\ud7a3]+'
            )

            albumVO.Description = re.sub(pattern, ' ', albumVO.Description)
            print("앨범 설명 : ", albumVO.Description)
            db_session.merge(albumVO)
            db_session.commit()
            cw_log({albumVO.Album_ID: 'SUCCESS[RE.Compile] - Desc'})
        except Exception:
            # Last resort: store the album without any description.
            print(" 완전 rollback", file=sys.stderr)
            cw_log({albumVO.Album_ID: 'FAILURE - Desc'})
            db_session.rollback()
            albumVO.Description = None
            db_session.merge(albumVO)
            db_session.commit()
    finally:
        print(albumVO)
# Example #4
# 0
                                                  }).get_text().strip()
        left_attrs = summary.find('li', attrs={
            'class': 'left_con'
        }).findAll('p', attrs='left')
        right_attrs = summary.find('li', attrs={
            'class': 'left_con'
        }).findAll('p', attrs='right')
        for i in range(0, len(left_attrs)):
            if left_attrs[i].get_text().strip() == '발매일':
                albumVO.Release_Date = right_attrs[i].get_text()
            if left_attrs[i].get_text().strip() == '음악장르':
                musicVO.Genre = right_attrs[i].get_text().strip()
            if left_attrs[i].get_text().strip() == '작사':
                for lyricist in right_attrs[i].findAll('a', attrs='href'):
                    Music_VO.Lyricist.append(
                        Artist_VO.query.filter_by(
                            Artist_ID=int(lyricist.strip().rsplit('/', 1))))
            if left_attrs[i].get_text().strip() == '작곡':
                for comporser in right_attrs[i].findAll('a', attrs='href'):
                    Music_VO.Composer.append(
                        Artist_VO.query.filter_by(
                            Artist_ID=int(comporser.strip().rsplit('/', 1))))

        db_session.merge(albumVO)
        db_session.commit()

        db_session.merge(musicVO)

db_session.commit()

print(musicVO, albumVO)
def crawling_track(um):
    """Crawl one track page and upsert the parsed track into the database.

    Args:
        um: URL helper exposing ``END_POINT`` (used as the music id),
            ``NODE`` and ``URL`` (the page that is fetched).

    Returns:
        None. Nothing is written when the page has no
        ``div.music_info_view`` block.

    If the first merge/commit raises ``InternalError``, the lyrics are
    reduced to a whitelist of characters and the write is retried; if that
    fails too, the track is stored with ``Lyrics`` set to ``None``.
    """
    musicVO = Music_VO()
    musicVO.Music_ID = um.END_POINT
    musicVO.Music_Node = '/'.join([um.NODE, str(um.END_POINT)])

    html = cw.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')

    tag_music_info = bs.find('div', attrs={'class': 'music_info_view'})
    if tag_music_info is None:
        return

    # Track summary table.
    summary = tag_music_info.find('div', attrs={'class': 'music_info_cont'})
    table = summary.find('table') if summary is not None else None
    album_tag = table.find('a') if table is not None else None

    if album_tag is not None:
        musicVO.Album_Node = album_tag['href'].strip(" ")
        # The album id is the last path segment of the link.
        musicVO.Album_ID = int(musicVO.Album_Node.rsplit('/', 1)[1])

    title_li = tag_music_info.find('li', attrs={'class': 'top_left'})
    title_p = title_li.find('p') if title_li is not None else None
    musicVO.Music_Title = (title_p.get_text().strip()
                           if title_p is not None else None)

    # Label/value pairs live in <li class="left_con">; when the summary does
    # not contain it, look under <dd class="con"> instead.
    info_list = (summary.find('li', attrs={'class': 'left_con'})
                 if summary is not None else None)
    if info_list is None:
        fallback = bs.find('dd', attrs={'class': 'con'})
        if fallback is not None:
            info_list = fallback.find('li', attrs={'class': 'left_con'})

    if info_list is not None:
        labels = info_list.find_all('p', attrs={'class': 'left'})
        values = info_list.find_all('p', attrs={'class': 'right'})
        # zip stops at the shorter list, so unequal lengths cannot raise.
        for label, value in zip(labels, values):
            if label.get_text().strip() == '음악장르':
                musicVO.Genre = value.get_text().strip()

    line_info = bs.find_all('div', attrs={'class': 'line_info'})

    lyric = (line_info[0].find('li', attrs={'id': 'lyricsText'})
             if line_info else None)
    if lyric is not None:
        buffer = lyric.get_text().replace('\n', '').replace('\t', '').replace(
            '<br/>', '\n').strip()

        # Added because of track 54187: cut the text at a literal '</li>'.
        if '</li>' in buffer:
            buffer = buffer.split('</li>', 1)[0]

        print('버퍼 : ',
              buffer,
              len(buffer.encode('utf-8')),
              file=sys.stderr)
        # Only keep lyrics that fit the column's byte length.
        if len(buffer.encode('utf-8')) <= Music_VO.Lyrics.type.length:
            musicVO.Lyrics = buffer

    staffs = (line_info[1].find_all('ul', attrs={'class': 'con2'})
              if len(line_info) > 1 else [])
    for staff in staffs:
        title = staff.find('li', attrs={'class': 'title'})
        if title is None:
            continue
        role = title.get_text().strip()
        if role not in ('작사', '작곡'):
            continue

        # Collect the id (last path segment) of every linked person.
        staff_ids = [
            link['href'].strip().rsplit('/', 1)[1]
            for link in staff.find_all('a', href=True)
        ]
        if not staff_ids:
            continue
        if role == '작사':
            musicVO.Lyricist_ID = ','.join(staff_ids)
        else:
            musicVO.Composer_ID = ','.join(staff_ids)

    try:
        db_session.merge(musicVO)
        db_session.commit()
    except InternalError:
        db_session.rollback()
        try:
            import re
            # Whitelist: keyboard punctuation, ASCII letters and digits,
            # Hangul jamo and syllables; every other run becomes a space.
            pattern = re.compile(
                r'[^ ~`!@#$%^&*()_\-+={\[}\]:<.>/?\'\"\n\ta-zA-Z0-9\u3131-\u3163\uac00-\ud7a3]+'
            )

            musicVO.Lyrics = re.sub(pattern, ' ', musicVO.Lyrics)
            db_session.merge(musicVO)
            db_session.commit()
            cw_log({musicVO.Music_ID: 'SUCCESS[RE.Compile] - Lirics'})
        except Exception:
            # Last resort: store the track without lyrics. The lyrics are
            # the field being sanitised above, so that is what is cleared.
            print(" 완전 rollback", file=sys.stderr)
            db_session.rollback()
            musicVO.Lyrics = None
            db_session.merge(musicVO)
            db_session.commit()
            cw_log({musicVO.Music_ID: 'FAILURE - Lirics'})
    print('저장된 가사 : ', musicVO.Lyrics, file=sys.stderr)