def crawling_Artist(um):
    """Crawl one artist page and persist it as an Artist_VO.

    Parameters:
        um: UrlMaker whose NODE / END_POINT / URL identify the artist page.

    Side effects:
        On a page that actually contains an artist_info block, merges the
        populated Artist_VO into db_session and commits.
    """
    artistVO = Artist_VO()
    artistVO.Artist_ID = um.END_POINT
    artistVO.Artist_Node = '/'.join([um.NODE, str(um.END_POINT)])
    artistVO.Group = False  # default; flipped below when the page says group

    html = crawler.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')
    tag_artist_info = bs.find('div', attrs={'class': 'artist_info'})
    # Pages without an artist_info block are "no such artist" pages: skip.
    if tag_artist_info is not None:
        singer = tag_artist_info.find('a', attrs={'class': 'song_name'})
        if singer is not None:
            artistVO.Artist_Name = singer.get_text()
        else:
            # Fallback layout: the name sits in the top_left list item.
            artistVO.Artist_Name = tag_artist_info.find(
                'li', attrs={
                    'class': 'top_left'
                }).find('p').get_text().strip()
        print("############# strip 결과 #############\n",
              artistVO.Artist_Name,
              "\n############# strip 결과 #############\n")
        tags = tag_artist_info.findAll('span', attrs={'class': 'right'})
        for tag in tags:
            # Collapse every whitespace variant (incl. NBSP), then split the
            # "a | b | c" attribute string into its fields.
            text_list = tag.get_text().strip().replace(' ', '').replace(
                '\r', '').replace('\n', '').replace('\t', '').replace(
                    '\xa0', '').split('|')
            for text in text_list:
                if text in ('남성', '여성', '혼성'):
                    artistVO.Gender = text
                if text == '그룹':
                    artistVO.Group = True
        db_session.merge(artistVO)
        db_session.commit()
        print(artistVO)
def collecting_music():
    """Walk album IDs from 10740 upward and crawl each album page.

    On a failure, backs off for a minute and retries the same ID once; if
    the retry also fails the ID is logged and skipped so one bad page can
    no longer abort the whole collection run.  Commits every 5th ID as a
    safety net (crawling_album also commits on success).
    """
    um = UrlMaker()
    for album_id in range(10740, 10000000):
        um.set_param(node=URL_Node.ALBUM, end_point=album_id)
        try:
            crawling_album(um)
        except Exception as e:
            sleep(60 * 1)  # back off; likely a transient network/server error
            print(e)
            try:
                # One retry; if it fails again, skip this ID instead of
                # crashing the entire loop.
                crawling_album(um)
            except Exception as retry_error:
                print(retry_error)
            continue
        if album_id % 5 == 0:
            db_session.commit()
def crawling_album(um=None):
    """Crawl one album page (um points at it) and persist it as an Album_VO.

    Parameters:
        um: UrlMaker for the album page.  Defaults to a fresh UrlMaker per
            call; the old `um=UrlMaker()` default was a mutable default
            argument — a single instance silently shared across calls.

    Side effects:
        Merges the Album_VO into db_session and commits.  On an
        InternalError (description contains characters the DB rejects) the
        description is sanitized, or dropped entirely, before retrying.
    """
    if um is None:
        um = UrlMaker()
    albumVO = Album_VO()
    albumVO.Album_ID = um.END_POINT
    albumVO.Album_Node = '/'.join([um.NODE, str(um.END_POINT)])

    html = cw.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')
    tag_Album_info = bs.find('div', attrs={'class': 'album_info'})
    if tag_Album_info is not None:
        # Two-step extraction: the bare <li> tag would otherwise end up
        # stored as the album title.
        albumVO.Album_Title = tag_Album_info.find('li',
                                                  attrs={'class': 'top_left'})
        if albumVO.Album_Title is not None:
            albumVO.Album_Title = albumVO.Album_Title.find(
                'p').get_text().strip()

        album_info = tag_Album_info.find('div', {'class': 'a_info_cont'})
        summary = album_info.find('dl').find('dd').findAll('p')
        for tag in summary:
            # Was attrs={'class', 'left'} (a *set*), which matched only via
            # bs4's non-dict attrs fallback; an explicit dict is intended.
            left_span = tag.find('span', attrs={'class': 'left'}).get_text()
            right_span = tag.find('span', attrs={'class': 'right'})
            if left_span == '아티스트':
                right_span_a_tag = right_span.find('a')
                if right_span_a_tag is not None:
                    # Artist link ends in .../<artist id>
                    albumVO.Singer_ID = int(
                        right_span_a_tag['href'].strip().rsplit('/', 1)[1])
                else:
                    # No artist link → treat as a compilation album.
                    albumVO.Singer_ID = Artist_VO.query.filter_by(
                        Artist_Name='Various Artists').first().Artist_ID
            if left_span == '발매일':
                albumVO.Release_Date = _parse_release_date(
                    right_span.get_text())
            if left_span == '기획사' or left_span == '레이블':
                albumVO.Agency = right_span.get_text().strip()
            if left_span == '유통사':
                albumVO.Distributor = right_span.get_text().strip()

        descriptions = album_info.find('div', attrs={'class': 'text_slider'})
        if descriptions is not None:
            descriptions = descriptions.findAll('p')
            desc = str(descriptions[-1].get_text())
            # Only keep descriptions that fit the DB column (byte length,
            # not character count — the column limit is in bytes).
            if len(desc.encode('utf-8')) <= Album_VO.Description.type.length:
                albumVO.Description = preprocessing_string(desc)
                print("앨범 설명 : ", albumVO.Description)

        try:
            db_session.merge(albumVO)
            db_session.commit()
        except InternalError:
            # The description probably contains characters the DB charset
            # rejects: strip everything outside Hangul / ASCII keyboard
            # symbols / letters / digits and retry.
            db_session.rollback()
            try:
                import re
                pattern = re.compile(
                    u'[^ ~`!@#$%^&*()_\-+={\[}\]:<.>/?\'\"\n\ta-zA-Z0-9\u3131-\u3163\uac00-\ud7a3]+'
                )  # Hangul, keyboard specials, English letters, digits
                albumVO.Description = re.sub(pattern, ' ',
                                             albumVO.Description)
                print("앨범 설명 : ", albumVO.Description)
                db_session.merge(albumVO)
                db_session.commit()
                cw_log({albumVO.Album_ID: 'SUCCESS[RE.Compile] - Desc'})
            except Exception:
                # Even the sanitized description fails: store without it.
                print(" 완전 rollback", file=sys.stderr)
                cw_log({albumVO.Album_ID: 'FAILURE - Desc'})
                db_session.rollback()
                albumVO.Description = None
                db_session.merge(albumVO)
                db_session.commit()
        finally:
            print(albumVO)


def _parse_release_date(raw):
    """Parse a possibly-partial 'YYYY.MM.DD' string into a datetime.

    Missing or out-of-range components fall back to 1, and an impossible
    day for the parsed year/month (e.g. Feb 30) is clamped to 1.

    BUG FIX vs. the original inline code: the day component was never
    assigned from the parsed data (the datetime() validation ran while the
    day slot still held its default of 1), so every album's day was 1.
    """
    ymd = [1, 1, 1]
    ymd_data = [int(x) for x in raw.split('.')]
    for i in range(len(ymd_data)):  # iterate only the components present
        if i == 0 and 0 < ymd_data[i]:
            ymd[i] = ymd_data[i]
        elif i == 1 and 0 < ymd_data[i] < 13:
            ymd[i] = ymd_data[i]
        elif i == 2 and 0 < ymd_data[i] < 32:
            ymd[i] = ymd_data[i]
            try:
                datetime(ymd[0], ymd[1], ymd[2])
            except ValueError:
                ymd[i] = 1  # day impossible for this year/month
    return datetime(ymd[0], ymd[1], ymd[2])
# NOTE(review): orphaned fragment — the enclosing function's header and the
# expression that this first line closes are not visible in this chunk; it
# references `summary`, `albumVO` and `musicVO` which must be bound by the
# missing code.  Code kept byte-identical; review notes added inline.
            }).get_text().strip()
        left_attrs = summary.find('li', attrs={
            'class': 'left_con'
        }).findAll('p', attrs='left')  # NOTE(review): attrs='left' is a string,
        # not {'class': 'left'} — presumably relies on bs4's non-dict attrs
        # fallback matching the class; verify against the live markup.
        right_attrs = summary.find('li', attrs={
            'class': 'left_con'
        }).findAll('p', attrs='right')
        # Walk the label/value column pairs of the summary table.
        for i in range(0, len(left_attrs)):
            if left_attrs[i].get_text().strip() == '발매일':
                # NOTE(review): stores the raw text here, while crawling_album
                # stores a datetime in Release_Date — inconsistent types.
                albumVO.Release_Date = right_attrs[i].get_text()
            if left_attrs[i].get_text().strip() == '음악장르':
                musicVO.Genre = right_attrs[i].get_text().strip()
            if left_attrs[i].get_text().strip() == '작사':
                for lyricist in right_attrs[i].findAll('a', attrs='href'):
                    # NOTE(review): int(...) is applied to the *list* returned
                    # by rsplit (TypeError at runtime), filter_by yields a
                    # Query rather than a row, and the append targets the
                    # *class* attribute Music_VO.Lyricist instead of musicVO —
                    # this branch looks like it was never exercised.
                    Music_VO.Lyricist.append(
                        Artist_VO.query.filter_by(
                            Artist_ID=int(lyricist.strip().rsplit('/', 1))))
            if left_attrs[i].get_text().strip() == '작곡':
                for comporser in right_attrs[i].findAll('a', attrs='href'):
                    # NOTE(review): same issues as the '작사' branch above.
                    Music_VO.Composer.append(
                        Artist_VO.query.filter_by(
                            Artist_ID=int(comporser.strip().rsplit('/', 1))))
        db_session.merge(albumVO)
        db_session.commit()
        db_session.merge(musicVO)
        db_session.commit()
        print(musicVO, albumVO)
def crawling_track(um):
    """Crawl one track page (um points at it) and persist it as a Music_VO.

    Parameters:
        um: UrlMaker whose NODE / END_POINT / URL identify the track page.

    Extracts the album link, title, genre, lyrics, and the comma-joined
    lyricist/composer ID lists, then merges the Music_VO into db_session.
    On an InternalError (lyrics contain characters the DB rejects) the
    lyrics are sanitized, or dropped entirely, before retrying.
    """
    musicVO = Music_VO()
    musicVO.Music_ID = um.END_POINT
    musicVO.Music_Node = '/'.join([um.NODE, str(um.END_POINT)])
    html = cw.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')
    tag_music_info = bs.find('div', attrs={'class': 'music_info_view'})
    # Pages without music_info_view are "no such track" pages: do nothing.
    if tag_music_info is not None:
        summary = tag_music_info.find('div',
                                      attrs={'class': 'music_info_cont'})
        album_tag = summary.find('table').find('a')
        if album_tag is not None:
            musicVO.Album_Node = album_tag['href'].strip(" ")
            # Album link ends in .../<album id>
            musicVO.Album_ID = int(musicVO.Album_Node.rsplit('/', 1)[1])
        # Two-step extraction so the bare <li> tag is never stored as title.
        musicVO.Music_Title = tag_music_info.find('li',
                                                  attrs={'class': 'top_left'})
        if musicVO.Music_Title is not None:
            musicVO.Music_Title = musicVO.Music_Title.find(
                'p').get_text().strip()
        try:
            left_attrs = summary.find('li', attrs={
                'class': 'left_con'
            }).findAll('p', attrs={'class': 'left'})
            right_attrs = summary.find('li', attrs={
                'class': 'left_con'
            }).findAll('p', attrs={'class': 'right'})
        except AttributeError:
            # Alternate page layout: the attribute table lives under dd.con.
            attrs_list = bs.find('dd', attrs={'class': 'con'})
            left_attrs = attrs_list.find('li', attrs={
                'class': 'left_con'
            }).findAll('p', attrs={'class': 'left'})
            right_attrs = attrs_list.find('li', attrs={
                'class': 'left_con'
            }).findAll('p', attrs={'class': 'right'})
        # Walk the label/value column pairs of the summary table.
        for label, value in zip(left_attrs, right_attrs):
            if label.get_text().strip() == '음악장르':
                musicVO.Genre = value.get_text().strip()

        line_info = bs.findAll('div', attrs={'class': 'line_info'})
        lyric = line_info[0].find('li', attrs={'id': 'lyricsText'})
        if lyric is not None:
            buffer = lyric.get_text().replace('\n', '').replace(
                '\t', '').replace('<br/>', '\n').strip()
            # Defensive: some pages leak a literal '</li>' into the text
            # (seen on track 54187); keep only what precedes it.
            if '</li>' in buffer:
                buffer = buffer.split('</li>', 1)[0]
            print('버퍼 : ', buffer, len(buffer.encode('utf-8')),
                  file=sys.stderr)
            # Only store lyrics that fit the DB column (byte length, not
            # character count — the column limit is in bytes).
            if len(buffer.encode('utf-8')) <= Music_VO.Lyrics.type.length:
                musicVO.Lyrics = buffer

        # The second line_info block, when present, holds the staff credits.
        staffs = (line_info[1].findAll('ul', attrs={'class': 'con2'})
                  if len(line_info) > 1 else None)
        if staffs is not None:
            for staff in staffs:
                role = staff.find('li', attrs={
                    'class': 'title'
                }).get_text().strip()
                if role == '작사':
                    lyricists = staff.findAll('a')
                    if len(lyricists) != 0:
                        # Each anchor's href ends in .../<artist id>.
                        musicVO.Lyricist_ID = ','.join(
                            a['href'].strip().rsplit('/', 1)[1]
                            for a in lyricists)
                if role == '작곡':
                    comporsers = staff.findAll('a')
                    if len(comporsers) != 0:
                        musicVO.Composer_ID = ','.join(
                            a['href'].strip().rsplit('/', 1)[1]
                            for a in comporsers)

        try:
            db_session.merge(musicVO)
            db_session.commit()
        except InternalError:
            # The lyrics probably contain characters the DB charset rejects:
            # strip everything outside Hangul / keyboard symbols / letters /
            # digits and retry.
            db_session.rollback()
            try:
                import re
                pattern = re.compile(
                    u'[^ ~`!@#$%^&*()_\-+={\[}\]:<.>/?\'\"\n\ta-zA-Z0-9\u3131-\u3163\uac00-\ud7a3]+'
                )  # Hangul, keyboard specials, English letters, digits
                musicVO.Lyrics = re.sub(pattern, ' ', musicVO.Lyrics)
                db_session.merge(musicVO)
                db_session.commit()
                cw_log({musicVO.Music_ID: 'SUCCESS[RE.Compile] - Lirics'})
            except Exception:
                print(" 완전 rollback", file=sys.stderr)
                db_session.rollback()
                # BUG FIX: the original cleared musicVO.Description, a field
                # this function never sets — the value that made the commit
                # fail is Lyrics, so that is what must be dropped for the
                # final fallback commit to succeed.
                musicVO.Lyrics = None
                db_session.merge(musicVO)
                db_session.commit()
                cw_log({musicVO.Music_ID: 'FAILURE - Lirics'})
        print('저장된 가사 : ', musicVO.Lyrics, file=sys.stderr)