import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver


def select_playlist(driver: webdriver, title_list, artist_list, image_list):
    # Wait for the playlist grid to load, then open each of the first 12
    # playlists and collect its tracks via select_song().
    time.sleep(3)
    for i in range(12):
        # Re-locate the playlist tiles on every iteration: the previous click
        # and driver.back() reload the page and stale the old elements.
        playlists = driver.find_elements_by_xpath(
            '//*[@id="container"]/section/div/ul/li')
        playlists[i].click()
        select_song(driver, title_list, artist_list, image_list)
        time.sleep(3)
        driver.back()
def select_top100(driver: webdriver):
    # Scrape the Bugs Top 100 chart and save it as an Excel file.
    url = "https://music.bugs.co.kr/chart"
    request = requests.get(url)
    html = request.text
    bs = BeautifulSoup(html, 'html.parser')

    titles = bs.select('p.title')
    artists = bs.select('p.artist')
    images = bs.select('a.thumbnail')

    rank = []
    title_list = []
    artist_list = []
    image_list = []

    for i in range(len(titles)):
        rank.append(i + 1)
        title = str(titles[i].find('a').text)

        anchor_list = artists[i].find_all('a')
        if len(anchor_list) > 1:
            # Multiple artists: the artist info is packed into the second
            # anchor's onclick attribute as a '||'-separated string.
            attr = anchor_list[1]['onclick']
            attr = attr.split("'")[1]
            attr = attr.split("||")
            # Drop purely numeric tokens (artist IDs)...
            attr = [token for token in attr if not token.isdigit()]
            # ...and join every other remaining token as the artist string.
            artist = ", ".join(attr[1::2])
        else:
            # Single artist: take the first line of the visible text.
            artist = artists[i].text.strip().split('\n')[0]

        image = images[i].find('img')['src']

        title_list.append(title)
        artist_list.append(artist)
        image_list.append(image)

    top100_df = pd.DataFrame(
        list(zip(rank, title_list, artist_list, image_list)),
        columns=['Rank', 'Title', 'Artist', 'Image'])
    top100_df.to_excel("./data/Top100.xlsx", index=False)
    driver.back()
def select_song(driver: webdriver, title_list, artist_list, image_list):
    # Scrape every track on the page the driver is currently showing and
    # append the results to the shared title/artist/image lists.
    url = driver.current_url
    request = requests.get(url)
    html = request.text
    bs = BeautifulSoup(html, 'html.parser')

    titles = bs.select('p.title')
    artists = bs.select('p.artist')
    images = bs.select('a.thumbnail')

    for i in range(len(titles)):
        # Skip tracks marked "[권리없는 곡]" (a song without streaming rights).
        if '[권리없는 곡]' in str(titles[i].text):
            continue
        title = str(titles[i].find('a').text)

        anchor_list = artists[i].find_all('a')
        if len(anchor_list) > 1:
            # Multiple artists: parse the '||'-separated string from the
            # second anchor's onclick attribute.
            attr = anchor_list[1]['onclick']
            attr = attr.split("'")[1]
            attr = attr.split("||")
            # Drop purely numeric tokens (artist IDs)...
            attr = [token for token in attr if not token.isdigit()]
            # ...and join every other remaining token as the artist string.
            artist = ", ".join(attr[1::2])
        else:
            artist = artists[i].text.strip().split('\n')[0]

        image = images[i].find('img')['src']

        title_list.append(title)
        artist_list.append(artist)
        image_list.append(image)

    driver.back()
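# A minimal usage sketch, not part of the original script: it assumes a local
# chromedriver on PATH and reuses the Bugs chart URL from select_top100() as
# the starting page. The order of calls and the output path are illustrative.
if __name__ == '__main__':
    driver = webdriver.Chrome()
    try:
        driver.get("https://music.bugs.co.kr/chart")
        select_top100(driver)  # writes ./data/Top100.xlsx

        title_list, artist_list, image_list = [], [], []
        select_song(driver, title_list, artist_list, image_list)
        print('Collected {:d} songs'.format(len(title_list)))
    finally:
        driver.quit()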
from time import sleep

# extract_pdf_from_link() and find_next_page_button() are helpers defined
# elsewhere in the script.


def read_pages(driver: webdriver, last_known_pdf: str) -> str:
    # Walk through the result pages, download every PDF link and return the
    # name of the newest PDF (the first one seen). Stops when the last known
    # PDF is reached or when there is no "next page" button anymore.

    # Reset counters
    new_head_pdf = ''
    cnt_pdf = 0
    cnt_html = 0
    page = 0

    while True:
        page = page + 1
        print('----------------------------------------------')
        print('Downloading page {:d}'.format(page))
        print()

        # Scroll down to show the page number
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        sleep(1)

        # Get links and extract direct PDF URLs from them
        pdf_links = driver.find_elements_by_css_selector(
            "a[id*='urlAbfrage'][href*='.pdf']")
        pdf_urls = []
        for pdf_link in pdf_links:
            pdf_urls.append(pdf_link.get_attribute("href"))

        # Get the HTML links
        html_links = driver.find_elements_by_css_selector(
            "a[id*='urlAbfrage'][href*='.html']")
        html_urls = []
        for html_link in html_links:
            html_urls.append(html_link.get_attribute("href"))

        # Download all the PDFs to the default download directory
        error = False
        for pdf_url in pdf_urls:
            try:
                # Get short name -> use part after last '/', before '?'
                pdf_name = extract_pdf_from_link(pdf_url)
                # Sometimes a "Termingebundenes" document is linked as .pdf
                # despite being HTML -> skip it
                if 'Termingebundenes' in pdf_name:
                    cnt_html = cnt_html + 1
                    print('Skip {:4d}: {:s}'.format(cnt_html, pdf_name))
                else:
                    cnt_pdf = cnt_pdf + 1

                    # If this PDF is the last known one, we are done
                    if pdf_name == last_known_pdf:
                        print('------------------------------------')
                        print("found last known pdf: '{:s}'".format(
                            last_known_pdf))
                        print('exiting...')
                        return new_head_pdf

                    # The very first PDF becomes the new head
                    if cnt_pdf == 1:
                        new_head_pdf = pdf_name
                        print("updated new head to url: '{:s}'".format(pdf_name))

                    driver.get(pdf_url)
                    sleep(0.1)
                    print('Get {:4d}: {:s}'.format(cnt_pdf, pdf_name))
            except Exception:
                driver.back()
                print('Error, failed to load {:s}'.format(pdf_url))
                error = True
                break

        # Go to the next page
        if not error:
            # Show how many URLs were skipped
            for html_url in html_urls:
                cnt_html = cnt_html + 1
                html_url_short = html_url.split('/')[-1]
                print('Skip {:4d}: {:s}'.format(cnt_html, html_url_short))
            print()

            # Check if there is another right button - stop if not
            try:
                right_button = find_next_page_button(driver)
            except Exception:
                print('----------------------------------------------')
                print('Downloaded -> {:5d} documents'.format(cnt_pdf))
                print('Skipped -> {:5d} documents'.format(cnt_html))
                print('No more right button -> End of download')
                print('----------------------------------------------')
                break
            driver.execute_script("arguments[0].click();", right_button)
        # Stop on error
        else:
            break

    return new_head_pdf
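# A minimal driver-setup sketch, not part of the original script: read_pages()
# relies on driver.get(pdf_url) saving the file instead of rendering it, so
# the browser must be configured to download PDFs to a fixed directory. The
# download directory, start URL and last-known PDF name below are placeholders.
def make_download_driver(download_dir: str) -> webdriver.Chrome:
    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', {
        'download.default_directory': download_dir,   # where PDFs are saved
        'download.prompt_for_download': False,        # no "Save as" dialog
        'plugins.always_open_pdf_externally': True,   # download instead of viewing
    })
    return webdriver.Chrome(options=options)

# Usage (illustrative):
#   driver = make_download_driver('/tmp/downloads')
#   driver.get('https://www.example.org/abfrage')  # placeholder listing page
#   newest = read_pages(driver, last_known_pdf='previous_head.pdf')
#   driver.quit()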