Example No. 1
def select_playlist(driver: webdriver, title_list, artist_list, image_list):
    # Wait for the playlist grid to render
    time.sleep(3)

    # Visit the first 12 playlists on the page
    for i in range(12):
        # Re-query the list on every iteration to avoid stale element
        # references after select_song() navigates back
        playlists = driver.find_elements_by_xpath(
            '//*[@id="container"]/section/div/ul/li')
        playlists[i].click()

        # Collect title/artist/image data from the opened playlist
        select_song(driver, title_list, artist_list, image_list)
        time.sleep(3)

    driver.back()
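All of the snippets in this listing assume the same surrounding imports and an already-running Selenium driver, none of which appears here. The following is only a sketch of that setup; the ChromeDriver choice and the start URL are assumptions, not part of the original project. Note that the find_elements_by_* / find_element_by_* helpers used throughout come from the Selenium 3 API (they were removed in Selenium 4.3), so a 3.x release is assumed.

import time
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Assumed setup: any Selenium 3 driver works; Chrome is only an example.
driver = webdriver.Chrome()
driver.get("https://music.bugs.co.kr")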
Example No. 2
def select_top100(driver: webdriver):
    # Fetch the Bugs Top 100 chart page and parse it with BeautifulSoup
    url = "https://music.bugs.co.kr/chart"
    request = requests.get(url)
    html = request.text
    bs = BeautifulSoup(html, 'html.parser')

    titles = bs.select('p.title')
    artists = bs.select('p.artist')
    images = bs.select('a.thumbnail')

    rank = []
    title_list = []
    artist_list = []
    image_list = []

    for i in range(len(titles)):
        rank.append(i + 1)

        title = str(titles[i].find('a').text)

        anchor_list = artists[i].find_all('a')
        if len(anchor_list) > 1:
            # Multi-artist track: the names are packed into the second
            # anchor's onclick attribute as a "||"-separated string
            attr = anchor_list[1]['onclick']
            attr = attr.split("'")
            attr = attr[1]
            attr = attr.split("||")

            # Drop the numeric ID tokens (filter instead of removing items
            # from the list while iterating over it)
            attr = [word for word in attr if not word.isdigit()]

            artist = attr[1::2]
            artist = ", ".join(artist)
        else:
            # Single artist: take the first line of the anchor text
            artist = artists[i].text.strip().split('\n')[0]

        image = images[i].find('img')['src']

        title_list.append(title)
        artist_list.append(artist)
        image_list.append(image)

    # Build the chart table and export it to Excel
    data = zip(rank, title_list, artist_list, image_list)
    top100_df = pd.DataFrame(
        list(data), columns=['Rank', 'Title', 'Artist', 'Image'])

    top100_df.to_excel("./data/Top100.xlsx", index=False)

    # Return the Selenium browser to the previous page
    driver.back()
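select_top100 writes to ./data/Top100.xlsx, so the data directory has to exist before the export, and pandas typically needs the openpyxl package for .xlsx I/O. A small sketch for preparing the folder and sanity-checking the result; the directory handling is not part of the original snippet.

import os

import pandas as pd

# Make sure the output folder used by select_top100 exists
os.makedirs("./data", exist_ok=True)

# After select_top100(driver) has run, the export can be checked like this:
df = pd.read_excel("./data/Top100.xlsx")
print(df.head())  # Rank, Title, Artist, Image columns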
Example No. 3
def select_song(driver: webdriver, title_list, artist_list, image_list):
    # Parse the playlist page the driver is currently on
    url = driver.current_url
    request = requests.get(url)
    html = request.text
    bs = BeautifulSoup(html, 'html.parser')

    titles = bs.select('p.title')
    artists = bs.select('p.artist')
    images = bs.select('a.thumbnail')

    for i in range(len(titles)):
        # Skip tracks marked '[권리없는 곡]' (tracks without streaming rights)
        if '[권리없는 곡]' in str(titles[i].text):
            continue

        title = str(titles[i].find('a').text)

        anchor_list = artists[i].find_all('a')
        if len(anchor_list) > 1:
            # Multi-artist track: the names are packed into the second
            # anchor's onclick attribute as a "||"-separated string
            attr = anchor_list[1]['onclick']
            attr = attr.split("'")
            attr = attr[1]
            attr = attr.split("||")

            # Drop the numeric ID tokens (filter instead of removing items
            # from the list while iterating over it)
            attr = [word for word in attr if not word.isdigit()]

            artist = attr[1::2]
            artist = ", ".join(artist)
        else:
            # Single artist: take the first line of the anchor text
            artist = artists[i].text.strip().split('\n')[0]

        image = images[i].find('img')['src']

        title_list.append(title)
        artist_list.append(artist)
        image_list.append(image)

    # Return the browser to the playlist overview
    driver.back()
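select_playlist and select_song only append to the lists passed in; turning the collected rows into a spreadsheet is left to the caller. A sketch of that glue code, assuming the driver is already on a Bugs playlist overview page; the output path is a made-up example, not taken from the original project.

title_list, artist_list, image_list = [], [], []

# The driver is assumed to already be on a playlist overview page
select_playlist(driver, title_list, artist_list, image_list)

playlist_df = pd.DataFrame(
    {'Title': title_list, 'Artist': artist_list, 'Image': image_list})
playlist_df.to_excel("./data/Playlists.xlsx", index=False)  # hypothetical path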
Example No. 4
def read_pages(driver: webdriver, last_known_pdf: str) -> str:
    # Walks the result pages, downloading every PDF until last_known_pdf is
    # reached; returns the name of the newest PDF seen (the new head)

    # Reset counters
    new_head_pdf = ''
    cnt_pdf = 0
    cnt_html = 0
    page = 0

    while True:
        page = page + 1
        print('----------------------------------------------')
        print('Downloading page {:d}'.format(page))
        print()

        # Scroll down to show the page number
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        sleep(1)

        # Get links and extract direct PDF-URLs from the links
        pdf_links = driver.find_elements_by_css_selector(
            "a[id*='urlAbfrage'][href*='.pdf']")
        pdf_urls = [link.get_attribute("href") for link in pdf_links]

        # Get the HTML links
        html_links = driver.find_elements_by_css_selector(
            "a[id*='urlAbfrage'][href*='.html']")
        html_urls = [link.get_attribute("href") for link in html_links]

        # Download all the PDFs to the default directory
        error = False

        for pdf_url in pdf_urls:

            try:
                # Get short URL -> use part after last '/', use part before '?'
                pdf_name = extract_pdf_from_link(pdf_url)

                # Sometimes a 'Termingebundenes' entry links to a .pdf despite
                # actually being HTML -> check and skip it
                if 'Termingebundenes' in pdf_name:
                    cnt_html = cnt_html + 1
                    print('Skip {:4d}: {:s}'.format(cnt_html, pdf_name))

                else:  # Skip possibly HTML
                    cnt_pdf = cnt_pdf + 1

                    # check if pdf is last-known-pdf, then done
                    if pdf_name == last_known_pdf:
                        print('------------------------------------')
                        print("found last known pdf: '{:s}'".format(
                            last_known_pdf))
                        print('exiting...')

                        return new_head_pdf

                    # Use it as the new head pdf if it's the first one
                    if cnt_pdf == 1:
                        new_head_pdf = pdf_name
                        print(
                            "updated new head to url: '{:s}'".format(pdf_name))

                    driver.get(pdf_url)
                    sleep(0.1)
                    print('Get  {:4d}: {:s}'.format(cnt_pdf, pdf_name))

            except Exception:
                driver.back()
                print('Error, failed to load {:s}'.format(pdf_url))
                error = True
                break

        # Go to the next page
        if not error:

            # Show how many HTML URLs were skipped
            for html_url in html_urls:
                cnt_html = cnt_html + 1
                html_url_short = html_url.split('/')[-1]
                print('Skip {:4d}: {:s}'.format(cnt_html, html_url_short))
            print()

            # Check if there is another right button - stop if not
            try:
                right_button = find_next_page_button(driver)
            except Exception:
                print('----------------------------------------------')
                print('Downloaded -> {:5d} documents'.format(cnt_pdf))
                print('Skipped    -> {:5d} documents'.format(cnt_html))
                print('No more right button -> End of download')
                print('----------------------------------------------')
                break
            driver.execute_script("arguments[0].click();", right_button)

        # Stop on error
        else:
            break
    return new_head_pdf
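read_pages relies on two helpers that are not part of this listing, extract_pdf_from_link and find_next_page_button. Their real implementations are unknown; the sketch below only mirrors what the comments and the control flow imply, and the CSS selector for the next-page button is purely hypothetical.

def extract_pdf_from_link(pdf_url: str) -> str:
    # "use part after last '/', use part before '?'" - as described in read_pages
    return pdf_url.split('/')[-1].split('?')[0]

def find_next_page_button(driver):
    # Must raise (e.g. NoSuchElementException) when there is no further page,
    # because read_pages treats that exception as the end of the pagination.
    return driver.find_element_by_css_selector("a.next-page")  # hypothetical selector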