# Start Chrome with the previously configured options and load the listing
# page whose table holds one sound file per row.
# NOTE(review): `chrome_options=` and `find_elements_by_xpath` are the legacy
# Selenium spellings; newer Selenium releases use `options=` and
# `find_elements(By.XPATH, ...)`. Confirm the pinned Selenium version before
# upgrading.
swdriver = webdriver.Chrome(chrome_options=chromeOptions)
swdriver.get(url)

# Collect every <tr> of the table. The first row is skipped below (the
# original code treats it as a non-data row), so only the remainder is
# counted and scraped.
table_body_elements = swdriver.find_elements_by_xpath(
    "/html/body/div[4]/div/div/table/tbody/tr")
data_rows = table_body_elements[1:]

# Using len(data_rows) instead of len(table_body_elements) - 1 gives the same
# count for a non-empty table and 0 (rather than -1) when nothing matched.
print('Found {} sound file links on this page... \n'.format(len(data_rows)))
print("Scraping for URL pairs...")
progress = ProgressBar(len(data_rows), fmt=ProgressBar.FULL)

# Each entry is [sound_url, metadata_url] for one table row.
sound_metadata_url_pairs = []
for row in data_rows:
    # Advance and redraw the progress bar for every row, including rows that
    # end up being skipped, so the bar still reaches its total.
    progress.current += 1
    progress()

    # The sound file link sits in the 4th cell; a row without one is
    # skipped instead of aborting the whole scrape with an IndexError.
    sound_links = row.find_elements_by_xpath(".//td[4]/a")
    if not sound_links:
        continue
    sound_el = sound_links[0]

    # get_attribute returns None when the attribute is absent; skip such
    # rows rather than recording the literal string "None".
    temp_sound_url = sound_el.get_attribute("href")
    if not temp_sound_url:
        continue

    # The file id is the last path segment of the sound URL, up to its
    # first dot.
    temp_file_id = temp_sound_url.split("/")[-1].split(".")[0]

    # The metadata page URL has the same format for every file; only the
    # RN query parameter (the file id) changes.
    temp_metadata_url = "https://cis.whoi.edu/science/B/whalesounds/metaData.cfm?RN={}".format(
        temp_file_id)
    sound_metadata_url_pairs.append([temp_sound_url, temp_metadata_url])