import os

from selenium import webdriver

import scraper_tools

# get_update_date, handle_checkboxes, save_doc and split_lvl2_docs are assumed
# to be helpers defined elsewhere in this repo.


def code_pub_main(s3_bucket, s3_path, rs_table, base_loc, start_link):
    cwd = os.getcwd()
    chrome_options = webdriver.ChromeOptions()
    # set download folder
    # configure multiple file download and turn off prompt
    prefs = {
        'download.default_directory': base_loc,
        'profile.default_content_setting_values.automatic_downloads': 1,
        'download.prompt_for_download': False,
        'default_content_settings.automatic_downloads': 1,
        'profile.content_settings.exceptions.automatic_downloads': 1
    }
    chrome_options.add_experimental_option('prefs', prefs)
    keys_written = []
    city = start_link[0]
    links = start_link[1]
    print(city)
    for link in links:
        try:
            driver = webdriver.Chrome(f'{cwd}/chromedriver', options=chrome_options)
            print(link)
            driver.get(link)
            # find update date
            messy_date = get_update_date(driver)
            # find and click all necessary checkboxes
            driver = handle_checkboxes(driver, 0.4, 0.5)
            # save the document
            driver = save_doc(driver)
            update_date = scraper_tools.extract_date(messy_date[0])
            # put the file in the right folder and wait for files to download
            old_path = base_loc + city.replace(' ', '') + ".txt"
            new_path = scraper_tools.downloads_done(old_path, 36)
            path = scraper_tools.make_path(base_loc, city, update_date)
            new_path = path + city + ".txt"
            os.rename(old_path, new_path)
            # split document by lvl 2 sections
            lvl2_docs = split_lvl2_docs(new_path)
            # send each lvl 2 section to s3 as a separate doc
            for lvl2_header, lvl2_text in lvl2_docs.items():
                print(lvl2_header)
                key = scraper_tools.s3_file_writer(s3_bucket, s3_path, base_loc, city,
                                                   update_date, lvl2_header, lvl2_text)
                if key and (key not in list(rs_table.s3_key)):
                    keys_written.append(key)
            driver.close()
            driver.quit()
            return False, keys_written
        except Exception:
            return True, keys_written
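
# Hedged sketch (illustration only, not used by the scrapers): the
# scraper_tools.waiting_for_presence_of helper called below is external and its
# implementation is not shown here. From its call signature
# (driver, xpath, timeout, poll) it is assumed to wrap Selenium's WebDriverWait
# roughly like this; the real helper may differ.
def _waiting_for_presence_of_sketch(driver, xpath, timeout, poll):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # block until at least one element matching the xpath is attached to the DOM
    WebDriverWait(driver, timeout, poll_frequency=poll).until(
        EC.presence_of_element_located((By.XPATH, xpath))
    )
    return driver
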
def q_code_main(s3_bucket, s3_path, rs_table, base_loc, start_link):
    cwd = os.getcwd()
    chrome_options = webdriver.ChromeOptions()
    # set download folder
    # configure multiple file download and turn off prompt
    prefs = {
        'download.default_directory': base_loc,
        'profile.default_content_setting_values.automatic_downloads': 1,
        'download.prompt_for_download': False
    }
    chrome_options.add_experimental_option('prefs', prefs)
    missing_sections = 0
    keys_written = []
    my_xpath = "//div[@class='navChildren']//a"
    showall_xpath = "//a[@class='showAll']"
    high_title_xpath = "//div[@class='currentTopic']"
    low_title_xpath = "//div[@class='navTocHeading']"
    content_xpath = "//div[@class='content-fragment']"
    up_xpath = "//a[@accesskey='u']"
    city = start_link[0]
    links = start_link[1]
    print(city)
    for link in links:
        my_doc = [city]
        try:
            # level 1
            driver = webdriver.Chrome(f'{cwd}/chromedriver', options=chrome_options)
            print(link)
            driver.get(link)
            # get last updated date
            driver.switch_to.frame('LEFT')
            date_xpath = "//body[@class='preface']//p"
            scraper_tools.waiting_for_presence_of(driver, date_xpath, 3, 0.1)
            left_text = driver.find_elements_by_xpath(date_xpath)
            for p in left_text:
                if 'current' in p.text.lower():
                    update_date_messy = p.text
                    my_doc.append(update_date_messy)
            driver.switch_to.default_content()
            driver.switch_to.frame('RIGHT')
            scraper_tools.waiting_for_presence_of(driver, my_xpath, 3, 0.1)
            # level 2
            if len(driver.find_elements_by_xpath(my_xpath)) <= 4:
                for h_sec_num in range(len(driver.find_elements_by_xpath(my_xpath))):
                    h_sections = driver.find_elements_by_xpath(my_xpath)
                    level2_title = h_sections[h_sec_num].text
                    if 'code' in level2_title.lower():
                        scraper_tools.click_n_wait(driver, my_xpath, h_sections, h_sec_num, 3, 0.1)
            for h_sec_num in range(len(driver.find_elements_by_xpath(my_xpath))):
                h_sections = driver.find_elements_by_xpath(my_xpath)
                level2_title = h_sections[h_sec_num].text
                print(level2_title)
                my_doc.append(level2_title)
                if ('reserved' in level2_title.lower()) or (level2_title.lower() == 'note'):
                    continue
                scraper_tools.click_n_wait(driver, my_xpath, h_sections, h_sec_num, 3, 0.1)
                # level 3
                for l_sec_num in range(len(driver.find_elements_by_xpath(my_xpath))):
                    try:
                        l_sections = driver.find_elements_by_xpath(my_xpath)
                        my_doc.append(l_sections[l_sec_num].text)
                        # skip sections that are reserved or notes
                        if ('reserved' in l_sections[l_sec_num].text.lower()) or (l_sections[l_sec_num].text.lower() == 'note'):
                            continue
                        l_sections[l_sec_num].click()
                        # if there is no showall button use the brute force way to go back
                        if len(driver.find_elements_by_xpath(showall_xpath)) != 0:
                            scraper_tools.waiting_for_presence_of(driver, showall_xpath, 3, 0.1)
                            scraper_tools.find_click_n_wait(driver, showall_xpath, high_title_xpath, 0, 3, 0.1)
                            # get text
                            for content, l_title in zip(driver.find_elements_by_xpath(content_xpath),
                                                        driver.find_elements_by_xpath(low_title_xpath)):
                                my_doc.append(l_title.text)
                                my_doc.append(content.text)
                            # go to previous page
                            scraper_tools.find_click_n_wait(driver, up_xpath, my_xpath, 0, 3, 0.1)
                        elif len(driver.find_elements_by_xpath(content_xpath)) != 0:
                            # get text
                            for content in driver.find_elements_by_xpath(content_xpath):
                                my_doc.append(content.text)
                            # go to previous page
                            scraper_tools.find_click_n_wait(driver, up_xpath, my_xpath, 0, 3, 0.1)
                        else:
                            driver.get(link)
                            driver.switch_to.frame('RIGHT')
                            scraper_tools.find_click_n_wait(driver, my_xpath, my_xpath, h_sec_num, 3, 0.1)
                    except Exception:
                        my_doc.append("-_-_-missing-_-_-")
                        missing_sections += 1
                        scraper_tools.find_click_n_wait(driver, up_xpath, my_xpath, 0, 3, 0.1)
                update_date = scraper_tools.extract_date(update_date_messy)
                key = scraper_tools.s3_file_writer(s3_bucket, s3_path, base_loc, city,
                                                   update_date, level2_title, '\n'.join(my_doc))
                if key:
                    keys_written.append(key)
                my_doc = [city]
        except Exception:
            return True, keys_written
        driver.close()
        driver.quit()
    print("-" * 5)
    if missing_sections > 0:
        return True, keys_written
    return False, keys_written
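
# Hedged usage sketch (not part of the original module): the bucket, key
# prefix, download directory, city and link below are illustrative
# placeholders. rs_table is assumed to be a pandas DataFrame with an 's3_key'
# column, since code_pub_main checks new keys against rs_table.s3_key.
if __name__ == '__main__':
    import pandas as pd

    example_rs_table = pd.DataFrame({'s3_key': []})  # hypothetical: keys already written to s3
    example_start_link = ('Example City', ['https://codes.example.com/ExampleCity'])  # hypothetical (city, [links])

    errored, keys = code_pub_main(
        s3_bucket='example-bucket',       # hypothetical bucket name
        s3_path='municipal-codes/',       # hypothetical key prefix
        rs_table=example_rs_table,
        base_loc='/tmp/code_downloads/',  # hypothetical chrome download directory
        start_link=example_start_link,
    )
    print(errored, keys)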