Example #1
def code_pub_main(s3_bucket, s3_path, rs_table, base_loc, start_link):
    cwd = os.getcwd()
    chrome_options = webdriver.ChromeOptions()
    # set download folder
    # configure multiple file download and turn off prompt
    prefs = {'download.default_directory': base_loc,
             'profile.default_content_setting_values.automatic_downloads': 1,
             'download.prompt_for_download': False,
             'default_content_settings.automatic_downloads': 1,
             'profile.content_settings.exceptions.automatic_downloads': 1}
    chrome_options.add_experimental_option('prefs', prefs)
    failed_cities = []
    city = start_link[0]
    links = start_link[1]
    print(city)
    for link in links:
        try:
            driver = webdriver.Chrome(f'{cwd}/chromedriver', options=chrome_options)
            print(link)
            driver.get(link)
            # find update date
            messy_date = get_update_date(driver)
            # find and click all necessary checkboxes
            driver = handle_checkboxes(driver, 0.4, 0.5)
            # save the document
            driver = save_doc(driver)
            update_date = scraper_tools.extract_date(messy_date)
            # puts file in right folder and waits for files to download
            old_path = base_loc+city+".txt"
            old_path = scraper_tools.downloads_done(old_path, 36)
            path = scraper_tools.make_path(base_loc, city, update_date)
            new_path = path+city+".txt"
            os.rename(old_path, new_path)
            lvl2_docs = split_lvl2_docs(new_path)
            for lvl2_header, lvl2_text in lvl2_docs.items():
                scraper_tools.s3_file_writer(s3_bucket, s3_path, base_loc, city, update_date, lvl2_header, lvl2_text)
            driver.close()
            driver.quit()
            return False
        except Exception:
            return True
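The example above relies on scraper_tools.downloads_done to block until Chrome has finished writing the exported file before it is renamed. A minimal sketch of what such a helper could look like, assuming it simply polls for the expected file and for the disappearance of Chrome's temporary .crdownload companion file (the real helper is not shown in these listings):

import os
import time

def downloads_done(expected_path, timeout_seconds):
    # Poll the download folder until the expected file exists and Chrome's
    # temporary .crdownload file is gone, then return the finished path.
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        partial = expected_path + ".crdownload"
        if os.path.exists(expected_path) and not os.path.exists(partial):
            return expected_path
        time.sleep(1)
    raise TimeoutError(f"download did not finish within {timeout_seconds}s: {expected_path}")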
Example #2
def q_code_main(s3_bucket, s3_path, rs_table, base_loc, start_link):
    cwd = os.getcwd()
    chrome_options = webdriver.ChromeOptions()
    # set download folder
    # configure multiple file download and turn off prompt
    prefs = {'download.default_directory': base_loc,
             'profile.default_content_setting_values.automatic_downloads': 1,
             'download.prompt_for_download': False}
    chrome_options.add_experimental_option('prefs', prefs)
    missing_sections = 0
    keys_written = []
    my_xpath = "//div[@class='navChildren']//a"
    showall_xpath = "//a[@class='showAll']"
    high_title_xpath = "//div[@class='currentTopic']"
    low_title_xpath = "//div[@class='navTocHeading']"
    content_xpath = "//div[@class='content-fragment']"
    up_xpath = "//a[@accesskey='u']"
    city = start_link[0]
    links = start_link[1]
    print(city)
    for link in links:
        my_doc = [city]
        try:
            # level 1
            driver = webdriver.Chrome(f'{cwd}/chromedriver', options=chrome_options)
            print(link)
            driver.get(link)
            # get last updated date
            driver.switch_to.frame('LEFT')
            date_xpath = "//body[@class='preface']//p"
            scraper_tools.waiting_for_presence_of(driver, date_xpath, 3, 0.1)
            left_text = driver.find_elements_by_xpath(date_xpath)
            for p in left_text:
                if 'current' in p.text.lower():
                    update_date_messy = p.text
                    my_doc.append(update_date_messy)
            driver.switch_to.default_content()
            driver.switch_to.frame('RIGHT')
            scraper_tools.waiting_for_presence_of(driver, my_xpath, 3, 0.1)
            # level 2
            if len(driver.find_elements_by_xpath(my_xpath)) <= 4:
                for h_sec_num in range(len(driver.find_elements_by_xpath(my_xpath))):
                    h_sections = driver.find_elements_by_xpath(my_xpath)
                    level2_title = h_sections[h_sec_num].text
                    if 'code' in level2_title.lower():
                        scraper_tools.click_n_wait(driver, my_xpath, h_sections, h_sec_num, 3, 0.1)
            for h_sec_num in range(len(driver.find_elements_by_xpath(my_xpath))):
                h_sections = driver.find_elements_by_xpath(my_xpath)
                level2_title = h_sections[h_sec_num].text
                print(level2_title)
                my_doc.append(level2_title)
                if ('reserved' in level2_title.lower()) or (level2_title.lower() == 'note'):
                    continue
                scraper_tools.click_n_wait(driver, my_xpath, h_sections, h_sec_num, 3, 0.1)
                # level 3
                for l_sec_num in range(len(driver.find_elements_by_xpath(my_xpath))):
                    try:
                        l_sections = driver.find_elements_by_xpath(my_xpath)
                        my_doc.append(l_sections[l_sec_num].text)
                        # skip sections that are reserved or notes
                        if ('reserved' in l_sections[l_sec_num].text.lower()) or (l_sections[l_sec_num].text.lower() == 'note'):
                            continue
                        l_sections[l_sec_num].click()
                        # if there is no showall button use the brute force way to go back
                        if len(driver.find_elements_by_xpath(showall_xpath)) != 0:
                            scraper_tools.waiting_for_presence_of(driver, showall_xpath, 3, 0.1)
                            scraper_tools.find_click_n_wait(driver, showall_xpath, high_title_xpath, 0, 3, 0.1)
                            # get text
                            for content, l_title in zip(driver.find_elements_by_xpath(content_xpath), driver.find_elements_by_xpath(low_title_xpath)):
                                my_doc.append(l_title.text)
                                my_doc.append(content.text)
                            # go to previous page
                            scraper_tools.find_click_n_wait(driver, up_xpath, my_xpath, 0, 3, 0.1)
                        elif len(driver.find_elements_by_xpath(content_xpath)) != 0:
                            # get text
                            for content in driver.find_elements_by_xpath(content_xpath):
                                my_doc.append(content.text)
                            # go to previous page
                            scraper_tools.find_click_n_wait(driver, up_xpath, my_xpath, 0, 3, 0.1)
                        else:
                            driver.get(link)
                            driver.switch_to.frame('RIGHT')
                            scraper_tools.find_click_n_wait(driver, my_xpath, my_xpath, h_sec_num, 3, 0.1)
                    except Exception:
                        my_doc.append("-_-_-missing-_-_-")
                        missing_sections += 1
                scraper_tools.find_click_n_wait(driver, up_xpath, my_xpath, 0, 3, 0.1)
                update_date = scraper_tools.extract_date(update_date_messy)
                key = scraper_tools.s3_file_writer(s3_bucket, s3_path, base_loc, city, update_date, level2_title, '\n'.join(my_doc))
                if key:
                    keys_written.append(key)
                my_doc = [city]
        except Exception:
            return True, keys_written
    driver.close()
    driver.quit()
    print("-"*5)
    if missing_sections > 0:
        return True, keys_written
    return False, keys_written
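This example leans on scraper_tools.waiting_for_presence_of, click_n_wait, and find_click_n_wait to synchronize with page loads. A possible sketch of these helpers, assuming they wrap Selenium explicit waits and follow the argument order used above (driver, xpath, element list, index, timeout, poll interval), is shown below; the real implementations may differ:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def waiting_for_presence_of(driver, xpath, timeout, poll):
    # Block until at least one element matching xpath is attached to the DOM.
    WebDriverWait(driver, timeout, poll_frequency=poll).until(
        EC.presence_of_element_located((By.XPATH, xpath)))

def click_n_wait(driver, wait_xpath, elements, index, timeout, poll):
    # Click the index-th element from a previously collected list, then wait
    # for wait_xpath to be present on the resulting page.
    elements[index].click()
    waiting_for_presence_of(driver, wait_xpath, timeout, poll)

def find_click_n_wait(driver, click_xpath, wait_xpath, index, timeout, poll):
    # Re-locate the clickable elements, click the index-th one, then wait
    # for wait_xpath to be present on the resulting page.
    driver.find_elements_by_xpath(click_xpath)[index].click()
    waiting_for_presence_of(driver, wait_xpath, timeout, poll)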
Example #3
def amlegal_main(s3_bucket, s3_path, rs_table, base_loc, start_link):
    cwd = os.getcwd()
    chrome_options = webdriver.ChromeOptions()
    # set download folder
    # configure multiple file download and turn off prompt
    prefs = {
        'download.default_directory': base_loc,
        'profile.default_content_setting_values.automatic_downloads': 1,
        'download.prompt_for_download': False,
        'default_content_settings.automatic_downloads': 1,
        'profile.content_settings.exceptions.automatic_downloads': 1
    }
    chrome_options.add_experimental_option('prefs', prefs)
    keys_written = []
    date_xpath = "//div[@class='currency-info']"
    lvl_1_xpath = "//div[@class='codenav__toc roboto']/div/div/a[@class='toc-link']"
    lvl_2_xpath = "//div[@class='codenav__toc roboto']/div/div/div/div/a[@class='toc-link']"
    collapse_xpath = "//div[@class='codenav__toc roboto']/div/div/button[@class='toc-caret dropdown-toggle btn-toggle']"
    download_xpath = "//button[@class='btn btn-white-circle']"
    checkedbox_xpath = "//label[@class='toc-link check--partial form-check-label']"
    checkbox_xpath = "//div[@class='toc-entry toc-entry--code']/div/div/label[@class='toc-link form-check-label']"
    pull_file_xpath = "//button[@class='btn btn-primary']"
    file_type_xpath = "//button[@class='export-button btn btn-export']"
    final_download_xpath = "//a[@class='btn btn-secondary request__open']"
    clear_downloads_xpath = "//div[@class='react-tabs__tab-panel react-tabs__tab-panel--selected']/div/div/span/button[@class='btn btn-primary']"
    city = start_link[0]
    links = start_link[1]
    print(city)
    for link in links:
        try:
            driver = webdriver.Chrome(f'{cwd}/chromedriver',
                                      options=chrome_options)
            print(link)
            driver.get(link)
            # get date
            messy_date = driver.find_elements_by_xpath(date_xpath)
            match = re.search(r'effective\s(\D+\d+,\s\d+).',
                              messy_date[0].text)
            match_date = datetime.strptime(match.group(1), '%B %d, %Y').date()
            my_date = match_date.strftime('%m-%d-%y')
            # collect the sections
            lvl_1_section = driver.find_elements_by_xpath(lvl_1_xpath)
            collapse_buttons = driver.find_elements_by_xpath(collapse_xpath)
            download_buttons = driver.find_elements_by_xpath(download_xpath)
            for sec1_num in range(
                    len(driver.find_elements_by_xpath(lvl_1_xpath))):
                level_1_title = lvl_1_section[sec1_num].text
                print(level_1_title)
                scraper_tools.click_n_wait(driver, lvl_1_xpath, lvl_1_section,
                                           sec1_num, 10, 4)
                scraper_tools.click_n_wait(driver, download_xpath,
                                           download_buttons, 2, 10, 2)
                # if this is the last section, go straight to downloading because that section is bugged
                if sec1_num < len(
                        driver.find_elements_by_xpath(lvl_1_xpath)) - 1:
                    # click once to de-select everything...
                    scraper_tools.click_single_wait(driver,
                                                    checkedbox_xpath,
                                                    2,
                                                    num=0)
                    # ...and again to select everything
                    scraper_tools.click_single_wait(driver,
                                                    checkbox_xpath,
                                                    2,
                                                    num=sec1_num)
                else:
                    time.sleep(3)
                # click proceed button
                scraper_tools.click_single_wait(driver, pull_file_xpath, 2)
                # check if they ask to clear data we've downloaded so far
                if len(driver.find_elements_by_xpath(
                        clear_downloads_xpath)) > 0:
                    clear_downloads = driver.find_elements_by_xpath(
                        clear_downloads_xpath)
                    clear_downloads[0].click()
                # click type of file you want downloaded
                file_type = driver.find_elements_by_xpath(file_type_xpath)
                scraper_tools.click_n_wait(driver, final_download_xpath,
                                           file_type, 2, 300, 5)
                # click the most recent download when it is finished downloading
                for i in range(60):
                    final_download = driver.find_elements_by_xpath(
                        final_download_xpath)
                    if len(final_download) == 2 or sec1_num == 0:
                        break
                    else:
                        time.sleep(5)
                scraper_tools.click_n_wait(driver, final_download_xpath,
                                           final_download, 0, 10, 3)
                # rename and move downloaded file
                name_sec = (sec1_num % 3) + 1
                old_path = base_loc + city.replace(
                    ' ', '_') + "-ca-" + str(name_sec) + ".txt"
                old_path = scraper_tools.downloads_done(old_path, 36)
                path = scraper_tools.make_path(base_loc, city, my_date)
                new_path = path + city + "_" + level_1_title + ".txt"
                os.rename(old_path, new_path)
                # collapse past sections of toc to reduce website slowdown
                scraper_tools.click_n_wait(driver, collapse_xpath,
                                           collapse_buttons, sec1_num, 10, 3)
                # move text to s3
                with open(new_path, 'r') as f:
                    lvl1_text = f.readlines()
                key = scraper_tools.s3_file_writer(s3_bucket, s3_path,
                                                   base_loc, city, my_date,
                                                   level_1_title,
                                                   '\n'.join(lvl1_text))
                if key and (key not in list(rs_table.s3_key)):
                    keys_written.append(key)
            driver.close()
            driver.quit()
            return False, keys_written
        except Exception:
            return True, keys_written
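All three examples hand the scraped text to scraper_tools.s3_file_writer and collect the keys it returns. A hedged sketch of what such a writer might do with boto3, assuming a key layout of s3_path/city/date/title.txt (the actual layout and return value are not confirmed by these listings):

import boto3

def s3_file_writer(s3_bucket, s3_path, base_loc, city, update_date, title, text):
    # Upload the section text to S3 and return the key that was written.
    # base_loc (the local download folder) is accepted only to match the
    # signature used above; this minimal sketch does not need it.
    key = f"{s3_path}/{city}/{update_date}/{title}.txt"
    boto3.client('s3').put_object(Bucket=s3_bucket, Key=key,
                                  Body=text.encode('utf-8'))
    return key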