def reset(self):
    self.counter = 0
    for bot in self.bots_list:
        webdriver = bot.state.form_element.parent
        webdriver.close()
    self.bots_list = []
    self.create_bots()
def RespCode(domain):
    webdriver = Chrome()
    domain = "http://www." + str(domain)
    response = webdriver.request('GET', domain)
    response = str(response)[11:][:-2]
    webdriver.close()
    return response
def menu():
    menu = {}
    menu['1'] = "\033[0;33mWatch Instagram Stories\033[m"
    menu['2'] = "\033[0;33mLike Hashtagged posts\033[m"
    menu['3'] = "\033[0;33mEdit hashtag list\033[m"
    menu['4'] = "\033[0;33mExit\033[m"
    while True:
        options = menu.keys()
        for entry in options:
            print(entry, menu[entry])
        selection = str(input("What would you like to do? "))
        if selection == '1':
            os.system('clear')
            watchstories()
        elif selection == '2':
            os.system('clear')
            likes()
        elif selection == '3':
            os.system('clear')
            hashtag_menu()
        elif selection == '4':
            os.system('clear')
            webdriver.close()
            sys.exit()
        else:
            print("\nYou have to choose an option between 1 and 4. \n")
            menu()
def scrape(self):
    try:
        print("scraping www.cnn.com , please wait as it might take a while :)")
        webdriver.get('https://edition.cnn.com/')
        element = WebDriverWait(webdriver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "cn__title ")))
        source = webdriver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        article_links = []
        for ul in soup.find_all('ul'):
            # if the ul element has an h2 child and 'Top' is in the h2 string
            if ul.h2 and 'Top' in ul.h2.string:
                for li in ul.find_all('li'):
                    article_links.append(li.find('a').get('href'))
        self.scrape_from_articles(article_links)
    except TimeoutException as exception:
        print('timed out, check internet connection and try again')
    finally:
        print('done')
        webdriver.close()
        return ''
def scrape_page(webdriver, links, username):
    '''Visit every link provided and scrape each picture for the number of
    likes and the caption. If the link is a video, no information is recorded.
    The caption is only saved if the post title matches the identified user.

    Parameters: the active webdriver, a set of picture links, the username of
    the page you are scraping
    Returns: a list of lists with the number of likes and caption
    '''
    picture_info = []
    for link in links:
        # Open a new tab
        webdriver.execute_script("window.open('');")
        time.sleep(3)
        # Switch to the new window
        webdriver.switch_to.window(webdriver.window_handles[1])
        webdriver.get(link)
        time.sleep(5)
        try:
            likes_list = webdriver.find_elements_by_class_name('zV_Nj')
            if len(likes_list) != 0:  # If the length is 0, then it is a video
                if len(likes_list) == 1:  # No common friends liked the photo
                    num_likes = webdriver.find_elements_by_class_name('Nm9Fw')[0].text.split(' ')[0]
                else:
                    num_likes = int(likes_list[1].text.split(' ')[0]) + 1
                try:
                    title = webdriver.find_element_by_class_name('_6lAjh').text
                    if title == username:
                        caption_list = webdriver.find_elements_by_xpath("//div[@class='C4VMK']//span")
                        # This works but is not active since the information was not used:
                        # num_of_comments = len(caption_list)
                        caption = caption_list[0].text
                    else:
                        caption = None  # the user was not the title
                except:
                    caption = None  # photo does not have a caption or any comments
                picture_info.append([num_likes, caption])
        except:
            pass
        webdriver.close()
        # Switch focus back to the main tab
        webdriver.switch_to.window(webdriver.window_handles[0])
        time.sleep(5)
    return picture_info
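# A minimal usage sketch for scrape_page (not from the original source): the driver
# setup, the example link set, and the username below are illustrative assumptions,
# and an already logged-in Instagram session is assumed.
from selenium import webdriver as selenium_webdriver

driver = selenium_webdriver.Chrome()
example_links = {'https://www.instagram.com/p/EXAMPLE/'}  # hypothetical post URL
picture_info = scrape_page(driver, example_links, 'example_username')
driver.quit()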
def clean_up(head=None, webdriver=None):
    print("Cleaning up...")
    if head:
        print("Restoring original head (%s)" % head)
        head.checkout()
    if webdriver:
        print("Closing browser")
        webdriver.close()
def take_screenshot(url):
    try:
        webdriver = selenium.webdriver.PhantomJS('vendor/phantomjs/bin/phantomjs')
        webdriver.get(url)
        webdriver.set_window_size(1280, 800)
        imagedata = webdriver.get_screenshot_as_base64()
        webdriver.close()
        webdriver.quit()
    except Exception as e:
        raise
def wait_for_content(webdriver, id, timer=15.0, poll_frequency=0.5):
    """ (WebDriver object, str[, float, float]) -> NoneType

    Wait for the page to load the content with the id we specified as the
    function's argument. Stop the script if time runs out.
    """
    try:
        WebDriverWait(webdriver, timer, poll_frequency).until(
            EC.presence_of_element_located((By.ID, id)))
    except TimeoutException:
        print('Content on the page has not been loaded... Stopping script execution.')
        webdriver.close()
        sys.exit()
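# A minimal usage sketch for wait_for_content; the URL and the element id below are
# illustrative assumptions, not taken from the original script.
from selenium.webdriver import Firefox

driver = Firefox()
driver.get('https://example.com')
wait_for_content(driver, 'content', timer=10.0)  # closes the driver and exits on timeout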
def make_decorator():
    print('decoration starts')
    try:
        func()
        stepresult = 'Pass'
    except:
        stepresult = 'Error'
    finally:
        print('decoration ends')
        print(stepresult)
        webdriver.close()
def main():
    webdriver = setup_webdriver()
    evaluate_trade(webdriver, 'Alteration', buyAlteration, sellAlteration)
    evaluate_trade(webdriver, 'Fusing', buyFusing, sellFusing)
    evaluate_trade(webdriver, 'Scouring', buyScouring, sellScouring)
    evaluate_trade(webdriver, 'Vaal', buyVaal, sellVaal)
    evaluate_trade(webdriver, 'Regret', buyRegret, sellRegret)
    evaluate_trade(webdriver, 'Chiesel', buyChiesel, sellChiesel)
    evaluate_trade(webdriver, 'Alchemy', buyAlchemy, sellAlchemy)
    evaluate_trade(webdriver, 'Jeweller', buyJeweller, sellJeweller)
    webdriver.close()
def session_end(driver, sessions_to_do, file, questions, answers):
    sessions_to_do = sessions_to_do - 1
    if sessions_to_do == 0:
        write_words(questions, answers, file)
        webdriver.close()
        exit()
    print(str(sessions_to_do) + " sessions left\n")
    driver.find_element(By.ID, "return_mainpage").click()
    WebDriverWait(driver, 8).until(
        ec.presence_of_element_located((By.CLASS_NAME, "btn-session")))
    driver.find_element(By.CLASS_NAME, "btn-session").click()
    return sessions_to_do
def timerlock():
    # =====================================
    speak("For how many seconds?")
    seconds = int(takeCommand().lower())
    speak("timer set..")
    for i in range(seconds):
        print(str(seconds - i) + " remaining")
        time.sleep(1)
    # ==========================================
    subprocess.Popen("rundll32.exe user32.dll,LockWorkStation")
    webdriver.close()
    exit()
def cleanUpAfterError(error=None, webdriver=None):
    """
    Overview
        Runs the follow-up steps after an exception error.
    Args
        webdriver: the webdriver instance to shut down. Omit if no browser was started.
    Return
        None
    """
    if webdriver is not None:
        webdriver.close()
    if error is not None:
        logger.exception(str(getCurLineNo()) + ' ' + str(error))
    sys.exit()
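# A minimal sketch of how cleanUpAfterError might be called; the Chrome setup and the
# failing step are illustrative assumptions, not part of the original code.
from selenium.webdriver import Chrome

driver = None
try:
    driver = Chrome()
    driver.get('https://example.com')  # any step that may raise
except Exception as e:
    cleanUpAfterError(error=e, webdriver=driver)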
def do_testharness(self, webdriver, url, timeout):
    format_map = {
        "abs_url": url,
        "url": strip_server(url),
        "window_id": self.window_id,
        "timeout_multiplier": self.timeout_multiplier,
        "timeout": timeout * 1000
    }

    parent = webdriver.current_window_handle
    handles = [item for item in webdriver.window_handles if item != parent]
    for handle in handles:
        try:
            webdriver.switch_to_window(handle)
            webdriver.close()
        except exceptions.NoSuchWindowException:
            pass
    webdriver.switch_to_window(parent)
    webdriver.execute_script(self.script % format_map)

    try:
        # Try this, it's in Level 1 but nothing supports it yet
        win_s = webdriver.execute_script("return window['%s'];" % self.window_id)
        win_obj = json.loads(win_s)
        test_window = win_obj["window-fcc6-11e5-b4f8-330a88ab9d7f"]
    except Exception:
        after = webdriver.window_handles
        if len(after) == 2:
            test_window = next(iter(set(after) - set([parent])))
        elif after[0] == parent and len(after) > 2:
            # Hope the first one here is the test window
            test_window = after[1]
        else:
            raise Exception("unable to find test window")
    assert test_window != parent

    handler = CallbackHandler(webdriver, test_window, self.logger)
    while True:
        result = webdriver.execute_async_script(self.script_resume % format_map)
        done, rv = handler(result)
        if done:
            break
    return rv
def create_user(first_name, last_name, email, user_name, type, profile, role=''):
    browser.get(baseurl)
    try:
        browser.implicitly_wait(5)
        open_new_record()
    except:
        print("Unexpected error 1:", sys.exc_info()[0])
    # Wait for page to load.
    browser.implicitly_wait(10)
    browser.find_element_by_name("new").click()
    try:
        fill_out_form(first_name, last_name, email, user_name, type, profile, role='')
    except:
        print("Unexpected error 2:", sys.exc_info()[0])
    try:
        browser.implicitly_wait(15)
        displayed = browser.find_element_by_id('errorDiv_ep').is_displayed()
        print(displayed)
        if displayed:
            the_error = browser.find_element_by_class_name('errorMsg').text
            print('Oh no there is an error! \n')
            print(the_error)
        else:
            print('We are good!')
            webdriver.close()
    except common.exceptions.NoSuchElementException as e:
        print('3', e)
def get_table_rows(webdriver, csvwriter):
    """ (WebDriver object, _csv.writer object) -> NoneType

    Search the content of the page for the 'player-data' table.
    Collect the table's rows and write them to a csv file.
    """
    global PAGE_COUNT
    try:
        soup = BeautifulSoup(webdriver.page_source, 'html.parser')
    except AttributeError:
        print('Was not able to parse content of the page. Stopping script execution.')
        webdriver.close()
        sys.exit()
    salary_table = soup.find('table', {'id': 'player-data'})
    rows = salary_table.find('tbody').findAll('tr')
    for row in rows:
        row_data = []
        for cell in row.findAll('td'):
            row_data.append(cell.get_text().strip())
        csvwriter.writerow(row_data)
    print('Page %s is collected.' % PAGE_COUNT)
    PAGE_COUNT += 1
    return
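# A minimal usage sketch for get_table_rows; the target URL and the output file name
# below are illustrative assumptions.
import csv
from selenium.webdriver import Firefox

PAGE_COUNT = 1  # module-level counter assumed by get_table_rows
driver = Firefox()
driver.get('https://example.com/salaries')  # any page containing the 'player-data' table
with open('salaries.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    get_table_rows(driver, writer)
driver.quit()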
def report():
    print("Script finished!")
    time_end = datetime.datetime.now()
    sleep(1)
    print("Script Started at: {}".format(time_start.strftime("%Y-%m-%d %H:%M:%S")))
    sleep(1)
    print("Script Ended at: {}".format(time_end.strftime("%Y-%m-%d %H:%M:%S")))
    sleep(1)
    print("Total Stories watched from {}'s followers: {}".format(
        limits.user_watch_followers_stories, stories_bf_watched))
    sleep(1)
    print("Total Stories watched from feed: {}".format(total_stories_watched))
    sleep(1)
    print("Total suggestions followed: {}".format(total_suggestion_followed))
    sleep(1)
    print("Total number of likes: {}".format(total_likes))
    sleep(1)
    print("Total number of comments: {}".format(total_comments))
    sleep(1)
    print("closing browser...")
    sleep(1)
    webdriver.close()
def open_website_and_quit(website, browser, webdriver):
    sleep(3)
    try:
        webdriver.get(website)
        sleep(30)
        webdriver.close()
        sleep(2)
        browser.terminate()
    except TimeoutException as e:
        logging.warning(f"TIMEOUT {website}")
        sleep(90)
        webdriver.close()
        browser.kill()
    except Exception as e:
        webdriver.close()
        browser.kill()
        raise e
import time
from selenium import webdriver

n = 0
while n < 30:
    driver = webdriver.Chrome(executable_path="D:/appium/chromedriver.exe")
    driver.get("https://www.youtube.com/watch?v=Q-sgJ3xMJmg")
    time.sleep(10)
    n = n + 1
    # Close the browser opened for this iteration before starting the next one.
    driver.close()
def cleanup_webdriver(self, webdriver=None):
    try:
        webdriver.quit()
        webdriver.close()
    except Exception:
        pass
def quit_driver(webdriver):
    webdriver.close()
    webdriver.quit()
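# Note: close() only closes the current window, while quit() ends the whole WebDriver
# session, so quit() alone is usually sufficient when a single window is open.
# Minimal usage sketch (the Chrome setup below is an illustrative assumption):
from selenium.webdriver import Chrome

driver = Chrome()
driver.get('https://example.com')
quit_driver(driver)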
def user7():
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from time import sleep
    from user8 import user8

    # Open the Chrome browser and go to the Instagram login page.
    driverPath = 'D:\Programming\Python Projects\Personal Projects\chromedriver.exe'
    webdriver = webdriver.Chrome(executable_path=driverPath)
    webdriver.get('https://www.instagram.com/accounts/login/?source=auth_switcher')
    sleep(2)

    # Holds the user's username and password that will be used.
    username = ''
    password = ''

    # Find the username and password elements and fill them in with the user's login.
    userElement = webdriver.find_element_by_name('username')
    userElement.send_keys(username)
    passElement = webdriver.find_element_by_name('password')
    passElement.send_keys(password)

    # Find the login button element and click it to log in to the user's account.
    login = webdriver.find_element_by_css_selector(
        '#react-root > section > main > div > article > div > div:nth-child(1) > div > form > div:nth-child(4) > button > div'
    )
    login.click()
    sleep(3)

    # Find the "Not Now" element of the notification popup and click it to make it go away.
    notNow = webdriver.find_element_by_css_selector(
        'body > div.RnEpo.Yx5HN > div > div > div.mt3GC > button.aOOlW.HoLwm')
    notNow.click()
    sleep(1)

    # Direct the browser to Blanson's Chick-fil-a page.
    webdriver.get('https://www.instagram.com/p/B2epau2FUiI/')
    sleep(1)

    # Find the comment box on the page and click on it.
    commentBox = webdriver.find_element_by_css_selector(
        '#react-root > section > main > div > div > article > div.eo2As > section.sH9wk._JgwE > div > form > textarea'
    )
    commentBox.click()

    # This will be the comment that will be posted.
    comment = 'Blanson Bold! Blanson Gold!'

    # Comment infinitely (will inevitably be stopped by Instagram, however).
    while True:
        # Find the comment box again to let the program know we are working with it again.
        commentBox = webdriver.find_element_by_css_selector(
            '#react-root > section > main > div > div > article > div.eo2As > section.sH9wk._JgwE > div > form > textarea'
        )
        # Input the comment in the comment box.
        commentBox.send_keys(comment)
        # Enter to post the comment.
        commentBox.send_keys(Keys.ENTER)
        sleep(.5)
        # Try to scan the page for the popup that blocks the user from commenting.
        try:
            webdriver.find_element_by_css_selector('body > div.Z2m7o > div > div > div > p')
            sleep(2)
            # If it gets to this point then the user has been blocked, and we close this browser.
            webdriver.close()
            # Call the next function to start another browser and user.
            user8()
        # If it is not there, an error is raised and we let the program run normally.
        except:
            pass
        # Wait 7 seconds to give the comment time to be uploaded.
        sleep(7)
import unittest
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome("C://chromedriver.exe")
driver.get("http://www.google.ru")
xzcdxgfd = driver.find_element_by_name("q")
xzcdxgfd.send_keys("kek")
time.sleep(5)
driver.close()
def exchange_walutomat(username, password, transaction_type, first_currency, second_currency, amount, rate):
    webdriver.implicitly_wait(10)
    webdriver.get('https://panel.walutomat.pl/moj-walutomat')
    webdriver.find_element_by_id('username').send_keys(username)
    webdriver.find_element_by_id('password').send_keys(password)
    webdriver.find_element_by_class_name('bem-button__inner-text').click()
    time.sleep(5)
    webdriver.get('https://user.walutomat.pl/#/order-placement')
    element = webdriver.find_element_by_id('order-volume')
    element.clear()
    element.send_keys(str(amount))  # send the amount
    time.sleep(3)
    # TODO: choose the transaction type from a dropdown menu. Buy is the default.
    '''
    webdriver.find_element_by_id('order-type').click()  # click on buy/sell
    time.sleep(2)
    # element from the dropdown menu is wrongly selected. To be fixed
    if transaction_type == 'buy':  # choose buy/sell
        webdriver.find_element_by_class_name('select2-results__option select2-results__option--highlighted')
    elif transaction_type == 'sell':
        webdriver.find_element_by_link_text('Chcę sprzedać')
    '''
    # TODO: find a way to select elements for different currencies. USD/PLN is the default.
    # The element selector from the dropdown menu doesn't work.
    '''
    element.send_keys(Keys.TAB, Keys.SPACE)  # click to choose first currency
    time.sleep(2)
    webdriver.find_element_by_class_name('icon-{}'.format(first_currency)).click()  # choose first currency
    time.sleep(2)
    webdriver.send_keys(Keys.TAB)  # click on second currency
    time.sleep(2)
    webdriver.send_keys(Keys.SPACE)
    webdriver.find_element_by_class_name('icon-{}'.format(second_currency)).click()  # choose second currency
    time.sleep(2)
    webdriver.find_element_by_id('price-type-fixed').click()  # choose custom exchange rate
    time.sleep(2)
    '''
    webdriver.find_element_by_id('order-at-price').send_keys(str(rate))  # send the custom exchange rate
    time.sleep(3)
    webdriver.find_element_by_id('order-preliminary-submit').click()  # confirm transaction parameters
    time.sleep(3)
    element = webdriver.find_elements_by_class_name('content')
    podsumowanie = element[3].text.split('\n')
    podsumowanie = '{}, kurs {} {}\n{}\n'.format(' '.join(podsumowanie[1:3]),
                                                 podsumowanie[4].lower(),
                                                 podsumowanie[5],
                                                 ' '.join(podsumowanie[6:8]))
    print(podsumowanie)
    confirmation = input('Czy potwierdzasz?')
    if confirmation in ['T', 't', 'Tak', 'tak', 'Y', 'y', 'Yes', 'yes']:
        try:
            webdriver.find_element_by_id('confirm-exchange').click()
            print('Zlecenie zostało złożone.')
        except:
            print('Something goes wrong. Laaambaada!')
    else:
        print('Operacja anulowana.')
    webdriver.close()
    return
def close_up(request):
    webdriver = Firefox()
    user_link = request.user.profile.link
    # list needed for the reverse
    reversed_list = []
    with open(f'files_of_users/links_of_books_{user_link}.txt', 'r', encoding='utf-8') as f:
        if not os.path.exists(f'files_of_users/list_of_books_{user_link}.txt'):
            open(f'files_of_users/list_of_books_{user_link}.txt', 'w', encoding='utf-8').close()
        with open(f'files_of_users/list_of_books_{user_link}.txt', 'r', encoding='utf-8') as d:
            list_of_books = d.read()
        # a reverse is needed because new books are added to the link list first, not last
        for link in f:
            reversed_list.append(link)
        for link in reversed(reversed_list):
            link = link.replace('\n', '')
            if link not in list_of_books:
                r = webdriver.request('GET', link)
                soup = BeautifulSoup(r.content, 'lxml')
                overview = [link]
                book = soup.find('div', class_='block-border card-block')
                author = []
                if book.find('h2', class_='author-name unreg'):
                    authors = book.find('h2', class_='author-name unreg')
                    names = authors.find_all('a')
                    for name in names:
                        author.append(name.text)
                    overview.append(author)
                else:
                    author.append('Сборник')
                    overview.append(author)
                title = book.span.text
                overview.append(title)
                tags = book.find_all('a', class_='label-genre')
                list_of_tags = []
                for tag in tags:
                    if tag.text.startswith('№'):
                        tag = tag.text.split('в\xa0')[1]
                        list_of_tags.append(tag)
                    else:
                        list_of_tags.append(tag.text)
                overview.append(list_of_tags)
                cover = book.find('img', id='main-image-book')['src']
                overview.append(cover)
                if book.find('span', itemprop='ratingValue'):
                    rating = book.find('span', itemprop='ratingValue').text
                else:
                    rating = 0
                overview.append(rating)
                description = book.p.text
                overview.append(description)
                data = []
                if os.stat(f'files_of_users/list_of_books_{user_link}.txt').st_size != 0:
                    with open(f'files_of_users/list_of_books_{user_link}.txt', 'r') as f:
                        old = json.load(f)
                    for i in old:
                        data.append(i)
                data.append(overview)
                with open(f'files_of_users/list_of_books_{user_link}.txt', 'w') as f:
                    json.dump(data, f)
    webdriver.close()
    return render(request, 'liv/test.html')
def fill_forms(email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket, failfile, furl):
    """Finds a newsletter form on the page. If not found, visits <num_links>
    internal links and scans those pages for a form. Submits the form if found.
    """
    # skipping: load the site
    # skipping: connecting to logger

    # try to find a newsletter form on the landing page
    if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                           browser_params, manager_params):
        return

    # otherwise, scan more pages
    print("couldn't find form, going to click around")
    main_handle = webdriver.current_window_handle
    visited_links = set()
    for i in range(num_links):
        # get all links on the page
        links = webdriver.find_elements_by_tag_name('a')
        random.shuffle(links)
        current_url = webdriver.current_url
        current_ps1 = domain_utils.get_ps_plus_1(current_url)

        # find links to click
        match_links = []
        start_time = timeit.default_timer()
        for link in links:
            try:
                if not link.is_displayed():
                    continue
                # check if link is valid and not already visited
                href = link.get_attribute('href')
                if href is None or href in visited_links:
                    continue
                # check if this is an internal link
                if not _is_internal_link(href, current_url, current_ps1):
                    continue
                link_text = link.text.lower()
                # skip links with blacklisted text
                blacklisted = False
                for bl_text in _LINK_TEXT_BLACKLIST:
                    if bl_text in link_text:
                        blacklisted = True
                        break
                if blacklisted:
                    continue
                # should we click this link?
                link_rank = 0
                for type, s, rank, flags in _LINK_TEXT_RANK:
                    if (type == _TYPE_TEXT and s in link_text) or (type == _TYPE_HREF and s in href):
                        if flags & _FLAG_IN_NEW_URL_ONLY:
                            # don't use this link if the current page URL already matches too
                            if type == _TYPE_HREF and s in current_url:
                                continue
                        # link matches!
                        link_rank = rank
                        match_links.append((link, rank, link_text, href, flags))
                        break
                if link_rank >= _LINK_RANK_SKIP:  # good enough, stop looking
                    break
            except:
                print("ERROR while looping through links...")
                sys.exit(1)

            # quit if too much time passed (for some reason, this is really slow...)
            if match_links and timeit.default_timer() - start_time > _LINK_MATCH_TIMEOUT:
                break

        # find the best link to click
        if not match_links:
            break  # no more links to click
        match_links.sort(key=lambda l: l[1])
        next_link = match_links[-1]
        visited_links.add(next_link[3])

        # click the link
        try:
            # load the page
            print("clicking on link '%s' - %s" % (next_link[2], next_link[3]))
            next_link[0].click()
            time.sleep(_PAGE_LOAD_TIME)
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)
            # if browser_params['bot_mitigation']:
            #     bot_mitigation(webdriver)

            # find newsletter form
            if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                                   browser_params, manager_params):
                return

            # should we stay on this page?
            if next_link[4] & _FLAG_STAY_ON_PAGE:
                continue

            # go back
            webdriver.back()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

            # check other windows (ex. pop-ups)
            windows = webdriver.window_handles
            if len(windows) > 1:
                form_found_in_popup = False
                for window in windows:
                    if window != main_handle:
                        webdriver.switch_to_window(window)
                        wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

                        # find newsletter form
                        if _find_and_fill_form(webdriver, email_producer, visit_id,
                                               debug, browser_params, manager_params):
                            form_found_in_popup = True
                        webdriver.close()
                webdriver.switch_to_window(main_handle)
                time.sleep(1)
                if form_found_in_popup:
                    return
        except:
            pass

    # if you reach here, signup wasn't successful -- save the information
    with open(failfile, 'a') as wh:
        wh.write(furl + '\n')
def _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                        browser_params, manager_params):
    """Finds and fills a form, and returns True if accomplished."""
    current_url = webdriver.current_url
    current_site_title = webdriver.title.encode('ascii', 'replace')
    main_handle = webdriver.current_window_handle
    in_iframe = False

    # debug: save before/after screenshots and page source
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_post_initial = debug_file_prefix + 'form_initial_result'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_form_post_followup = debug_file_prefix + 'form_followup_result'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'

    # try to find newsletter form on landing page
    newsletter_form = _find_newsletter_form(webdriver)
    if newsletter_form is None:
        # search for forms in iframes (if present)
        iframes = webdriver.find_elements_by_tag_name('iframe')
        for iframe in iframes:
            # switch to the iframe
            webdriver.switch_to_frame(iframe)

            # is there a form?
            newsletter_form = _find_newsletter_form(webdriver)
            if newsletter_form is not None:
                if debug:
                    dump_page_source(debug_page_source_initial, webdriver, _SRC_DUMP_PATH)
                in_iframe = True
                break  # form found, stay on the iframe

            # switch back
            webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            return False
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver, _SRC_DUMP_PATH)

    email = email_producer(current_url, current_site_title)
    user_info = _get_user_info(email)
    _form_fill_and_submit(newsletter_form, user_info, webdriver, False,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    print('submitted form on [%s] with email [%s]' % (current_url, email))
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)
    # if debug: save_screenshot(debug_form_post_initial, webdriver, browser_params, manager_params)

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        form_found_in_popup = False
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)

                # find newsletter form
                if follow_up_form is None:
                    follow_up_form = _find_newsletter_form(webdriver)
                    if follow_up_form is not None:
                        if debug:
                            dump_page_source(debug_page_source_initial, webdriver, _SRC_DUMP_PATH)
                        _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                              browser_params, manager_params,
                                              debug_form_pre_followup if debug else None)
                        time.sleep(_FORM_SUBMIT_SLEEP)
                        _dismiss_alert(webdriver)
                        # if debug: save_screenshot(debug_form_post_followup, webdriver, browser_params, manager_params)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        follow_up_form = _find_newsletter_form(webdriver)
        if follow_up_form is not None:
            if debug:
                dump_page_source(debug_page_source_initial, webdriver, _SRC_DUMP_PATH)
            _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                  browser_params, manager_params,
                                  debug_form_pre_followup if debug else None)
            time.sleep(_FORM_SUBMIT_SLEEP)
            _dismiss_alert(webdriver)
            # if debug: save_screenshot(debug_form_post_followup, webdriver, browser_params, manager_params)

    # switch back
    if in_iframe:
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True
def tearDownBrowser(browser, webdriver):
    print("-------------------------------------")
    print("Run " + browser + " Completed at :" + str(datetime.datetime.now()))
    webdriver.close()
    webdriver.quit()
def close_browser(cls, webdriver):
    '''Closes the browser after a delay.
    This method will be called in data_parser.py'''
    # Note: WebDriverWait only constructs the waiter here; without an .until()
    # condition it does not actually pause before the close() call.
    WebDriverWait(webdriver, delay)
    webdriver.close()
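# A minimal usage sketch for close_browser; the owning class name, the module-level
# delay, and the driver setup below are illustrative assumptions.
from selenium.webdriver import Chrome

delay = 5  # close_browser reads a module-level 'delay'
driver = Chrome()
driver.get('https://example.com')
DataParser.close_browser(driver)  # hypothetical class exposing close_browser as a classmethod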
def retrieve_image(search_query, webdriver, dir_name, img_name):
    try:
        logger.log("image_scraping function start")
        image_name = ''
        # Variable that holds the number of images to fetch
        number_of_images_to_fetch = 1
        index = 0
        # Scroll down the webpage to load more images
        scroll_down(webdriver)
        time.sleep(5)
        # Save all of the html image elements from our google search
        # 'rg_i' is the class name that the images have
        image_elements = webdriver.find_elements_by_class_name('rg_i')
        target_dir = basest_dir + "/" + dir_name
        # Check if the directory that we want to put our images in already exists
        if not os.path.exists(target_dir):
            # If not, make that directory
            os.mkdir(target_dir)
        found_image_count = 0
        attempt_count = 0
        logger.log("begin finding images")
        for element in image_elements:
            attempt_count += 1
            try:
                # Check if you've downloaded all the images you want
                if found_image_count == number_of_images_to_fetch:
                    break
                # Click on the image you want to download
                element.click()
                # Give the browser some time to catch up
                time.sleep(2)
                # After clicking on the image, get the larger version
                found_image = webdriver.find_element_by_class_name('n3VNCb')
                # Find the source of the image, its url
                image_url = found_image.get_attribute('src')
                logger.log("attempt " + str(attempt_count) + ": " + image_url[0:10])
                # Make sure that the image url is a valid source
                if 'http' in image_url:
                    logger.log("successful image found")
                    # Download this image as a BytesIO object
                    image_file = io.BytesIO(requests.get(image_url).content)
                    # Convert our BytesIO object into an actual image
                    image = Image.open(image_file).convert('RGB')
                    # Create the name of the image we've downloaded
                    image_name = img_name + '.jpg'
                    logger.log(image_name)
                    # Build the path that we want to save the image to
                    # The directory will have the same name as the search query
                    image_path = target_dir + '/' + image_name
                    # Save the image
                    image.save(image_path, 'JPEG', quality=85)
                    found_image_count += 1
            except:
                logger.log("couldn't find enhanced images")
        # close the web browser
        # webdriver.close()
        if attempt_count > 3:
            logger.log("multiple attempts: " + search_query + "<=======")
        else:
            logger.log(image_name)
        return image_name
    except:
        logger.log("retrieve image crash")
        webdriver.close()
def close_up(request):
    print('start close_up')
    webdriver = Firefox()
    userlink = request.user.profile.link
    # list for the reverse
    ll = []
    with open(f'files_of_users/links_of_books_{userlink}.txt', 'r', encoding='utf-8') as f:
        if not os.path.exists(f'files_of_users/list_of_books_{userlink}.txt'):
            open(f'files_of_users/list_of_books_{userlink}.txt', 'w', encoding='utf-8').close()
        with open(f'files_of_users/list_of_books_{userlink}.txt', 'r', encoding='utf-8') as d:
            list_of_books = d.read()
        # a reverse is needed because new books are added to the link list first, not last
        for link in f:
            ll.append(link)
        for link in reversed(ll):
            link = link.replace('\n', '')
            print('\n', link)
            if link not in list_of_books:
                print('Processing', link)
                # sleep to avoid the captcha
                time.sleep(5)
                r = webdriver.request('GET', link)
                soup = BeautifulSoup(r.content, 'lxml')
                # saved for error handling
                with open('files_of_users/current_book.txt', 'w', encoding='utf-8') as f:
                    f.write(soup.prettify())
                overview = [link]
                book = soup.find('div', class_='block-border card-block')
                author = []
                if book.find('h2', class_='author-name unreg'):
                    authors = book.find('h2', class_='author-name unreg')
                    names = authors.find_all('a')
                    for name in names:
                        author.append(name.text)
                    overview.append(author)
                else:
                    author.append('Сборник')
                    overview.append(author)
                title = book.span.text
                overview.append(title)
                tags = book.find_all('a', class_='label-genre')
                list_of_tags = []
                for tag in tags:
                    if tag.text.startswith('№'):
                        tag = tag.text.split('в\xa0')[1]
                        list_of_tags.append(tag)
                    else:
                        list_of_tags.append(tag.text)
                overview.append(list_of_tags)
                cover = book.find('img', id='main-image-book')['src']
                overview.append(cover)
                if book.find('span', itemprop='ratingValue'):
                    rating = book.find('span', itemprop='ratingValue').text
                else:
                    rating = 0
                overview.append(rating)
                description = book.p.text
                overview.append(description)
                data = []
                if os.stat(f'files_of_users/list_of_books_{userlink}.txt').st_size != 0:
                    with open(f'files_of_users/list_of_books_{userlink}.txt', 'r') as f:
                        old = json.load(f)
                    for i in old:
                        data.append(i)
                data.append(overview)
                with open(f'files_of_users/list_of_books_{userlink}.txt', 'w') as f:
                    json.dump(data, f)
                print('Processed')
            else:
                print('Already processed', link)
    webdriver.close()
    print('finish close_up')
    return render(request, 'liv/test.html')
def crawler(self):
    url = "https://www.instagram.com/explore/tags/무신사/"
    # list to hold the content of each post
    tagList = []
    # page scroll variable
    pagedowns = 0
    # dict(hashtag, cnt)
    hashtag = {}
    # data to save to Excel
    feedList = []
    # return data
    returnList = {}
    # crawling result data
    crawlingList = {}

    # Chrome options setup
    # options = webdriver.ChromeOptions()
    # print(options)
    # # headless mode
    # options.add_argument('headless')
    # options.add_argument('window-size=1920x1080')
    # options.add_argument('disable-gpu')
    # # to avoid headless detection, set the language and user agent so it does not look headless
    # options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
    # options.add_argument("lang=ko_KR")  # Korean
    # print(options)
    # driver = webdriver.Chrome('chromedriver', chrome_options=options)
    # make the navigator look like a regular browser environment
    # driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
    # language
    # driver.execute_script("Object.defineProperty(navigator, 'languages', {get: function() {return ['ko-KR', 'ko']}})")
    # fake the rendering acceleration blocked above as a disguise
    # driver.execute_script("const getParameter = WebGLRenderingContext.getParameter;WebGLRenderingContext.prototype.getParameter = function(parameter) {if (parameter === 37445) {return 'NVIDIA Corporation'} if (parameter === 37446) {return 'NVIDIA GeForce GTX 980 Ti OpenGL Engine';}return getParameter(parameter);};")

    # launch the browser and navigate to the url
    # Firefox options setup
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.socks", "127.0.0.1")
    profile.set_preference("network.proxy.type", 9150)  # (note: this overrides the proxy type set above)
    profile.set_preference('general.useragent.override',
                           'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0')
    profile.update_preferences()
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    try:
        driver = webdriver.Firefox(executable_path='/crawler/repo/blog/geckodriver.exe',
                                   firefox_profile=profile, firefox_options=options)
    except WebDriverException:
        webdriver.close()

    # code start time
    start = datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")
    print(start)
    driver.get(url)
    # wait for web resources
    driver.implicitly_wait(1)
    # find the total post count element by class name
    ttlFeed = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "g47SY")))
    print("Total posts:", ttlFeed.text)
    # find the body tag by tag name
    time.sleep(1)
    # click the first post on the page
    driver.find_elements_by_class_name("eLAPa")[0].click()
    # failCnt
    failCnt = 0
    count = self.count
    # start scraping data
    while pagedowns < count:
        # wait after loading the page
        # driver.implicitly_wait(5)
        # post body
        try:
            post = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "C4VMK")))
            try:
                driver.find_element_by_class_name('XQXOT').send_keys(Keys.HOME)
                driver.find_element_by_class_name('XQXOT').find_element_by_xpath("//ul/li/div/button").click()
                driver.find_element_by_class_name('XQXOT').send_keys(Keys.HOME)
            except (NoSuchElementException, ElementNotInteractableException):
                pass
            # post text up to 160 characters
            # up to 30 items including comments
            # up to 100 characters per hashtag
            # id = driver.execute_script("document.body.getElementsByClassName('C4VMK')[0].getElementsByTagName('a')[0].innerText")
            # content = driver.execute_script("document.body.getElementsByClassName('C4VMK')[0].getElementsByTagName('span')[0].innerText")
            req = driver.page_source
            soup = BeautifulSoup(req, 'html.parser')
            replyCount = soup.find_all("div", class_="C4VMK")
            tagCount = replyCount[0].select('span>a')
            id = replyCount[0].find_all(class_="_6lAjh")[0].select("a")[0].text
            content = replyCount[0].select('span')[0].text
            like = '0'
            tags = []
            feedRow = {}
            try:
                # like = driver.find_element_by_class_name("Nm9Fw").find_element_by_tag_name("span").text
                like = soup.find_all("div", class_="Nm9Fw")[0].select("span")[0].text
            except (NoSuchElementException, IndexError):
                try:
                    like = soup.find_all("span", class_="vcOH2")[0].select("span")[0].text
                except IndexError:
                    pass
            # data processing
            emoji_pattern = re.compile("[\U00010000-\U0010ffff]", flags=re.UNICODE)
            content = emoji_pattern.sub('', content)
            # TODO: after the tag logic is done, add a positive/negative check method
            # hashtags in the post body
            if len(tagCount) > 0:
                for i in range(0, len(tagCount)):
                    tag = tagCount[i].text
                    if "#" in tag:
                        tag = tag.replace("#", "").replace(" ", "")
                        tags.append(tag)
            # hashtags in the comments
            if len(replyCount) > 0:
                for i in range(1, len(replyCount)):
                    # replyid = "document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('a')[0].innerText"
                    replyid = replyCount[i].find_all("a")[0].text
                    if id == replyid:
                        # replyTagCount = driver.execute_script("document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('span')[0].getElementsByTagName('a').length")
                        replyTagCount = replyCount[i].find_all("a")
                        if len(replyCount) > 1:
                            for j in range(0, len(replyTagCount)):
                                # reply = driver.execute_script("document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('span')[0].getElementsByTagName('a')["+j+"].innerText")
                                reply = replyTagCount[j].text
                                if "#" in reply:
                                    reply = reply.replace("#", "").replace(" ", "")
                                    tags.append(reply)
            # remove duplicates
            tags = list(set(tags))
            tagList.append(tags)
            print("=======================================================================================")
            print("====================================pagedowns : ", pagedowns, "====================================")
            print("=======================================================================================")
            print("id===============================", id)
            print("content==========================", content)
            print("like=============================", like)
            print("finaltag=========================", tags)
            feedRow["id"] = id
            feedRow["content"] = content
            feedRow["tag"] = tags
            feedRow["like"] = like
            feedList.append(feedRow)
            time.sleep(1)
            # click the next post
            try:
                driver.find_element_by_class_name("HBoOv").click()
            except NoSuchElementException:
                # wait for web resources
                driver.get(url)
                driver.implicitly_wait(1)
                for i in range(0, pagedowns):
                    driver.find_elements_by_class_name("eLAPa")[0].click()
            # html = driver.find_element_by_tag_name("html")
            # html.send_keys(Keys.DOWN)
            pagedowns += 1
            print("=======================================================================================")
            print("=======================================================================================")
        except (NoSuchElementException, StaleElementReferenceException, TimeoutException):
            failCnt += 1
            print("=======================================================================================")
            print("====================================failcount : ", failCnt, "=====================================")
            print("=======================================================================================")
            if failCnt > 3:
                driver.find_element_by_class_name("HBoOv").click()
                time.sleep(120)
            pass
    print("Done!!")
    # after removing duplicate hashtags, reassign as a list of tuples
    tagList = list([tuple(set(tag)) for tag in tagList])
    # count the hashtags
    for htags in tagList:
        for htag in htags:
            # increment the hashtag count
            if not (htag in hashtag):
                hashtag[htag] = 1
            else:
                hashtag[htag] += 1
    # sort
    keys = sorted(hashtag.items(), key=lambda x: x[1], reverse=True)
    # print up to rank n
    for k, v in keys[:15]:
        print("{}({})".format(k, v))
    end = datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")
    print("start======", start)
    print("end======", end)
    print("enddivision=========",
          datetime.datetime.strptime(end, "%Y_%m_%d %H:%M:%S") - datetime.datetime.strptime(start, "%Y_%m_%d %H:%M:%S"))
    # result = pd.DataFrame(feedList)
    # result.columns = ['id','content','tag','like']
    # result.head()
    # shut down the web resources
    driver.close()
    crawlingList["ttlfeed"] = ttlFeed.text
    crawlingList["crwfeed"] = len(tagList)
    crawlingList["succnt"] = pagedowns
    crawlingList["failcnt"] = failCnt
    crawlingList["created_at"] = start
    crawlingList["updated_at"] = end
    crawlingList["working_while"] = str(
        datetime.datetime.strptime(end, "%Y_%m_%d %H:%M:%S") - datetime.datetime.strptime(start, "%Y_%m_%d %H:%M:%S"))
    returnList["crawlingList"] = crawlingList
    returnList["tagList"] = keys
    returnList["excelList"] = feedList
    return returnList