def crawl(): PATIENCE = 15 MAX_RETRY = 3 SOURCE_NAME = "조선일보" MAIN_URL = "http://www.chosun.com/" driver = util.get_driver() driver.get(MAIN_URL) driver.set_page_load_timeout(PATIENCE) article_link_patterns = ["news.chosun.com/site/data/html_dir/"] link_list = [] timeout_cnt = 0 skipped_cnt = 0 # href_elms = driver.find_elements_by_class_name("sec_con")[1].find_elements_by_css_selector("[href]") href_elms = WebDriverWait(driver, PATIENCE) \ .until(EC.presence_of_all_elements_located((By.CLASS_NAME, "sec_con")))[1] \ .find_elements_by_css_selector("[href]") # live_elms = driver.find_elements_by_css_selector("#today_live_con_id [href]") live_elms = WebDriverWait(driver, PATIENCE) \ .until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#today_live_con_id [href]"))) href_elms += live_elms for i in href_elms: href = i.get_attribute("href") for p in article_link_patterns: if p in href: try: link_list.index(href) except ValueError: link_list.append(href) break print("%d articles found" % len(link_list)) for i in link_list: # Retry loop for retry in range(0, 3): try: article = extract_news(driver, i) if article is not None: util.post(article, SOURCE_NAME) break else: continue except (TimeoutException, NoSuchElementException, StaleElementReferenceException): if retry == MAX_RETRY - 1: skipped_cnt += 1 else: driver.refresh() timeout_cnt += 1 driver.quit() print("Done with %d timeouts and %d skipped pages in %d links" % (timeout_cnt, skipped_cnt, len(link_list)))
def crawl(): PATIENCE = 15 MAX_RETRY = 3 SOURCE_NAME = "동아일보" MAIN_URL = "http://www.donga.com/" driver = util.get_driver() driver.set_page_load_timeout(PATIENCE) driver.get(MAIN_URL) INCLUDE_URLS = ["news.donga.com/Main", "news.donga.com/MainTop"] article_links = [] timeout_cnt = 0 skipped_cnt = 0 href_elms = driver.find_elements_by_css_selector("[href]") for e in href_elms: href = e.get_attribute("href") for i in INCLUDE_URLS: if i in href: article_links.append(href) break print("%d articles found" % len(article_links)) for i in article_links: for retry in range(0, MAX_RETRY): try: article = extract(driver, i) util.post(article, SOURCE_NAME) break except (TimeoutException, NoSuchElementException, StaleElementReferenceException): if retry == MAX_RETRY - 1: skipped_cnt += 1 else: driver.refresh() timeout_cnt += 1 driver.quit() print("Done with %d timeouts and %d skipped pages in %d links" % (timeout_cnt, skipped_cnt, len(article_links)))
def xmlrpc_get_driver(self, iface):
    """ Return the driver version for the given interface. """
    return util.get_driver(iface)
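# The xmlrpc_ method prefix matches Twisted's XML-RPC naming convention, which
# suggests the method above lives in a twisted.web.xmlrpc.XMLRPC subclass. A
# hedged sketch of how it might be wired up (class name and port are assumptions):
import util

from twisted.internet import reactor
from twisted.web import server, xmlrpc


class DriverInfo(xmlrpc.XMLRPC):  # hypothetical resource class
    def xmlrpc_get_driver(self, iface):
        """ Return the driver version for the given interface. """
        return util.get_driver(iface)


reactor.listenTCP(7080, server.Site(DriverInfo()))  # assumed port
reactor.run()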
from time import sleep

from util import get_driver

driver = get_driver()

# 1. Get the phone's screen resolution; it is typically used with swipe() to build scroll gestures
size = driver.get_window_size()
print('Resolution:', size)
width = size.get('width')
height = size.get('height')

# Scroll down: finger moves from 80% to 20% of the screen height over 2 seconds
driver.swipe(width * 0.5, height * 0.8, width * 0.5, height * 0.2, 2000)
sleep(3)

# Scroll right: finger moves from 80% to 20% of the screen width over 2 seconds
driver.swipe(width * 0.8, height * 0.5, width * 0.2, height * 0.5, 2000)
sleep(3)

# Close the app and end the session only after both gestures have run
driver.close_app()
driver.quit()
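# driver.swipe() is a legacy Appium helper; newer Appium Python clients steer
# toward W3C pointer actions instead. A hedged equivalent of the gestures above,
# assuming Selenium 4 / Appium-Python-Client 2.x (the helper name w3c_swipe is ours):
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.actions import interaction
from selenium.webdriver.common.actions.action_builder import ActionBuilder
from selenium.webdriver.common.actions.pointer_input import PointerInput


def w3c_swipe(driver, start_x, start_y, end_x, end_y, pause_s=2):
    """Press, drag, and release with a synthetic touch pointer."""
    actions = ActionChains(driver)
    actions.w3c_actions = ActionBuilder(
        driver, mouse=PointerInput(interaction.POINTER_TOUCH, "touch"))
    actions.w3c_actions.pointer_action.move_to_location(start_x, start_y)
    actions.w3c_actions.pointer_action.pointer_down()
    actions.w3c_actions.pointer_action.pause(pause_s)
    actions.w3c_actions.pointer_action.move_to_location(end_x, end_y)
    actions.w3c_actions.pointer_action.release()
    actions.perform()


# The same scroll-down gesture as above, expressed with W3C actions:
# w3c_swipe(driver, int(width * 0.5), int(height * 0.8), int(width * 0.5), int(height * 0.2))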
def setup_class(self):
    # Declare our driver object
    self.driver = get_driver()
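# setup_class only acquires the driver; under pytest's xunit-style conventions it
# would normally be paired with a teardown_class that releases it. A minimal
# sketch of the full test class (class, test name, and URL are hypothetical):
from util import get_driver


class TestSmoke:
    def setup_class(self):
        # Declare our driver object
        self.driver = get_driver()

    def teardown_class(self):
        # Quit the browser once every test in the class has run
        self.driver.quit()

    def test_homepage_loads(self):
        self.driver.get("http://example.com")  # placeholder URL
        assert self.driver.title  # hypothetical assertion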
#!/usr/bin/env python
# coding: utf-8

# In[1]:

from util import get_driver, read_lines, write_lines, dump_pickle, load_pickle
from pprint import pprint
from tqdm import tqdm
from collections import OrderedDict
import os

# In[2]:

driver = get_driver(browser='chrome', headless=True)

# In[3]:

entries = read_lines('roots.txt')
vocab = OrderedDict()
for entry in tqdm(entries):
    lst = entry.split()
    word = lst[1]
    # Ignore same word with different pos tag
    if word not in vocab:
        vocab[word] = [lst[0]] + lst[2:]

# In[4]:

def get_num_related(word, verbose=False):
    driver.get(f'https://www.merriam-webster.com/thesaurus/{word}')
def start_listener():
    global activities
    backup_reload(False)
    while service_on:
        with util.get_driver(True) as driver:
            insta_login(driver)
            x = 0
            while service_on and x < 500:
                # Run the follow routine once every 40 iterations
                if x == 0:
                    do_follows(driver)
                    x = 40
                else:
                    x = x - 1
                messages = load_messages(driver)
                for user, msgs in messages.items():
                    for msg in msgs:
                        if isinstance(msg, list) and len(msg) == 2:
                            activity_type = "sent a Text '{0}'".format(msg[1])
                            if msg[0] == "Image":
                                activity_type = "shared a photo. URL '{0}'".format(msg[1])
                                send_message(driver, user, [
                                    ":robot_face: {0} - Nice photo :heart_eyes:, but I can't do anything with it right now :confounded:"
                                    .format(config.BOT_NAME)
                                ])
                            if msg[0] == "Video":
                                activity_type = "shared a video. URL '{0}'".format(msg[1])
                                send_message(driver, user, [
                                    ":robot_face: {0} - Great video :ok_hand:, but I can't do anything with it right now :confounded:"
                                    .format(config.BOT_NAME)
                                ])
                            if msg[0] == "Post":
                                activity_type = "shared a Post or Story. URLs: '{0}'".format(msg[1])
                                if msg[1]:
                                    send_message(driver, user, [
                                        ":arrow_down: Post Downloader - here are the links to the media:"
                                    ])
                                    for url in msg[1]:
                                        send_message(driver, user, [url])
                                    send_message(driver, user, [
                                        ":muscle: powered by: @{0} :sunglasses:"
                                        .format(config.INSTA_USER)
                                    ])
                                else:
                                    send_message(driver, user, [
                                        ":warning: Post Downloader - no media found!"
                                    ])
                            if msg[0] == "Text":
                                if msg[1].startswith("!"):
                                    cmd = msg[1].split(" ")
                                    activity_type = "issued Command: '{0}' with Args: '{1}'".format(
                                        cmd[0], " ".join(cmd[1:]))
                                    process_command(driver, user, cmd[0], cmd[1:])
                            log_activity("[{0}] '{1}' {2}".format(
                                time.ctime(time.time()), user, activity_type))
                # Flush collected activity entries to the log file
                util.append_list(activities, os.path.join('data', 'activities_log'))
                activities = []
                time.sleep(random.uniform(5, 10))
    backup_reload(True)
from tqdm import tqdm
import json
import os

from selenium.common.exceptions import NoSuchElementException

from util import get_driver, read_lines, dump_pickle

URL = "https://www.collinsdictionary.com/us/dictionary/english/"
XPATH = {
    "pos": '//span[@class="gramGrp pos"]',
    "meaning": '//div[@class="def"]',
    "sent": '//div[@class="cit type-example quote"]',
}

lines = read_lines('sorted_importance.txt')
driver = get_driver(headless=True)

if os.path.exists('book.json'):
    with open('book.json') as fp:
        dictionary = json.load(fp)
else:
    dictionary = {}

for i, line in enumerate(tqdm(lines)):
    word, freq, use = line.split()[:3]
    if word not in dictionary:
        dictionary[word] = {
            'freq': freq,
            'use': use,
        }
        driver.get(URL + word.lower())
        for key, xpath in XPATH.items():
            try:
                # The source excerpt is cut off here; an assumed continuation that
                # scrapes each field and skips any missing from the page:
                dictionary[word][key] = driver.find_element_by_xpath(xpath).text
            except NoSuchElementException:
                pass
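# The script above loads book.json on startup, but the excerpt ends before any
# save. An assumed write-back step after the loop (the filename comes from the
# source; the rest is ours):
with open('book.json', 'w') as fp:
    json.dump(dictionary, fp, indent=2)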
def crawl(): PATIENCE = 15 MAX_RETRY = 3 SOURCE_NAME = "중앙일보" MAIN_URL = "https://joongang.joins.com/" INCLUDE_URLS = [ "news.joins.com/article" ] inclusion_filtered = [] link_list = [] timeout_cnt = 0 skipped_cnt = 0 driver = util.get_driver() driver.set_page_load_timeout(PATIENCE) done = False for r in range(0, PATIENCE): try: driver.get(MAIN_URL) done=True # href_elms = driver.find_elements_by_css_selector("[href]") href_elms = WebDriverWait(driver, PATIENCE) \ .until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[href]"))) except TimeoutException: continue if not done: driver.quit() sys.exit(1) for i in href_elms: href = i.get_attribute("href") for j in INCLUDE_URLS: if j in href: inclusion_filtered.append(href) break # 링크 정제 과정 for href in inclusion_filtered: if "?" in href: href = href.split("?")[0] if href is not None: try: link_list.index(href) except ValueError: # 추가할 링크가 리스트에 없는 경우 => 중복되지 않는 경우 link_list.append(href) print("%d articles found" % len(link_list)) for i in link_list: for retry in range(0, MAX_RETRY): try: article = extract(driver=driver, url=i) if article is not None: util.post(article, SOURCE_NAME) break except (TimeoutException, NoSuchElementException, StaleElementReferenceException): if retry == MAX_RETRY - 1: skipped_cnt += 1 else: try: driver.refresh() except TimeoutException: skipped_cnt += 1 continue timeout_cnt += 1 driver.quit() print("Done with %d timeouts and %d skipped pages in %d links" % (timeout_cnt, skipped_cnt, len(link_list)))