import time

from pyquery import PyQuery as pq
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.action_chains import ActionChains


def catalog_url(url='http://www.meitun.com/'):
    # The catalog is loaded via AJAX, so render the page with PhantomJS.
    driver = PhantomJS()
    driver.get(url)
    driver.maximize_window()
    mov_ele = driver.find_element_by_css_selector('.nav>ul>li:nth-child(1)')
    # Hover over the lazily loaded menu element to trigger the AJAX request.
    ActionChains(driver).move_to_element(mov_ele).perform()
    time.sleep(3)
    response = driver.page_source
    driver.quit()
    # Parse the saved page source with pyquery, which is faster than
    # querying the live DOM through the driver.
    d = pq(response)
    return map(lambda x: 'http:' + pq(x).attr('href'), d.find('.cg-pdts a'))
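# Usage sketch for catalog_url(). In Python 3 the returned map object is lazy,
# so a caller has to iterate it (or call list()) to actually build the URLs.
if __name__ == '__main__':
    for link in catalog_url():
        print(link)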
def on_start_again(self, url):
    driver = PhantomJS()
    driver.get(url)
    time.sleep(2)
    driver.maximize_window()
    t = driver.find_element_by_css_selector('.page-txt').text
    res_t = []
    if t:
        # Get the page count; the number of page turns needed is the count minus one.
        t = int(t.split('/')[1][:-1]) - 1
        while t:
            t -= 1
            move_ele = driver.find_element_by_css_selector('#next')
            # ActionChains only queues actions; .perform() is required to run them.
            ActionChains(driver).move_to_element(move_ele).click().perform()
            time.sleep(1)
            res_t.append(driver.page_source)
    driver.quit()
    for item in res_t:
        self.step_first(item)
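# The queue-then-perform contract of ActionChains is easy to trip over: without
# the final .perform(), the move and click above would never fire. A minimal
# standalone sketch (the URL and the '#next' selector are placeholders):
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.action_chains import ActionChains

driver = PhantomJS()
driver.get('http://example.com')  # placeholder URL
button = driver.find_element_by_css_selector('#next')  # placeholder selector
ActionChains(driver).move_to_element(button).click().perform()
driver.quit()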
import time
import logging
from datetime import datetime

from bs4 import BeautifulSoup
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

LOGGER = logging.getLogger(__name__)


class CamaraCGCrawler(object):
    """ Camara CG Ementa Crawler """

    def __init__(self, starting_year):
        self.base_url = "http://187.115.174.90:8080/ScanLexWeb"
        self.starting_year = starting_year
        self.browser = None

    @staticmethod
    def get_ementa_id(published_date, ementa_type, ementa_doc_number, ementa_situation):
        """ Return the ementa unique id """
        return "%s#%s#%s#%s" % (datetime.strftime(published_date, "%Y-%m-%d"),
                                ementa_type, ementa_doc_number, ementa_situation)

    def get_all_ementas_summary(self):
        """ Yield the next ementa information row """
        browser_table = self.browser.find_element_by_id("frmMenu:tabEmentas_data")
        bs_ementa_table = BeautifulSoup(browser_table.get_attribute("innerHTML"))

        for row in bs_ementa_table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) == 6:
                published_date = datetime.strptime(cols[0].span.text.encode("utf-8"), "%d/%m/%Y")
                doc_number = int(cols[1].span.text.encode("utf-8"))
                title = cols[2].span.text.encode("utf-8")
                ementa_type = cols[3].span.text.encode("utf-8")
                ementa_situation = cols[4].span.text.encode("utf-8")
                details_js = cols[5].a['onclick'].encode("utf-8")

                # Skip rows dated in the future.
                if published_date > datetime.now():
                    continue

                yield published_date, doc_number, title, ementa_type, ementa_situation, details_js

    def get_ementa_details(self, ementa_details_js):
        """ Crawl the second ementa page """
        # Wait for the details panel and the proponents table to render.
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located((By.ID, "frmfuncao:j_idt13_content")))
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located((By.ID, "frmfuncao:tabProponentes")))

        # Get the ementa details.
        bs_ementa_details = BeautifulSoup(
            self.browser.find_element_by_id(
                "frmfuncao:j_idt13_content").get_attribute("innerHTML"))
        rows = bs_ementa_details.find_all("tr")
        source = rows[3].td.text
        main_theme = rows[7].td.text
        sys_enter_date = datetime.strptime(rows[9].td.text, "%d/%m/%Y")
        approval_date = datetime.strptime(rows[11].td.text, "%d/%m/%Y")
        process_number = int(rows[15].td.text or "-1")
        autograph_number = int(rows[19].td.text or "-1")
        process_year = int(rows[21].td.text or "-1")
        has_image = rows[23].td.text == "Sim"

        # Get the proponent names.
        bs_proponent = BeautifulSoup(
            self.browser.find_element_by_id(
                "frmfuncao:tabProponentes").get_attribute("innerHTML"))
        proponents = ",".join([col.text for col in bs_proponent.find_all("td")])

        return source, proponents, main_theme, sys_enter_date, approval_date, process_number, \
            autograph_number, process_year, has_image

    def next_ementa(self, select_curs):
        """ Iterate over the years onwards and collect all the ementas """
        try:
            LOGGER.info("Opening Browser")
            self.browser = PhantomJS()
            LOGGER.info("GET [%s]", self.base_url)
            self.browser.maximize_window()

            cur_year = int(datetime.now().year)

            # Define the initial collection year: resume from the newest
            # ementa already stored, never earlier than starting_year.
            select_curs.execute(
                "SELECT EXTRACT (YEAR FROM MAX(published_date)) FROM ementas;")
            last_exec_year = select_curs.fetchone()
            if last_exec_year:
                collection_year = max(self.starting_year, last_exec_year[0])
            else:
                collection_year = self.starting_year

            all_proponents = [
                "ANDERSON MAIA", "Afonso Alexandre Régis", "Alcides Cavalcante",
                "Alcindor Villarim", "Aldo Cabral", "Alexandre do Sindicato",
                "Antonio Pereira", "Antônio Alves Pimentel Filho", "Aragão Júnior",
                "Bruno Cunha Lima Branco", "Bruno Gaudêncio", "Buchada",
                "Cassiano Pascoal", "Cozete Babosa", "Cássio Murilo Galdino de Araujo",
                "Daniella Ribeiro", "Dr. Nunes", "Executivo", "Fabrinni Brito",
                "Fernando carvalho", "Francisco Dantas Lira", "Galego do Leite",
                "Inacio Falcao", "Ivan Batista", "Ivonete Ludgerio", "Joao Dantas",
                "Josimar Henrique da Silva", "José Marcos Raia ", "José Ribamar",
                "João Dantas", "Jóia Germano", "Laelson Patricio", "Lafite",
                "Lindaci Medeiros Nápolis", "Lourdes Costa", "Lula Cabral",
                "Marcos Marinho", "Maria Lopes Barbosa", "Marinaldo Cardoso",
                "Metuselá Agra", "Miguel Rodrigues da Silva", "Miguel da Construção",
                "Napoleão Maracajá", "Nelson Gomes Filho", "Olimpio Oliveira",
                "Orlandino Farias", "Paulo Muniz", "Paulo de Tarso",
                "Peron Ribeiro Japiassú", "Renato Feliciano", "Rodolfo Rodrigues",
                "Rodrigo Ramos Victor", "Romero Rodrigues", "Rostand Paraíba",
                "Rômulo Gouveia", "Saulo Germano", "Saulo Noronha", "Tia Mila",
                "Tovar Correia Lima", "Vaninho Aragão", "Veneziano Vital do rego",
                "Walter Brito Neto", "Todos"
            ]

            while collection_year <= cur_year:
                for i_prop in range(len(all_proponents)):
                    ementa_prop = all_proponents[i_prop].decode("utf-8")
                    self.browser.get(self.base_url)

                    # Wait until the search form is clickable.
                    WebDriverWait(self.browser, 30).until(
                        EC.element_to_be_clickable((By.ID, "frmMenu:button1")))

                    LOGGER.info("Collecting Ementas from [%d][%s - %d/%d]",
                                collection_year, ementa_prop,
                                i_prop + 1, len(all_proponents))

                    # Set the year.
                    year_field = self.browser.find_element_by_id("frmMenu:ano")
                    year_field.send_keys(collection_year)

                    # Set the proponent.
                    proponent_field = self.browser.find_element_by_id("frmMenu:autoridade")
                    proponent_field.send_keys(ementa_prop)

                    # Submit the form.
                    self.browser.find_element_by_id("frmMenu:button1").click()

                    # Waiting...
                    # _ = WebDriverWait(self.browser, 60).until(EC.visibility_of_element_located((By.ID, "frmMenu:tabEmentas_data")))
                    time.sleep(3)

                    for published_date, document_number, title, ementa_type, \
                            ementa_situation, ementa_details_js in self.get_all_ementas_summary():
                        ementa_id = self.get_ementa_id(published_date, ementa_type,
                                                       document_number, ementa_situation)
                        select_curs.execute("""
                            SELECT ementa_id FROM ementas WHERE ementa_id = '%s';
                        """ % ementa_id)
                        if not select_curs.fetchone():
                            # Run the details script.
                            self.browser.execute_script(ementa_details_js)

                            ementa_source, proponents, main_theme, sys_enter_date, approval_date, \
                                process_number, autograph_number, process_year, has_image = \
                                self.get_ementa_details(ementa_details_js)

                            # Go back to the table page.
                            self.browser.back()

                            # Wait until the results table is visible again.
                            _ = WebDriverWait(self.browser, 60).until(
                                EC.visibility_of_element_located(
                                    (By.ID, "frmMenu:tabEmentas_data")))

                            yield ementa_id, published_date, ementa_type, document_number, title, \
                                ementa_source, proponents, ementa_situation, main_theme, \
                                sys_enter_date, approval_date, process_number, \
                                autograph_number, process_year, has_image

                LOGGER.info("DONE [%d]", collection_year)
                self.browser.back()
                collection_year += 1
        finally:
            if self.browser:
                self.browser.quit()
class CNStock(SentimentCrawler):
    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')
        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except TimeoutException:
            CustomLogging.log_to_file('failed to open 中国证券网', LogType.ERROR)
            return []  # bail out if the search box never appeared
        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword + Keys.ENTER)
        return self.crawl_search_results()

    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()
        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located((By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网 search results page error',
                                          LogType.ERROR)
                break
            try:
                result_articles = self.driver.find_elements_by_class_name('result-article')
                for each_article in result_articles:
                    item = Entity()
                    publish_date = each_article.find_element_by_class_name('g').text
                    item.publish_date = re.search(
                        re.compile(
                            r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()
                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # Break out of the for loop.
                        break
                    item.short_description = each_article.find_element_by_class_name('des').text
                    item.title = each_article.find_element_by_tag_name('a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)
                    item.url = each_article.find_element_by_tag_name('a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item,)).start()
                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('no search results', LogType.INFO)
                break
            try:
                # "下一页" is the on-page text of the "next page" link.
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]')
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break
        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file(
                'page parse error: {0}|{1}'.format(self.name, url), LogType.ERROR)
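# The set_page_load_timeout / window.stop() idiom in crawl_main_page is a
# useful pattern on its own: it caps how long the browser blocks on a slow
# page, but keeps whatever DOM has already rendered for scraping. A minimal
# standalone sketch (the URL is just the one from the snippet above):
from selenium.webdriver import PhantomJS
from selenium.common.exceptions import TimeoutException

driver = PhantomJS()
driver.set_page_load_timeout(10)  # give up on the full load after 10 s
try:
    driver.get('http://www.cnstock.com/')
except TimeoutException:
    driver.execute_script('window.stop();')  # stop loading, keep current DOM
html = driver.page_source
driver.quit()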
"""
Created on Tue Aug 7 15:48:28 2018

@author: 肖

Target page: http://ac.qq.com/ComicView/index/id/521825/cid/1. The site blocks
viewing the source directly, but prefixing the URL with view-source: reveals
it, i.e. view-source:http://ac.qq.com/ComicView/index/id/521825/cid/1
"""
from selenium.webdriver import PhantomJS, DesiredCapabilities
import time
import re

header = DesiredCapabilities.CHROME.copy()  # DesiredCapabilities can masquerade as Chrome
# The PhantomJS executable path must be set explicitly, otherwise the driver won't start.
web = PhantomJS(desired_capabilities=header,
                executable_path='F:/phantomjs-2.1.1-windows/bin/phantomjs')
web.maximize_window()  # maximize the browser window
web.get('http://ac.qq.com/ComicView/index/id/521825/cid/1')  # load the page
# Screenshot the page and save it to the given path as abc.png.
web.get_screenshot_as_file('./abc.png')

for page in range(1, 30):
    # window.scrollTo(0, y) scrolls down one screen per iteration:
    # 1080 * 1 is the first screen, 1080 * 2 the second, and so on.
    web.execute_script('window.scrollTo(0,{})'.format(1080 * page))
    time.sleep(1)
web.get_screenshot_as_file('./abc.png')  # screenshot of the last page

# Extract the image URLs with a regex; web.page_source is the rendered HTML.
pat = 'https://manhua.qpic.cn/vertical/0/(.*?)"'
ls = re.compile(pat, re.S).findall(web.page_source)

import urllib.request as r
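# The original script stops at the urllib import. A hypothetical completion,
# assuming the captured tails are meant to be appended back onto the URL
# prefix from the regex and saved one file per image:
for i, tail in enumerate(ls):
    r.urlretrieve('https://manhua.qpic.cn/vertical/0/' + tail,
                  './{}.jpg'.format(i))  # hypothetical output filenames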