# NOTE: imports reconstructed from usage (fp.parse -> feedparser, Article -> newspaper).
# Project-local helpers (url_encode, scrapeCNN, scrapeBBC, scrapeFOX, export_json,
# all_data, run_sample) are assumed to be defined elsewhere.
import os
from datetime import datetime
from time import mktime

import feedparser as fp
from newspaper import Article
from selenium import webdriver


def main_func(search_term):
    search_term = url_encode(search_term)
    # %AZ_BATCH_NODE_WORKING_DIR% is cmd.exe syntax and is not expanded by
    # Python, so resolve the environment variable explicitly
    browser = webdriver.Chrome(
        os.path.join(os.environ["AZ_BATCH_NODE_WORKING_DIR"], "chromedriver 2"))
    scrapeCNN(browser, search_term)
    scrapeBBC(browser, search_term)
    scrapeFOX(browser, search_term)
    export_json()


# Set the limit for the number of articles to download
LIMIT = 30
data = {}
data['newspapers'] = {}
documents = {"documents": []}
count = 1

# Iterate through each news company
for company, value in all_data.items():
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Check whether a publish date is provided; if not, the article is skipped.
            # This keeps the data consistent and keeps the script from crashing.
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(
                    mktime(date)).isoformat()
                try:
                    content = Article(entry.link)
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download fails for some reason (e.g. a 404),
                    # continue with the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                newsPaper['articles'].append(article)
                print(count, "articles downloaded from", company, ", url: ", entry.link)
                count = count + 1
    else:
        # This is the fallback method if an RSS-feed link is not provided.
        # It uses the Python newspaper library to extract articles.
        print("Building site for ", company)
        for link in value['link']:
            content = Article(link)
            newsPaper = {"link": link, "articles": []}
            noneTypeCount = 0
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, if no publish date is found the article is skipped.
            # After 10 articles from the same newspaper without a publish date,
            # the company is skipped.
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            if content.publish_date is not None:
                article['published'] = content.publish_date.isoformat()
            newsPaper['articles'].append(article)
            info = {}
            if len(content.text) < 5100:
                info["id"] = company + str(count)
                info["title"] = content.title
                info['link'] = content.url
                info['source'] = company
                info["language"] = "en"
                info["text"] = content.text
                documents["documents"].append(info)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
            noneTypeCount = 0
    data['newspapers'][company] = newsPaper

run_sample()
from selenium import webdriver

driver = webdriver.Chrome(
    '/home/alejandro/Desarrollo de Software/Laboratorio4/chromedriver')
driver.get("https://www.python.org")
boton = driver.find_element_by_xpath('//*[@id="news"]/a')
boton.click()
# renamed from `input` (shadowed the builtin); clear the search field rather
# than the link, which is stale after the click navigates away
search_field = driver.find_element_by_xpath('//*[@id="id-search-field"]')
search_field.send_keys('disc')
search_field.clear()
# You should run retrain.py before using this
import datetime, json, math, re
from html import unescape
from time import sleep

import tweepy
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.common.keys import Keys

import Markov

# Configurable options
delay = 10  # time to wait on each page load before reading the page
driver = webdriver.Chrome()  # options are Chrome() Firefox() Safari()

try:
    with open("data.json") as f:
        data = json.load(f)
except IOError as e:
    print(e)
    exit()

print("Connecting to Twitter API")
# connect to the Twitter API
auth = tweepy.OAuthHandler(
    data["keys"]["consumer_token"],
    data["keys"]["consumer_secret"],
    "https://auth.r0uge.org",
)
class runTestCase(object):
    log = Logger(level="debug").logger
    toFileLog = Logger(level="debug")
    module_path = path.pathutil().rootPath
    imagepath = ''
    datatypemenu = ['ElementsInfo', 'InputData', 'OutputData']
    browser = webdriver.Chrome()
    urlReg = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    def __init__(self):
        # Dispatch table mapping the Chinese action keywords used in the test
        # data to handler methods: 打开 (open), 点击 (click), 输入 (input),
        # 清除 (clear), 等待 (wait), 断言 (assert), 引入 (introduce),
        # 最大化 (maximize).
        self.switch = {
            "打开": lambda casedealinfoDic, ifrerun: self.test_open(
                casedealinfoDic, ifrerun),
            "点击": lambda casedealinfoDic, ifrerun: self.test_click(
                casedealinfoDic, ifrerun),
            "输入": lambda casedealinfoDic, ifrerun: self.test_input(
                casedealinfoDic, ifrerun),
            "清除": lambda casedealinfoDic, ifrerun: self.test_clear(
                casedealinfoDic, ifrerun),
            "等待": lambda casedealinfoDic, ifrerun: self.test_wait(
                casedealinfoDic, ifrerun),
            "断言": lambda casedealinfoDic, ifrerun: self.test_assertion(
                casedealinfoDic, ifrerun),
            "引入": lambda casedealinfoDic, ifrerun: self.test_introduce(
                casedealinfoDic, ifrerun),
            "最大化": lambda casedealinfoDic, ifrerun: self.test_maximize(
                casedealinfoDic, ifrerun)
        }

    # Helper that saves a screenshot
    def save_img(self, img_name):
        self.imagepath = self.module_path + '/Output/Resources/' + img_name + '.png'
        self.browser.get_screenshot_as_file(self.imagepath)

    # Setup function, executed before every test case
    def setUp(self, url):
        self.starttime = parse(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        self.browser.set_window_size(1920, 1080)
        print("Test start time:", self.starttime)
        self.browser.get(url)
        # time.sleep(3)

    # Teardown function, executed after every test case
    def tearDown(self):
        # time.sleep(3)
        self.browser.quit()
        self.endtime = parse(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        print("Test end time:", self.endtime)
        totaltime = (self.endtime - self.starttime).total_seconds()
        print("Total duration:", totaltime, "seconds")

    def switchRun(self, casedealinfoDic):
        ifrerun = False
        if casedealinfoDic[1][0] is not None:
            if self.urlReg.match(DPD.getAllData().checkIfExists(
                    self.datatypemenu[1], casedealinfoDic[1][0])):
                ifrerun = True
        try:
            self.switch[casedealinfoDic[1][2]](casedealinfoDic, ifrerun)
        except KeyError as e:
            self.log.error(e)

    def test_open(self, casedealinfoDic, ifrerun):
        if ifrerun:
            self.setUp(DPD.getAllData().checkIfExists(self.datatypemenu[1],
                                                      casedealinfoDic[1][0]))
            self.save_img(casedealinfoDic[0])
            self.tearDown()

    def test_click(self, casedealinfoDic, ifrerun):
        pass

    def test_input(self, casedealinfoDic, ifrerun):
        pass

    def test_clear(self, casedealinfoDic, ifrerun):
        pass

    def test_wait(self, casedealinfoDic, ifrerun):
        pass

    def test_assertion(self, casedealinfoDic, ifrerun):
        pass

    def test_introduce(self, casedealinfoDic, ifrerun):
        pass

    def test_maximize(self, casedealinfoDic, ifrerun):
        pass
def __init__(self):
    super().__init__()
    self.driver = webdriver.Chrome(
        os.path.join(settings.BASE_DIR, "drivers/chromedriver"))
def ChromeDriverBrowser(self):
    driverChrome = webdriver.Chrome()
    return driverChrome
def setUp(self):
    self.browser = webdriver.Chrome()
    self.browser.implicitly_wait(3)  # Selenium will wait up to 3 seconds for elements
import xlsxwriter
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.alert import Alert
import re
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# import config
# import config_request

workbook = xlsxwriter.Workbook('kdx_3001~3500.xlsx')
worksheet = workbook.add_worksheet()

chrome_options = Options()
chrome_options.add_argument('headless')
chrome_options.add_argument('--log-level=3')
driver = webdriver.Chrome('./chromedriver/chromedriver', chrome_options=chrome_options)

data = []
for num in range(3001, 3182):
    driver.get(f'https://kdx.kr/data/view?product_id={num}')
    try:
        WebDriverWait(driver, 1).until(EC.alert_is_present())
        Alert(driver).accept()
        print(f'id:{num} error')
        continue
    except TimeoutException:
        print(f'id:{num} crawling')
        # print("Alert not found. Move on...")
# go and slowly add all the 1M who follow magic_fox? or selected? or Samsoe et Samsoe?
# How about you graph the number of followers the people who follow magic_fox have?
# Bet there is a bunch of bots at the high end, but the vast majority have < 2000 followers.
# These are the real people to target.
# Look at the people who like the #menswear posts --- follow them and like a photo
# Look at the people who follow the menswear bloggers --- follow them and like a photo

TagList = ['mensfashion', 'menswear', 'menstyle', 'mensstyle',
           'menwithclass', 'menwithstyle']
Posts = []
driver = webdriver.Chrome('/Users/brianb/Dropbox/Instagram/chromedriver')


def RandomSleep(low, high):
    sleep(randint(low, high))


def RandomLike(percent):
    # Return True roughly `percent` percent of the time
    NumForUse = 100 - percent
    RanNum = randint(0, 100)
    if RanNum > NumForUse:
        return True
    else:
        return False


def Login():
    driver.get('https://www.instagram.com/')
    User = Secrets.get('User')
def set_driver(self):
    self.driver = webdriver.Chrome()
    print('[+] Driver started...')
def __init__(self):
    self.driver = webdriver.Chrome()
    self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
    self.positions = []
def __init__(self):
    self.driver = webdriver.Chrome(
        executable_path=r"E:\chromedriver\chromedriver.exe")
        for alist in span.find_all('a'):
            each_authors.append(alist.string)
        all_authors.append(each_authors)
'''

if __name__ == "__main__":
    # A professor's Baidu Scholar homepage
    url = "http://xueshu.baidu.com/scholarID/CN-B8748R1J"
    chrome_options = Options()
    # specify headless mode
    chrome_options.add_argument("--headless")
    browser = webdriver.Chrome(options=chrome_options)
    browser.set_page_load_timeout(600)
    browser.set_script_timeout(600)
    browser.get(url)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '//*[@id="articlelist_container"]/div[2]/div[1]/div[20]')))  # index="19"
    # browser.implicitly_wait(10)
    title = browser.find_elements_by_xpath('//*[@id="articlelist_container"]/div[2]/div[1]')
    content = title[0].get_attribute('innerHTML')
    soup = BeautifulSoup(content, 'lxml')
    Title, ArticleUrl = HomePageInfo(browser)
    # Start on content from the second page onward
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import os
import re

# Enter the path of the driver
# browser = webdriver.Chrome()
browser = webdriver.Chrome(
    executable_path=r"C:\Users\Parth\Documents\540\android_review_crawler\ios_Xamarin_and_React_apps_reviews\chromedriver.exe")

# change path depending on whether it's native or framework
# path = "./Native_ios_apps_reviews"
path = "C:\\Users\\Parth\\Documents\\540\\android_review_crawler\\ios_Xamarin_and_React_apps_reviews"

# Tell Selenium to get the URL you're interested in.
data = pd.read_csv("apps data.csv", index_col=False)
main_info = []

for d in range(len(data)):
    # app_name = data.iloc[d][0]
    url = data.iloc[d][1]
    # url = "https://play.google.com/store/apps/details?id=com.facebook.katana&showAllReviews=true"
    browser.get(url)
    SCROLL_PAUSE_TIME = 0.5
    time.sleep(5)  # wait for the DOM to be ready
    page = browser.page_source
    soup_expatistan = BeautifulSoup(page, "html.parser")
    app_name = soup_expatistan.find(
        "h1", class_="product-header__title app-header__title").text.split()[0]
    overall_and_number = soup_expatistan.find("li",
def activateAndSubscribe(name):
    goToServiceTemplates()
    clickOnGlobalListName(name)
    # Activate subscription
    if driver.find_elements_by_class_name("s-btn")[3].text == 'Activate':
        driver.find_elements_by_class_name("s-btn")[3].click()
    # go to the subscriptions tab
    driver.find_element_by_id('subscriptions').click()
    # create a new subscription
    driver.find_element_by_id('subscriptions_create').click()
    # give the first customer a brand new subscription!
    clickOnGlobalList(1)
    submitForm()
    driver.find_element_by_class_name("action").click()


driver = webdriver.Chrome('/chromedriver/chromedriver')

# def deleteAllResources():
if True:
    try:
        driver.get("http://z.pyer.apsdemo.org:8080/")
        instance = "cdn"
        # assert 'Parallels® Automation' in driver.title
        login('admin', '1qazXSW@')
        services = ['VDN Embratel globals', 'VDN Embratel Management',
                    'VDN Live Channels', 'VDN Content', 'VDN Job',
                    'Content Delivery Network', 'VDN_HTTP_Traffic',
                    'VDN_HTTPS_Traffic', 'VDN_VOD_Encoding_Minutes',
                    'VDN_VOD_Storage_MbH', 'VDN_Live_Encoding_Minutes',
                    'VDN_Live_DVR_Minutes']
        createAppReference(services[0])
        except Exception as e:
            print("Error in GetAll " + str(e))

    def Paginacion(self):
        try:
            buton = driver.find_element_by_class_name("next")
            buton.click()
            return
        except:
            print("An exception occurred in Paginacion")


if "__main__" == __name__:
    PATH = r"C:\Pentaho\chromedriver.exe"
    driver = webdriver.Chrome(PATH)
    driver.implicitly_wait(2)
    Url = "https://www.nationalcrimeagency.gov.uk/most-wanted-search"
    driver.get(Url)
    table = Table(driver)
    Lista = []
    rows = []
    # Lista = table.GetAll()
    # total = len(Lista)
    for i in range(22):
        elemento = table.GetAll()
        if elemento is not None:
            if len(elemento) > 0:
                elemento[i].click()
                data = table.get_rows()
                if data is not None:
def ChromeDriverNOBrowser(self):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driverChrome = webdriver.Chrome(chrome_options=chrome_options)
    return driverChrome
from selenium import webdriver

driver = webdriver.Chrome()
def setUp(self):
    self.driver = webdriver.Chrome()
    self.driver.maximize_window()
def browser(request):
    wd = webdriver.Chrome()
    request.addfinalizer(wd.quit)
    return wd
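# A minimal usage sketch, assuming `browser` above is registered as a pytest
# fixture (e.g. decorated with @pytest.fixture in conftest.py, which the
# snippet does not show). The URL and title assertion are hypothetical
# placeholders, not from the original suite; the finalizer quits the browser
# automatically after the test.
def test_homepage_title(browser):
    browser.get("https://example.com/")  # placeholder URL
    assert "Example" in browser.title    # placeholder assertion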
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome()
driver.get("http://localhost/progetto/")
driver.maximize_window()
time.sleep(2)
driver.find_element_by_name("nome").send_keys("admin")
driver.find_element_by_name("password").send_keys("admin")
driver.find_element_by_name("login").click()
time.sleep(2)
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
time.sleep(2)

################################
# insert an animal species
driver.find_element_by_link_text("Amministrazione").click()
time.sleep(2)
driver.find_element_by_xpath(
    '//*[@id="navbarSupportedContent"]/ul/li[9]/div/a[1]').click()
time.sleep(2)
driver.find_element_by_name("Classe").send_keys("REPTILIA")
driver.find_element_by_name("NomeLatino").send_keys("Nuova Viperw")
driver.find_element_by_name("NomeItaliano").send_keys("Vipera Velenosa")
driver.find_element_by_name("AnnoClassificazione").send_keys("1999")
def setup_method(self, method):
    self.driver = webdriver.Chrome()
    self.vars = {}
def selenium_test_run(self, url):
    driver = webdriver.Chrome(r"C:\chromedriver4.exe")
    driver.get(url)
    driver.quit()
from selenium import webdriver

browser = "ie"
if browser == "chrome":
    driver = webdriver.Chrome(
        executable_path="C:/Users/Heggade/PycharmProjects/s_class/drivers/chromedriver.exe")
elif browser == "firefox":
    driver = webdriver.Firefox(
        executable_path="C:/Users/Heggade/PycharmProjects/s_class/drivers/geckodriver.exe")
elif browser == "ie":
    driver = webdriver.Ie(
        executable_path="C:/Users/Heggade/PycharmProjects/s_class/drivers/IEDriverServer.exe")
else:
    print("provide an appropriate browser name")

driver.get("http://facebook.com")
driver.maximize_window()
driver.find_element_by_id("email").send_keys("test")
driver.find_element_by_id("pass").send_keys("pass")
driver.find_element_by_id("u_0_2").click()
driver.get
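# An alternative sketch (not from the original): the same browser selection
# as a table lookup, reusing the driver paths above. The lambdas defer
# construction so only the selected browser is actually started.
from selenium import webdriver

driver_factories = {
    "chrome": lambda: webdriver.Chrome(
        executable_path="C:/Users/Heggade/PycharmProjects/s_class/drivers/chromedriver.exe"),
    "firefox": lambda: webdriver.Firefox(
        executable_path="C:/Users/Heggade/PycharmProjects/s_class/drivers/geckodriver.exe"),
    "ie": lambda: webdriver.Ie(
        executable_path="C:/Users/Heggade/PycharmProjects/s_class/drivers/IEDriverServer.exe"),
}

try:
    # `browser` is the selection string from the snippet above
    driver = driver_factories[browser]()
except KeyError:
    print("provide an appropriate browser name")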
import json
from pprint import pprint

from selenium import webdriver


def send(driver, cmd, params):
    # Signature and endpoint reconstructed from the call sites below:
    # chromedriver's send_command_and_get_result endpoint relays a raw
    # Chrome DevTools Protocol command.
    url = (driver.command_executor._url +
           "/session/%s/chromium/send_command_and_get_result" % driver.session_id)
    body = json.dumps({'cmd': cmd, 'params': params})
    response = driver.command_executor._request('POST', url, body)
    if response['status']:
        raise Exception(response.get('value'))
    return response.get('value')


def add_script(driver, script):
    '''Execute JS before each page loads.'''
    send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})


# Attach a method named add_script to webdriver.Chrome
webdriver.Chrome.add_script = add_script
# This (webdriver.Chrome) may need changing when a different driver is used

# *************** anti-detection spoofing ###################
browser = webdriver.Chrome(
    executable_path=driver_path,
    chrome_options=options
)

# ################## debugging aid *********************
existed = {
    'executor_url': browser.command_executor._url,  # address for remote connections to this browser
    'session_id': browser.session_id  # browser session ID
}
pprint(existed)
with open('existed.json', 'wt', encoding='utf-8') as f:
    json.dump(existed, f, ensure_ascii=False, indent=4)
# ********************* debugging aid ##################

# ############### anti-detection spoofing ***************************
browser.add_script("""
Object.defineProperty(navigator, 'webdriver', {
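# The existed.json written above (executor URL plus session id) is enough to
# reattach to the same browser from another process. A minimal sketch of that
# reattachment under Selenium 3; overriding session_id after construction is
# a well-known workaround, not an official API, and the Remote() call itself
# briefly opens a throwaway session of its own.
import json
from selenium import webdriver

with open('existed.json', encoding='utf-8') as f:
    existed = json.load(f)

attached = webdriver.Remote(command_executor=existed['executor_url'],
                            desired_capabilities={})
attached.session_id = existed['session_id']  # point at the original session
print(attached.current_url)  # now drives the original browser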
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import io  # save the abstracts

driver = webdriver.Chrome("C:/Users/Supreme Ruler/Desktop/chromedriver.exe")


def goto_page(url):
    """Navigate to the given url; url must be a string."""
    driver.get(url)


def enter_query(query):
    """Enter the wanted query into the PubMed search bar and press RETURN."""
    text_bar = driver.find_element_by_id("term")
    text_bar.send_keys(query)
    text_bar.send_keys(Keys.RETURN)


def get_href():
    """Iterate through all results, adding href and title to a list; return that list."""
    href_list = []
    try:
        total_pages = driver.find_element_by_name(
            "EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.cPage"
        )
        page_end = total_pages.get_attribute("last")
    except:
        page_end = 1
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import json, time, random

with open('data/config.json') as config_file:
    data = json.load(config_file)

url_login = r'https://www.linkedin.com/login'
url_job_search = r'https://www.linkedin.com/jobs/search/?currentJobId=2514734378&f_AL=true&f_E=2&f_WRA=true&geoId=103644278&keywords=software%20engineer&location=United%20States'

# renamed from `webdriver`, which shadowed the imported module
driver = webdriver.Chrome(data['driver_path'])
driver.get(url_login)

username_element = driver.find_element_by_css_selector('#username')
username_element.clear()
username_element.send_keys(data['email'])
password_element = driver.find_element_by_css_selector('#password')
password_element.clear()
password_element.send_keys(data['password'])
password_element.send_keys(Keys.RETURN)

driver.get(url_job_search)
random_time = random.uniform(3.5, 4.9)
time.sleep(random_time)

amount_of_results = driver.find_element_by_css_selector(
    'body > div.application-outlet > div.authentication-outlet > div.job-search-ext > div > div > section.jobs-search__left-rail > div > header > div.jobs-search-results-list__title-heading > small')
print(f'amount_of_results: {amount_of_results.text}')
import os
import json
import traceback
from tkinter import filedialog
from tkinter import *
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument("--user-data-dir=/Users/Vanze/Library/Application Support/Google/Chrome/Profile 6")
browser = webdriver.Chrome('./chromedriver', options=options)
delay = 30

# Google Sheets stuff
# You should change these to match your own spreadsheet
if os.path.exists('gsheet_id.txt'):
    with open('gsheet_id.txt', 'r') as file:
        json_repr = file.readline()
        data = json.loads(json_repr)
        GSHEET_ID = data["GSHEET_ID"]
        RANGE_NAME = data["RANGE_NAME"]
else:
    GSHEET_ID = '10PmGsjxMXvIMDIig1QiS-YVYxqOClZEvEu8B9Z69MeA'
    RANGE_NAME = 'Transactions!A:I'
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import sys
import time

# fo = open('111.log', 'w')
# sys.stdout = fo

browser = webdriver.Chrome()
# browser = webdriver.PhantomJS(service_args=['--ssl-protocol=any'])  # PhantomJS
browser.get('https://rate.taobao.com/user-rate-95cf5ce4398a3bd471b08cbe9bf6e0fb.htm?spm=2013.1.1000126.10.68ab26c2ws4Vvj')
time.sleep(5)

describe_score = browser.find_element_by_xpath('//*[@id="dsr"]/li[1]/div[1]/em[1]')
print('describe_score:', describe_score.text)
attitude_score = browser.find_element_by_xpath('//*[@id="dsr"]/li[2]/div[1]/em[1]')
print('attitude_score:', attitude_score.text)
logistics_score = browser.find_element_by_xpath('//*[@id="dsr"]/li[3]/div[1]/em[1]')
print('logistics_score:', logistics_score.text)
store_rating = browser.find_element_by_xpath('//*[@id="dsr"]/li[2]/div[2]/div/div[1]/em')
print('store_rating:', store_rating.text)
browser.quit()
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(
    executable_path=r'C:\Users\Akumar4\Downloads\chromedriver_win32\chromedriver.exe')
driver.get('https://www.expedia.com/')
driver.maximize_window()

try:
    # path = '//*[@id="reasons-to-believe-banner"]/li[1]/span[2]'
    wait = WebDriverWait(driver, 10)
    # ele = wait.until(EC.presence_of_element_located((By.XPATH, path)))
    # ele = wait.until(EC.title_is('Expedia Travel: Search Hotels, Cheap Flights, Car Rentals & Vacations'))
    ele = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="header-account-menu"]')))
finally:
    driver.quit()