def virgin_atlantic(origin, dest, searchdate, returndate, searchkey, returnkey): driver = webdriver.PhantomJS(service_args=[ '--ignore-ssl-errors=true', '--ssl-protocol=any', '--load-images=false' ]) driver.set_window_size(1120, 1080) # sys.stdout=codecs.getwriter('utf-8')(sys.stdout) currentdatetime = datetime.datetime.now() stime = currentdatetime.strftime('%Y-%m-%d %H:%M:%S') # try: if not DEV_LOCAL: db = customfunction.dbconnection() cursor = db.cursor() dt = datetime.datetime.strptime(searchdate.strip(), '%m/%d/%Y') date = dt.strftime('%d/%m/%Y') if returndate != 'None' and returndate: dt1 = datetime.datetime.strptime(returndate, '%m/%d/%Y') retdate = dt1.strftime('%d/%m/%Y') if returndate != 'None' and returndate: url = "http://www.virgin-atlantic.com/us/en/book-your-travel/book-your-flight/flight-search-results.html?departure=" + origin + "&arrival=" + dest + "&adult=1&departureDate=" + str( date ) + "&search_type=redeemMiles&classType=10&classTypeReturn=10&bookingPanelLocation=Undefined&isreturn=yes&returnDate=" + str( retdate) else: url = "http://www.virgin-atlantic.com/us/en/book-your-travel/book-your-flight/flight-search-results.html?departure=" + origin + "&arrival=" + dest + "&adult=1&departureDate=" + str( date ) + "&search_type=redeemMiles&classType=10&classTypeReturn=10&bookingPanelLocation=BookYourFlight&isreturn=no" print url, '@@@@@@@@@@@@@22' driver.get(url) time.sleep(2) html_page = driver.page_source soup = BeautifulSoup(html_page, "lxml") # except: # print "before data page" # print "searchkey",searchkey # print "returnkey",returnkey # if not DEV_LOCAL: # storeFlag(searchkey,stime) # if returnkey: # storeFlag(returnkey,stime) # print "return key stored" # driver.quit() # return def virgindata(tbody, keyid): recordcount = 1 value_string = [] try: if tbody.findAll("tr", {"class": "directRoute "}): trbody = tbody.findAll("tr", {"class": "directRoute "}) else: if tbody.findAll("tr", {"class": "indirectRoute "}): trbody = tbody.findAll("tr", {"class": "indirectRoute "}) except: return keyid for row in trbody: econo = 0 econotax = 0 business = 0 busstax = 0 first = 0 firsttax = 0 stp = '' lyover = '' details = row.find("td", {"class": "flightSearchDetails"}) # error economy = '' #============= price block ================================================================ if row.find("td", {"class": "cellOption economy hasLowestCostMessage"}): economy = row.find( "td", {"class": "cellOption economy hasLowestCostMessage"}) if economy == '' and row.find("td", {"class": "cellOption economy "}): economy = row.find("td", {"class": "cellOption economy "}) if economy: "--------------economy--------------------------------" economy_price = economy.find("span", {"class": "price"}) econprice1 = economy_price.text econprice = re.findall("\d+.\d+", econprice1) if len(econprice) > 0: econo = econprice[0] if ',' in econo: econo = econo.replace(',', '') if len(econprice) > 1: if "USD" not in econprice1: cprice = 0 if ',' in econprice[1]: cprice = econprice[1].replace(',', '') else: cprice = econprice[1] currency_symbol = (re.findall("[a-zA-Z]+", econprice1)) currencychange = urllib.urlopen( "https://www.exchangerate-api.com/%s/%s/%f?k=e002a7b64cabe2535b57f764" % (currency_symbol[1], "USD", float(cprice))) chaged_result = currencychange.read() econotax = chaged_result else: econotax = econprice[1] pre_economy = '' if row.find("td", {"class": "cellOption premEconomy "}): pre_economy = row.find("td", {"class": "cellOption premEconomy "}) if pre_economy == '' and row.find( "td", {"class": 
"cellOption premEconomy hasLowestCostMessage"}): pre_economy = row.find( "td", {"class": "cellOption premEconomy hasLowestCostMessage"}) if pre_economy: "--------------pre economy--------------------------------" pre_economy_price = pre_economy.find("span", {"class": "price"}) pre_economy = pre_economy_price.text #print pre_economy pre_econo_price = re.findall("\d+.\d+", pre_economy) if len(pre_econo_price) > 0: business = pre_econo_price[0] if ',' in business: business = business.replace(',', '') if len(pre_econo_price) > 1: if "USD" not in pre_economy: eprice = 0 if ',' in pre_econo_price[1]: eprice = pre_econo_price[1].replace(',', '') else: eprice = pre_econo_price[1] currency_symbol = (re.findall("[a-zA-Z]+", pre_economy)) currencychange = urllib.urlopen( "https://www.exchangerate-api.com/%s/%s/%f?k=e002a7b64cabe2535b57f764" % (currency_symbol[1], "USD", float(eprice))) chaged_result = currencychange.read() busstax = chaged_result else: busstax = pre_econo_price[1] #print "pre_econotax",busstax upper_class = '' if row.find("td", {"class": "cellOption upperclass last"}): "--------------upper class--------------------------------" upper_class = row.find( "td", {"class": "cellOption upperclass last"}) else: if row.find("td", { "class": "cellOption upperclass last hasLowestCostMessage" }): upper_class = row.find( "td", { "class": "cellOption upperclass last hasLowestCostMessage" }) if upper_class: upper_class_price = upper_class.find("span", {"class": "price"}) upperclass_price = upper_class_price.text upperprice = re.findall("\d+.\d+", upperclass_price) if len(upperprice) > 0: first = upperprice[0] if ',' in first: first = first.replace(',', '') if len(upperprice) > 1: if "USD" not in upperclass_price: uprice = 0 if ',' in upperprice[1]: uprice = upperprice[1].replace(',', '') else: uprice = upperprice[1] currency_symbol = (re.findall("[a-zA-Z]+", upperclass_price)) currencychange = urllib.urlopen( "https://www.exchangerate-api.com/%s/%s/%f?k=e002a7b64cabe2535b57f764" % (currency_symbol[1], "USD", float(uprice))) chaged_result = currencychange.read() firsttax = chaged_result else: firsttax = upperprice[1] #============================= end price block ========================================================= sourcestn = '' destinationstn = '' depttime = '' arivaltime = '' total_duration = '' heading = details.find("ul") depart = heading.find("li", {"class": "depart"}) departinfo = depart.findAll("p") if len(departinfo) > 0: depttime = departinfo[0].text departfrom1 = departinfo[1].text if 'from' in departfrom1: departfrom = (departfrom1.replace('from', '')).strip() if '(' in departfrom: departfrom1 = departfrom.split('(') sourcestn = departfrom1[1].replace(')', '') arive = heading.find("li", {"class": "arrive"}) ariveinfo = arive.findAll("p") if len(ariveinfo) > 0: arivaltime = ariveinfo[0].text if '+' in arivaltime: arivaltimesplit = arivaltime.split('+') arivaltime = arivaltimesplit[0] ariveat1 = ariveinfo[1].text if 'at' in ariveat1: ariveat = (ariveat1.replace('at', '')).strip() if '(' in ariveat: ariveat2 = ariveat.split('(') destinationstn = ariveat2[1].replace(')', '') stop = heading.find("li", {"class": "stops"}) durations = heading.find("li", {"class": "duration"}) stoppage = stop.text if '0' in stoppage: stp = "NONSTOP" elif '1' in stoppage: stp = "1 STOP" elif '2' in stoppage: stp = "2 STOPS" else: if '3' in stoppage: stp = "3 STOPS" total_duration = (durations.text).strip() if 'Duration' in total_duration: total_duration = (total_duration.replace('Duration', '')).strip() ''' 
#print "total_duration",total_duration operator = details.find("dl",{"class":"operator"}) operatedby = (operator.find("dd").text).strip() print "operatedby",operatedby ''' #===============================details block==================================================== details_block = details.find("div", {"class": "tooltip"}) details_tr = details_block.findAll("tr") counter = 0 departdlist = [] arivelist = [] planelist = [] operatedby = [] departdetails = '' arivedetails = '' planedetails = '' operatedbytext = '' while (counter < len(details_tr)): #print "counter",counter from_to = details_tr[counter].find("td", {"class": "flightDetails"}) operator = from_to.find("span", {"class": "operator"}).text operatedby.append(operator) #print "operator",operator from_to1 = from_to.find("span", {"class": "flightFromTo"}).text departing_from = '' ariving_at = '' departing_date = '' detaildetptime = '' detailarivetime = '' deptextraday = '' ariveextraday = '' if 'to' in from_to1: from_to1 = from_to1.split('to') departing_from = from_to1[0] if '\n' in departing_from: departing_from1 = departing_from.split("\n") departing_from = departing_from1[0].strip( ) + " " + departing_from1[1].strip() #print "departing_from",departing_from ariving_at = from_to1[1] if '\n' in ariving_at: ariving_at1 = ariving_at.split("\n") ariving_at = ariving_at1[0].strip( ) + " " + ariving_at1[1].strip() #print "ariving_at",ariving_at departing_date = from_to.find("span", { "class": "fullDate" }).text if 'Departing' in departing_date: departing_date = (departing_date.replace('Departing', '')).strip() counter = counter + 1 departtime = details_tr[counter].find("td", {"class": "departs"}) fl_dept_time = departtime.find("span", {"class": "flightDeparts"}) detaildetptime = fl_dept_time.text if departtime.find("span", {"class": "extraDays"}): extradeptdate = departtime.find("span", {"class": "extraDays"}) deptextraday = extradeptdate.text nod = re.findall("\d+.\d+", deptextraday) #print "nod",nod if "+1" in deptextraday: deptextraday = "+1 day" elif "+2" in deptextraday: deptextraday = "+2 day" else: if "+3" in deptextraday: deptextraday = "+3 day" arivetime = details_tr[counter].find("td", {"class": "arrives"}) fl_arive_time = arivetime.find("span", {"class": "flightArrives"}) detailarivetime = fl_arive_time.text if arivetime.find("span", {"class": "extraDays"}): extra_ariveday = arivetime.find("span", {"class": "extraDays"}) ariveextraday = extra_ariveday.text duration = details_tr[counter].find("td", {"class": "duration"}) fl_duration1 = duration.find("span", {"class": "flightDuration"}) fl_duration = (fl_duration1.text).strip() fl_flightno = '' planeno = '' flight_no = details_tr[1].find("td", {"class": "number"}) fl_flightno1 = flight_no.find("span", {"class": "flightNumber"}) planeno = (''.join(fl_flightno1.find('br').next_siblings)) fl_flightno = (fl_flightno1.text).replace(planeno, '') departinfo_time = departing_date + " " + detaildetptime departinfo_time = datetime.datetime.strptime( departinfo_time, '%A %d %B %Y %H:%M') departinfo_time = departinfo_time.strftime('%Y/%m/%d %H:%M') airport_ = customfunction.get_airport_detail( get_airport_code(departing_from)) or departing_from deptdetail = departinfo_time + " | from " + airport_ departdlist.append(deptdetail) departinfo_time = departing_date + " " + detailarivetime departinfo_time = datetime.datetime.strptime( departinfo_time, '%A %d %B %Y %H:%M') departinfo_time = departinfo_time.strftime('%Y/%m/%d %H:%M') airport_ = customfunction.get_airport_detail( 
get_airport_code(ariving_at)) or ariving_at arivedetail = departinfo_time + " | at " + airport_ arivelist.append(arivedetail) planetext = fl_flightno + " | " + planeno + " (" + fl_duration + ")" planelist.append(planetext) counter = counter + 1 departdetails = '@'.join(departdlist) arivedetails = '@'.join(arivelist) planedetails = ('@'.join(planelist)).strip() operatedbytext = '@'.join(operatedby) value_string.append( (fl_flightno, str(keyid), stime, stp, lyover, sourcestn, destinationstn, depttime, arivaltime, total_duration, str(econo), str(econotax), str(business), str(busstax), str(first), str(firsttax), "Economy", "Business", "First", "virgin_atlantic", departdetails, arivedetails, planedetails, operatedbytext)) recordcount = recordcount + 1 if recordcount > 50: cursor.executemany( "INSERT INTO pexproject_flightdata (flighno,searchkeyid,scrapetime,stoppage,stoppage_station,origin,destination,departure,arival,duration,maincabin,maintax,firstclass,firsttax,business,businesstax,cabintype1,cabintype2,cabintype3,datasource,departdetails,arivedetails,planedetails,operatedby) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);", value_string) db.commit() value_string = [] recordcount = 1 if len(value_string) > 0: if not DEV_LOCAL: cursor.executemany( "INSERT INTO pexproject_flightdata (flighno,searchkeyid,scrapetime,stoppage,stoppage_station,origin,destination,departure,arival,duration,maincabin,maintax,firstclass,firsttax,business,businesstax,cabintype1,cabintype2,cabintype3,datasource,departdetails,arivedetails,planedetails,operatedby) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);", value_string) db.commit() else: print value_string #driver.quit() tbody = soup.findAll("tbody", {"class": "flightStatusTbody"}) if searchkey: if len(tbody) > 0: virgindata(tbody[0], searchkey) if not DEV_LOCAL: storeFlag(searchkey, stime) if returnkey: if len(tbody) > 1: virgindata(tbody[1], returnkey) if not DEV_LOCAL: storeFlag(returnkey, stime) driver.quit() return searchkey
def __init__(self):
    ''' !!! FOR WINDOWS USERS '''
    # self.driver = webdriver.PhantomJS(executable_path="c:/phantomjs-2.1.1-windows/bin/phantomjs.exe")
    self.driver = webdriver.PhantomJS(
        executable_path="C:/Users/flipp/phantomjs-2.1.1-windows/bin/phantomjs.exe")
    u'\u5eb7\u5b9a': u'KGT',
    u'\u53f0\u4e2d': u'RMQ'
}

cityList = [u'北京', u'广州']
dateList = ['2018-04-20', '2018-04-21']

for dept_city in cityList:
    for arv_city in cityList:
        if dept_city != arv_city:
            dept_city_code = cityToCodeList[dept_city]
            arv_city_code = cityToCodeList[arv_city]
            for date in dateList:
                url = ('https://m.ctrip.com/html5/flight/swift/domestic/' +
                       dept_city_code + '/' + arv_city_code + '/' + date)
                # url = 'https://m.ctrip.com/html5/flight/swift/index'
                driver = webdriver.PhantomJS()
                driver.maximize_window()
                driver.implicitly_wait(2)
                print('Waiting...')
                driver.get(url)
                print('Waiting...')
                execute(10, driver, 3)
                flight = driver.find_elements_by_css_selector(
                    "div[id^=flight_]")
                name = []
                remote_cookies = driver.get_cookies()
                local_cookies = {}
                for each in remote_cookies:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
import requests as rq
import re
import csv

# Using PhantomJS to render the webpage
driver = webdriver.PhantomJS(
    executable_path=
    "/Users/rosegaray/Desktop/phantomjs-2.1.1-macosx/bin/phantomjs")
driver.set_window_size(1920, 1080)
driver.get("http://salaryguide.diamondbacklab.com/#/salGuide?year=2017")
# 2017 is 1029
# 2016 is 1010
# 2015 is 1022
# 2014 is 1251
# 2013 is 1210

# Wait for site to load
wait = WebDriverWait(driver, 20)

# Stores each page's data
data = {}

for page in range(1029):
    # Grab "tbody" tag
def check_flights():
    URL = "https://www.google.com/flights/explore/#explore;f=JFK,EWR,LGA;t=HND,NRT,TPE,HKG,KIX;s=1;li=8;lx=12;d=2018-04-01"

    driver = webdriver.PhantomJS()
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"
    )
    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                 executable_path="/usr/local/bin/phantomjs")
    driver.implicitly_wait(20)
    driver.get(URL)
    wait = WebDriverWait(driver, 20)
    wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "div.CTPFVNB-w-e")))

    s = BeautifulSoup(driver.page_source, "lxml")
    best_price_tags = s.findAll('div', 'CTPFVNB-w-e')

    # check if scrape worked - alert if it fails and shutdown
    if len(best_price_tags) < 4:
        print('Failed to Load Page Data')
        requests.post(
            'https://maker.ifttt.com/trigger/fare_alert/with/key/API_TOKEN',
            data={
                "value1": "script",
                "value2": "failed",
                "value3": ""
            })
        sys.exit(0)
    else:
        print('Successfully Loaded Page Data')

    best_prices = []
    for tag in best_price_tags:
        best_prices.append(int(tag.text.replace('$', '')))
    best_price = best_prices[0]

    best_height_tags = s.findAll('div', 'CTPFVNB-w-f')
    best_heights = []
    for t in best_height_tags:
        best_heights.append(
            float(t.attrs['style'].split('height:')[1].replace('px;', '')))
    best_height = best_heights[0]

    # price per pixel of height
    pph = np.array(best_price) / np.array(best_height)

    cities = s.findAll('div', 'CTPFVNB-w-o')
    hlist = []
    for bar in cities[0].findAll('div', 'CTPFVNB-w-x'):
        hlist.append(
            float(bar['style'].split('height: ')[1].replace('px;', '')) * pph)
    fares = pd.DataFrame(hlist, columns=['price'])

    px = [x for x in fares['price']]
    ff = pd.DataFrame(px, columns=['fare']).reset_index()

    # begin the clustering
    X = StandardScaler().fit_transform(ff)
    db = DBSCAN(eps=1.5, min_samples=1).fit(X)
    labels = db.labels_
    clusters = len(set(labels))
    pf = pd.concat([ff, pd.DataFrame(db.labels_, columns=['cluster'])], axis=1)
    # rf has 'min' and 'count' columns after the aggregation below
    rf = pf.groupby('cluster')['fare'].agg(['min', 'count'
                                            ]).sort_values('min',
                                                           ascending=True)

    # set up our rules
    # must have more than one cluster
    # cluster min must be equal to lowest price fare
    # cluster size must be less than 10th percentile
    # cluster must be $100 less the next lowest-priced cluster
    if clusters > 1 and ff['fare'].min() == rf.iloc[0]['min']\
            and rf.iloc[0]['count'] < rf['count'].quantile(.10)\
            and rf.iloc[0]['min'] + 100 < rf.iloc[1]['min']:
        city = s.find('span', 'CTPFVNB-v-c').text
        fare = s.find('div', 'CTPFVNB-w-e').text
        r = requests.post(
            'https://maker.ifttt.com/trigger/fare_alert/with/key/API_TOKEN',
            data={
                "value1": city,
                "value2": fare,
                "value3": ""
            })
    else:
        print('no alert triggered')
def run_crawler(url, inst, user, pwd):
    # Initialize Webdriver and Input Login Information
    print("Initializing crawler...")
    browser = webdriver.PhantomJS()
    browser.get(url)
    institution = Select(browser.find_element_by_name("inst"))
    username = browser.find_element_by_name("instkey")
    password = browser.find_element_by_name("perskey")
    submit = browser.find_element_by_xpath("//input[@type='button']")
    sleep(randint(2, 10))
    institution.select_by_value(str(inst))
    username.send_keys(str(user))
    password.send_keys(str(pwd))
    submit.click()
    print("Login complete.")

    # Administration Page
    sleep(randint(2, 10))
    administration = browser.find_element_by_xpath(
        "//input[@value='Administration']")
    administration.click()
    print("Step 1 complete.")

    # Next Page
    sleep(randint(2, 10))
    database = browser.find_element_by_xpath(
        "//input[@value = 'Database Search']")
    database.click()
    print("Step 2 complete.")

    # Database Page
    sleep(randint(2, 10))
    search = browser.find_element_by_name("sf_aq")
    submit_query = browser.find_element_by_xpath(
        "//input[@value='Submit Search']")
    search.send_keys(query.run_query(query.query_dict))
    submit_query.click()
    print("Input complete.")

    # Get Xpath
    get_xpath = browser.page_source
    get_xpath = BeautifulSoup(get_xpath, 'lxml')
    get_xpath = get_xpath.find_all(
        "a", attrs={"href": re.compile("javascript:subViewResult")})
    xpath_list = []
    for item in get_xpath:
        xpath_list.append(item.get_text())
    xpath_no = xpath_list[xpath_list.index('NEXT>') - 1]
    print("Found Xpath Key.")

    # Get CT Evaluation
    show_results = browser.find_element_by_xpath(
        '//*[@id="pagedone"]/p[2]/table[1]/tbody/tr[3]/td/center/a[' +
        xpath_no + ']')
    show_results.click()
    html = browser.page_source
    get_data = BeautifulSoup(html, 'lxml')
    table = get_data.find("div", attrs={'id': 'pagedone'})
    table = table.find("tbody")
    text_list = []
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [element.text.strip() for element in cols]
        text_list.append([element for element in cols if element])

    # Get rid of empty values
    text_list = [item for item in text_list if len(item) > 1]
    text_list = text_list[2:]
    text_list = [[word.replace('-', '') for word in item]
                 for item in text_list]
    text_list = [item for item in text_list if len(item) > 2]
    headers = [
        "Study ID", "SR", "Last Name", "First Name", "MRN", "Intake", "IR",
        "CT Evaluation", "CR"
    ]
    df = pd.DataFrame(text_list, columns=headers)
    df.drop(["SR", "IR", "CR"], axis=1, inplace=True)
    df[df == ''] = np.NaN
    df = df.fillna(method='ffill')
    df["Intake"] = pd.to_datetime(df["Intake"])
    df["CT Evaluation"] = pd.to_datetime(df["CT Evaluation"])
    df = df.sort_values(by="CT Evaluation", ascending=False)
    df.to_csv("update.csv")
    print("CT Evaluations scraped.")

    # Download Data
    sleep(randint(2, 10))
    download = browser.find_element_by_xpath("//input[@value='Download Data']")
    download.click()
    print("Download complete.")

    # Close Crawler
    sleep(randint(2, 10))
    if os.path.exists(credentials.path_cwd + "/update.csv"):
        browser.quit()
        print("Crawling complete.")
    return
class FlightCard:
    def __init__(self, airline, departure_time, arrival_time):
        self.airline = airline
        self.departure_time = departure_time
        self.arrival_time = arrival_time


def noti_slack(message):
    token = WHALE_BOT_TOKEN
    slack = Slacker(token)
    slack.chat.post_message(CHANNEL, message)


# Choose the driver
# driver = webdriver.Chrome('/Users/deplax/Downloads/chromedriver')
driver = webdriver.PhantomJS('/Users/whale/Downloads/phantomjs')

# Driver settings
driver.set_page_load_timeout(30)

# Fetch the page
driver.get(
    'https://store.naver.com/flights/results/domestic?trip=OW&scity1=CJU&ecity1=GMP&sdate1=2018.10.19.&adult=1&child=0&infant=0&fareType=YC&airlineCode=&nxQuery=항공권'
)
time.sleep(10)

for x in range(15):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)

contents = driver.page_source.encode('utf-8')
#!/usr/bin/env python3
# coding:utf-8

# ********** Required modules ************
# pip install selenium
# Download PhantomJS: http://phantomjs.org/download.html
# ****************************************
from selenium import webdriver
from selenium.webdriver.common.by import By
import json
import time

# Scrape Shuangseqiu (double-color-ball lottery) draw results
driver = webdriver.PhantomJS(
    executable_path='D:/Programs/phantomjs/bin/phantomjs')

if __name__ == '__main__':
    urls = []
    items = []
    # Start page
    start_url = "http://kaijiang.500.com/shtml/ssq/03001.shtml"
    driver.get(start_url)
    list = driver.find_elements_by_css_selector(
        'body > div.wrap > div.kj_main01 > div.kj_main01_right > div.kjxq_box02 > div.kjxq_box02_title > div.kjxq_box02_title_right > span > div > a'
    )
    for a in list:
        url = a.get_attribute("href")
        urls.append(url)
        print(url)
    print("urls loaded...")
from selenium import webdriver

driver = webdriver.PhantomJS(executable_path='/opt/phantomjs/bin/phantomjs')
driver.get('http://pythonscraping.com')
driver.implicitly_wait(1)
print(driver.get_cookies())

savedCookies = driver.get_cookies()

driver2 = webdriver.PhantomJS(executable_path='/opt/phantomjs/bin/phantomjs')
driver2.get('http://pythonscraping.com')
driver2.delete_all_cookies()
for cookie in savedCookies:
    driver2.add_cookie(cookie)

driver2.get('http://pythonscraping.com')
driver2.implicitly_wait(1)
print(driver2.get_cookies())
print(savedCookies == driver2.get_cookies())
def get_phantomjs_path(phantomjs_path=None):
    # if phantomjs_path is provided, use it as PATH
    if phantomjs_path:
        return phantomjs_path

    # Download PhantomJS binary if it does not exist
    def download_phantomjs(filename):
        # Download PhantomJS Binary
        file_path = os.path.join(TMP_DIR, filename)
        if not os.path.exists(file_path):
            print("::Download PhantomJS::")
            response = requests.get(phantomjs_url, stream=True)
            f = open(file_path, "wb+")
            for chunk in tqdm(response.iter_content(chunk_size=1024)):
                if chunk:
                    f.write(chunk)
            f.close()
            print("::Download Finish::")
        return file_path

    if not os.path.exists(os.path.join(TMP_DIR, 'tmp')):
        os.makedirs(os.path.join(TMP_DIR, 'tmp'))

    try:
        # Check 'phantomjs' in Executable PATH
        webdriver.PhantomJS()
        return 'phantomjs'
    except WebDriverException as e:
        # No 'phantomjs' in PATH
        if 'PATH' not in str(e):
            raise e

        os_name = platform.system()
        if os_name.lower() == 'windows':
            print("::OS Detected - Windows::")
            phantomjs_url = 'https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-windows.zip'
            filename = 'phantomjs-2.1.1-windows.zip'
            file_path = download_phantomjs(filename)
        elif os_name.lower() == 'linux':
            print("::OS Detected - Linux::")
            phantomjs_url = 'https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2'
            filename = 'phantomjs-2.1.1-linux-x86_64.tar.bz2'
            file_path = download_phantomjs(filename)
        elif os_name.lower() == 'darwin':
            print("::OS Detected - macOS::")
            phantomjs_url = 'https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-macosx.zip'
            filename = 'phantomjs-2.1.1-macosx.zip'
            file_path = download_phantomjs(filename)
        else:
            raise Exception(
                'Currently, automatic phantomjs download is not supported in "{}" OS.\n'
                'You can download and add phantomjs to PATH on your own,\n'
                'Download Link: http://phantomjs.org/download.html'.format(os_name))

        if filename.endswith('zip'):
            folder_name = filename.replace('.zip', '')
            file = zipfile.ZipFile(file_path)
            try:
                file.extract(folder_name + '/bin/phantomjs', TMP_DIR)
                phantom_path = os.path.join(TMP_DIR, folder_name + '/bin/phantomjs')
                os.chmod(phantom_path, 755)  # Fix permission
                return phantom_path
            except KeyError as e:
                if 'windows' not in str(e):
                    raise e
                file.extract(folder_name + '/bin/phantomjs.exe', TMP_DIR)
                return os.path.join(TMP_DIR, folder_name + '/bin/phantomjs.exe')
        elif filename.endswith('tar.bz2'):
            folder_name = filename.replace('.tar.bz2', '')
            file = tarfile.open(file_path, 'r:bz2')
            file.extract(folder_name + '/bin/phantomjs', TMP_DIR)
            phantom_path = os.path.join(TMP_DIR, folder_name + '/bin/phantomjs')
            os.chmod(phantom_path, 755)  # Fix permission
            return phantom_path
        else:
            raise Exception('File Name is not zip or tar.bz2')
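# Hypothetical usage sketch (not part of the original module): the resolved
# path is simply handed to webdriver.PhantomJS. It assumes this script's own
# context (TMP_DIR, requests, tqdm, etc. already imported above).
if __name__ == '__main__':
    from selenium import webdriver
    path = get_phantomjs_path()  # falls back to auto-download when not in PATH
    browser = webdriver.PhantomJS(executable_path=path)
    browser.get('http://example.com')
    print(browser.title)
    browser.quit()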
# -*- coding:utf-8 -*-
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pymongo
import re
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
from lxml import etree

# A regular (headed) browser:
# browser = webdriver.Chrome()
# A headless browser:
browser = webdriver.PhantomJS(
    service_args=['--load-images=false', '--disk-cache=true'])
# Give up after 50 s without a response
wait = WebDriverWait(browser, 50)
# Set a window size (needed even for a headless browser)
browser.set_window_size(1400, 900)


def search():
    """
    Performs the search click from the home page; swap the selectors below
    to reuse it on other sites.
    :return:
    """
    # Open the page
    browser.get('https://www.jd.com/')
    try:
        # Locate the search box on the JD home page
def get_mkdir(self): jsonobj = json.loads(self.get_html().decode('utf-8')) # 列表页 - 图片 imgList = jsonpath.jsonpath(jsonobj, '$..img') # 列表页 - 价格 pricelist = jsonpath.jsonpath(jsonobj, '$..price') # 列表页 - 商品名 titleList = jsonpath.jsonpath(jsonobj, '$..title') # 列表页 - 商品id -- skuId skuIdList = jsonpath.jsonpath(jsonobj, '$..promotionInfo.skuId') # 商品价格 priceList = jsonpath.jsonpath(jsonobj, '$..price') # 商品品牌 brandList = jsonpath.jsonpath(jsonobj, '$..brandName') # 商品分类 categoryList = jsonpath.jsonpath(jsonobj, '$..thirdCatName') listdata = zip(titleList, imgList, pricelist, skuIdList, priceList, brandList, categoryList) for item in listdata: print(item) # 替换'/' import re strinfo = re.compile('/') itemdir = strinfo.sub('-', item[0]) print(itemdir) time.sleep(1) # 商品名称目录 if not os.path.exists(itemdir): os.makedirs(itemdir) else: print(itemdir + ' -- 目录已存在!') self.dataurl = '' # 存储本地主页图片链接地址 self.pimg = '' # 列表页 - 图片 # 文件夹和文件命名不能出现这9个字符:/ \ : * " < > | ? if os.path.exists(itemdir + '/' + item[1][-20:].replace( '/', '-').replace('\\', '-').replace(':', '-').replace( '*', '-').replace('"', '-').replace('<', '-').replace( '>', '-').replace('|', '-').replace('?', '-') + '.webp'): print('文件已存在!') # return 0 else: if item[1].startswith('//'): self.dataurl = "http:" + item[1] else: self.dataurl = item[1] try: req = request.Request(self.dataurl, headers=self.headers) reponse = request.urlopen(req) get_img = reponse.read() self.pimg = '/pimgs/' + itemdir + '/' + self.dataurl[ -20:].replace('/', '-').replace('\\', '-').replace( ':', '-').replace('*', '-').replace( '"', '-').replace('<', '-').replace( '>', '-').replace('|', '-').replace( '?', '-') + '.webp' with open( itemdir + '/' + self.dataurl[-20:].replace('/', '-').replace( '\\', '-').replace(':', '-').replace( '*', '-').replace('"', '-').replace( '<', '-').replace('>', '-').replace( '|', '-').replace('?', '-') + '.webp', 'wb') as fp: fp.write(get_img) except Exception as e: print(e) # 详情目录 if not os.path.exists(itemdir + '/详情'): os.makedirs(itemdir + '/详情') else: print('详情' + ' -- 目录已存在!') driver = webdriver.PhantomJS( executable_path='./phantomjs-2.1.1-macosx/bin/phantomjs') time.sleep(5) driver.get(self.detailurl + str(item[3])) time.sleep(5) driver.find_element_by_class_name('tipinfo').click() time.sleep(5) html = etree.HTML(driver.page_source) imglist = html.xpath('//img/@src') print(self.detailurl + str(item[3])) # 轮番图 lunfantu = html.xpath('//img[@class="detail-img"]/@src') # 猜你喜欢 # like = html.xpath('//img[@class="J_ItemImage recommend-img"]/@src') # 商品宣传图 xuanchuan = html.xpath( '//div[@class="J_descriptionDetail parameter"]//img/@src') # 规格 # 左边的参数名 leftspec = html.xpath( '//div[@class="left attr_key border-1px border-r border-b"]/text()' ) # 右边的参数值 rightspec = html.xpath( '//div[@class="left attr_value border-1px border-b"]/span/text()' ) spec = zip(leftspec, rightspec) # time.sleep(5) # print(driver.page_source) print(str(item[3])) print( "-------------------------- 轮播图 --------------------------------" ) print(lunfantu) print( "--------------------------- 规格 ---------------------------------" ) print(spec) print( "-------------------------- 介绍图 ---------------------------------" ) print(xuanchuan) print( "-------------------------- 主页图 ---------------------------------" ) print(self.dataurl) for simple in imglist: if not os.path.exists( itemdir + '/详情/' + simple[-20:].replace('/', '-'). replace('\\', '-').replace(':', '-').replace('*', '-'). replace('"', '-').replace('<', '-').replace('>', '-'). 
replace('|', '-').replace('?', '-') + '.webp'): request.urlretrieve( simple, itemdir + '/详情' + '/' + simple[-20:].replace('/', '-').replace( '\\', '-').replace(':', '-').replace( '*', '-').replace('"', '-').replace( '<', '-').replace('>', '-').replace( '|', '-').replace('?', '-') + ".webp") print("正在下载......") else: print('文件已存在!') # NOT # NULL # AUTO_INCREMENT, title # VARCHAR(1000), img # VARCHAR(1000), lunfanimg # VARCHAR(1000), spec # VARCHAR(1000), xcimg # VARCHAR(1000), # 插入数据库l # 判断数据库是否有skuId,有就不插入,无则插入 result = self.cur.execute( "select skuid from duodian WHERE skuid=" + str(item[3])) print(str(result) + '-----------------------') if result: print("数据库里面存在此数据") else: # 不存在,存数据 lunfantu1 = {} specpagram = {} xuanchuan1 = {} # 轮番图 for index1, item1 in enumerate(lunfantu): lunfantu1[index1] = item1 # 规格 speckey = 0 for itemspec in spec: specvalue = str(itemspec[0]) + '-' + str(itemspec[1]) specpagram[str(speckey)] = specvalue speckey += 1 # 介绍图 for index3, item3 in enumerate(xuanchuan): xuanchuan1[index3] = item3 # 存储本地图片链接地址 plunfantu = {} pxuanchuan = {} for pindex1, pitem1 in enumerate(lunfantu): plunfantu[pindex1] = '/pimgs/' + itemdir + '/详情/' + pitem1[ -20:].replace('/', '-').replace('\\', '-').replace( ':', '-').replace('*', '-').replace( '"', '-').replace('<', '-').replace( '>', '-').replace('|', '-').replace( '?', '-') + '.webp' for pindex2, pitem2 in enumerate(xuanchuan): pxuanchuan[ pindex2] = '/pimgs/' + itemdir + '/详情/' + pitem2[ -20:].replace('/', '-').replace('\\', '-').replace( ':', '-').replace('*', '-').replace( '"', '-').replace('<', '-').replace( '>', '-').replace('|', '-').replace( '?', '-') + '.webp' self.cur.execute( 'INSERT INTO ' + self.tablename + ' (title, img, lunfanimg, spec, xcimg,skuid,pimg, plunfanimg, pxcimg,categoryid,price,brandname,categoryname) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s,%s, %s, %s,%s)', (itemdir, self.dataurl, json.dumps(lunfantu1, ensure_ascii=False), json.dumps(specpagram, ensure_ascii=False), json.dumps(xuanchuan1, ensure_ascii=False), str(item[3]), self.pimg, json.dumps(plunfantu, ensure_ascii=False), json.dumps(pxuanchuan, ensure_ascii=False), '11386', '%.2f' % (item[4] / 100), str(item[5]), str(item[6]))) self.cur.connection.commit() print( "------------------------ 插入成功 ----------------------------------" )
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
import time

url = "http://www.indiavotes.com/pc/info?eid=16&state=0"
driver = webdriver.PhantomJS(
    r"G:\study\software\phantomjs-2.1.1-windows\phantomjs-2.1.1-windows\bin\phantomjs.exe"
)
driver.get(url)

# This will get the initial html - before javascript
html1 = driver.page_source
soup = BeautifulSoup(html1)
a_tag = soup.find_all('a')
c = soup.find_all('td', attrs={'class': 'tal sorting_1'})
c = c[:-13]
c
#z=str(c[1].find_all('a')[0])
#z
#z=z[9:55]
#z.replace("'",'')
#for i in c:
#    z=''
#    z=str(i.find_all('a')[0])
#    z=z[9:55]
#    z=z.replace('"','')
#    print(z)
async def getBukkensFromYamlInPage(yaml, pageUrl): # 物件情報のDBIOインスタンスを作成 dbio = dbiomaker() # webサイトから取得した物件リストを格納 bukkens = [] #開発環境と本番環境でPhantomJSの呼び出し方が異なるため、ホスト名で振り分け if os.uname()[1] == "kira-no-MacBook-Air.local": driver = webdriver.PhantomJS( executable_path='/Applications/phantomjs-1.9.2-macosx/bin/phantomjs' ) else: driver = webdriver.PhantomJS() # 新規タブをあけるキー操作を設定 newtab = Keys.CONTROL + 't' # Mac かどうかの判定、キーがMac だと違う if sys.platform == 'darwin': newtab = Keys.COMMAND + 't' #webサイトからデータ取得 print("start driver") #open tab #driver.find_element_by_tag_name('body').send_keys(newtab) driver.get(pageUrl) print("end driver") #HTMLは未使用にみえるが、文字列指定の形でevalで使用している HTML = lxml.html.fromstring(driver.page_source) #登録用物件辞書 bukkenDic = {} bukkenSetter = BukkenSetter() #mainルーチン # g is GROUP # u is UNIT # pcs is UNIT item #共通情報設定 yamlid = "website" bukkenDic.update({yamlid: yaml[yamlid]}) yamlid = "websiteURL" bukkenDic.update({yamlid: yaml[yamlid]}) #print("G1 --YAML[GROUPS] => YAML[GROUP]--:YAMLファイルからGROUPの検索条件を取得") for g in yaml: if g == "GROUP": gp = yaml[g]["PROTOCOL"] gc = yaml[g]["COMMAND"] gs = yaml[g]["SELECTOR"] #print("G2 --YAML[GROUP] => HTML[GROUPS]--:GROUP検索条件よりHTMLのGROUP群を抽出") groups = eval("HTML" + "." + gp + '("' + gc + '")' + gs) #print("G3 --HTML[GROUPS] => HTML[GROUP]--:HTMLのGROUP群を1つづつループ処理") for group in groups: #print("U1 --YAML[GROUP] => YAML[UNIT]--:YAMLファイルからUNITの検索条件を取得") for u in yaml[g]: if u == "UNIT": up = yaml[g][u]["PROTOCOL"] uc = yaml[g][u]["COMMAND"] us = yaml[g][u]["SELECTOR"] #print("U2 --YAML[UNIT] => HTML[UNITS]--:UNIT検索条件よりHTMLのUNIT群を抽出") #<div class="article-box clearfix"> units = eval("group" + "." + up + '("' + uc + '")' + us) #print("U3 --HTML[UNITS] => HTML[UNIT]--:HTMLのUNIT群を1つづつループ処理") for unit in units: #print("UI1--YAML[UNIT] => YAML[UNITITEMS]--:YAMLファイルからUNITITEM群の検索条件を取得") for uis in yaml[g][u]: if uis == "UNITITEMS": #print("UI2--YAML[UNITITEMS] => YAML[UNITITEM]--:YAMLファイルからUNITITEMの検索条件を取得") for ui in yaml[g][u][uis]: if ui != "IGNORE": p = yaml[g][u][uis][ui]["PROTOCOL"] c = yaml[g][u][uis][ui]["COMMAND"] s = yaml[g][u][uis][ui]["SELECTOR"] h = yaml[g][u][uis][ui]["HEADER"] #print("UI3 --YAML[UNITITEM] => HTML[UNITITEM]--:UNITITEM検索条件よりHTMLのUNITITEM情報を抽出") #print(ui+":"+htmlItemSelector(unit,p,c,s)) #登録用物件辞書に追加 bukkenDic.update({ ui: htmlItemSelector( unit, p, c, s, h) }) #物件情報設定 bukkeninfo = bukkenSetter.getBukkenInfoByDic( bukkenDic) bukkens.append(bukkeninfo) #️DBへ格納 dbio.insert(bukkens)
'''
Use selenium with PhantomJS to scrape pages that load content via AJAX.
Once the AJAX calls finish they modify certain elements on the page; those
elements cannot be fetched directly from the raw HTML, so the approach
below is needed.
'''
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Build the driver
driver = webdriver.PhantomJS(
    executable_path='/home/chin/company/program/phantomjs/bin/phantomjs')
driver.get('http://pvp.qq.com/web201605/herolist.shtml')
try:
    # 10 is the timeout in seconds
    # element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'loadedButton')))
    heros_node = driver.find_elements_by_xpath(
        '/html/body/div[3]/div/div/div[2]/div[2]/ul/li/a')
    hero_list = []
    url_list = []
    for node in heros_node:
        hero_list.append(node.text)
        url_list.append(node.get_attribute('href'))
    #17173 pvp
    data = pd.DataFrame({'hero': hero_list, 'url1': url_list})
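# A minimal sketch of the explicit-wait pattern the docstring above refers to,
# shown separately from the script. The target class name 'herolist' is an
# assumption used for illustration; it blocks until the AJAX-inserted node is
# present instead of relying on fixed sleeps.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

drv = webdriver.PhantomJS()
drv.get('http://pvp.qq.com/web201605/herolist.shtml')
hero_ul = WebDriverWait(drv, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'herolist')))
print(len(hero_ul.find_elements_by_tag_name('li')))
drv.quit()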
def get_issues_xml(web_url): driver = webdriver.PhantomJS( executable_path="C:/Users/khavaninzadeh/Desktop/phantomjs-2.1.1-windows/bin/phantomjs.exe") driver.get(web_url) if not get_webpage_lang(driver.find_element_by_tag_name('body').text): print("this journal is not in persian !!\n passing it ....") return False Issues = [] driver.maximize_window() links_total = len(driver.find_elements_by_xpath("//a[contains(@onclick, 'loadIssues')]")) if not links_total: return False print("all Volumes in this web page = ", str(links_total)) i = 1 for plus in driver.find_elements_by_xpath( "//a[contains(@onclick, 'loadIssues')]"): # browser.find_elements_by_class_name("dv_archive_vol"): try: if i == 1: print("Volume number ", i, "is found ") print("plus number ", i, " is NOT clicked cause it is a minus ! ") i += 1 time.sleep(10) else: print(' i = ', i) element = WebDriverWait(driver, 20).until( EC.element_to_be_clickable((By.XPATH, "//a[contains(@onclick, 'loadIssues')]"))) print("Volume number ", i, "is found ") # print(plus) plus.click() print("plus number ", i, " is clicked ", str(plus.click())) time.sleep(5) # plus.find_element_by_tag_name("a").click() i += 1 except Exception as exc: print("something went wrong ! in Volume number : ", i) print(exc) i += 1 soup = BeautifulSoup(driver.page_source, 'html.parser') # with open('PhJs_out.txt', 'w', encoding='utf8') as out: # for line in soup.prettify(): # out.write(line) # print("the page source is now in the file !") # print(soup) # driver.implicitly_wait(time_to_wait=5) c = 0 link_len = len(soup.findAll('div', {"class": "issue_dv"})) for ana in soup.findAll('div', {"class": "issue_dv"}): # print(ana) c = c + 1 Issues.append(ana) # for issue in Issues: # print("link = ", issue.find('a').get('href')) # print("all issues count = ", c) # print("web url is ... ", web_url) # print("issue link is ... ", issue_link) # # correct_url = urljoin(web_url, issue_link) # # print(correct_url) # name = "" # path_to_save = "./" # name = "Sweb_Volume_" # final_save_loc = "" # parse_object = urlparse(web_url) # base_url = "http://" + parse_object.netloc # soup2 = BeautifulSoup(request.urlopen(correct_url), 'html.parser') issue_number = 1 for issue in Issues: try: time.sleep(20) path = "./" issue_link = issue.find('a').get('href') parse_object = urlparse(web_url) base_url = "http://" + parse_object.netloc corrected_url = urljoin(web_url, issue_link) issue_soup = BeautifulSoup(request.urlopen(corrected_url), 'html.parser') issue_xml = issue_soup.findAll("a", attrs={"title": "XML"}, href=True) # finds the xml file print("Going to get : ", corrected_url) href = issue_xml[0].get('href') get_xml_url = urljoin(base_url, href) print('Directed to = > ', get_xml_url) except Exception as exp: print("an error occured : ", exp) with open("missed_journals.txt", 'a') as file: file.write(web_url + '\n') return False with open(path + str(issue_number) + '.xml', 'wb') as file: try: file.write(requests.get(get_xml_url).content) print("xml file ", issue_number, " is downloaded") except Exception as exp: print("there was a problem geting : ", issue_number, ".xml") print("going to save it in missed_xmls") missed_xmls.append(get_xml_url) with open('missed_xmls.txt', 'a') as missed_file: missed_file.writelines(get_xml_url) missed_file.write('\n') print(exp) issue_number += 1 print( "__________________________________________________________________________________________________________") return issue_number
def main():
    start_time = datetime.now()
    # Configuration
    service_args = [
        '--proxy=http://127.0.0.1:1087',  # proxy IP:port (e.g. 192.168.0.28:808)
        '--proxy-type=https',  # proxy type: http/https
        '--load-images=no',  # disable image loading (optional)
        '--disk-cache=yes',  # enable the disk cache (optional)
        '--ignore-ssl-errors=true'  # ignore https errors (optional)
    ]
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
    )

    # Log in and connect to the database
    global browser
    browser = webdriver.PhantomJS(service_args=service_args,
                                  desired_capabilities=dcap)
    global photo_query_hash
    photo_query_hash = 'bd0d6d184eefd4d0ce7036c11ae58ed9'
    login()
    cursor = connect_db()

    # user=Follower('luojunjie20','8091752170','false')
    toParseList = select_to_Parse(cursor)
    for toParse_user in toParseList:
        # print(toParse_user.name)
        if toParse_user.is_private == 'True':
            modify_flag = 'update followers set parsed="1" where follower_name="%s"' % toParse_user.name
            cursor.execute(modify_flag)
        else:
            try:
                person = get_followers(toParse_user)
            except:
                continue
            modify_flag_add(cursor, toParse_user, person)

    toParsePhotoList = get_user_to_parse_photos(cursor)
    global first_count
    first_count = 12
    global photo_count
    photo_count = 0
    base_path = '/Users/junjieluo/MyGit/instagram/instagram_photos'
    for user in toParsePhotoList:
        if user.is_private == 'True':
            modify_sen = 'update followers set photo_parsed="1" where follower_name="%s"' % user.name
            cursor.execute(modify_sen)
        else:
            url = 'https://www.instagram.com/graphql/query/?query_hash={}&variables=%7B%22id%22%3A%22{}%22%2C%22first%22%3A{}%7D'.format(
                photo_query_hash, user.id, first_count)
            try:
                urls = get_photo_urls(url, user, [])
            except:
                continue
            save_photo_in_database(cursor, user, urls)
            real_dir = create_dir(base_path, user)
            p = Pool()
            for url in urls:
                p.apply_async(save_photos, args=(url, real_dir))
            p.close()
            p.join()

    browser.quit()
    print('selenium已退出')
    end_time = datetime.now()
    print('本程序于{}时启动,于{}时关闭'.format(start_time, end_time))
#!/usr/bin/python

import os
import getpass

from selenium import webdriver

import login
import training
import squad

########################################################################################

driver = webdriver.PhantomJS("lib/phantomjs")

########################################################################################

if not os.path.exists("lib"):
    os.makedirs("lib")
if not os.path.exists("config"):
    os.makedirs("config")
if not os.path.exists("squad"):
    os.makedirs("squad")
if not os.path.exists("training_reports"):
    os.makedirs("training_reports")
if not os.path.exists("errors"):
    os.makedirs("errors")
if not os.path.exists("upload"):
    os.makedirs("upload")
if not os.path.exists("upload/training_reports"):
    os.makedirs("upload/training_reports")
if not os.path.exists("upload/squad"):
    os.makedirs("upload/squad")

if login.login(driver):
def open_pjs_wd(self):
    wd = webdriver.PhantomJS()
    wd.set_window_size(1280, 900)
    return wd
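# Hedged usage sketch: open_pjs_wd hands back a live PhantomJS driver, so the
# caller owns its lifecycle and should quit it. The surrounding class is not
# shown above, so a throwaway stub is defined here purely for illustration.
from selenium import webdriver

class _Example(object):
    def open_pjs_wd(self):
        wd = webdriver.PhantomJS()
        wd.set_window_size(1280, 900)
        return wd

wd = _Example().open_pjs_wd()
try:
    wd.get('http://example.com')
    print(wd.title)
finally:
    wd.quit()  # always release the PhantomJS process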
def __init__(self, *args, **kwargs):
    self.driver = webdriver.PhantomJS(
        executable_path='/usr/local/lib/node_modules/phantomjs-prebuilt/bin/phantomjs')
    super(JobspiderSpider, self).__init__(*args, **kwargs)
    self.start_urls = ['https://www.jobs.gov.hk']
            print("complete task ok, task=" + json.dumps(task))
        else:
            print("complete task error, rsp=" + json.dumps(rsp))
    except Exception as err:
        print(err)


if __name__ == "__main__":
    print("main")
    try:
        runCount = 1
        if len(sys.argv) == 2:
            runCount = int(sys.argv[1])
        print("##### RunCount={0}".format(runCount))
        driver = webdriver.PhantomJS('phantomjs.exe')
        for index in range(0, runCount, 1):
            print("##### Run: {0} / {1}".format(index, runCount))
            errorCode, t = getTask()
            if errorCode == "NO_MORE_TASK":
                break
            elif errorCode == "OK" and t != None:
                errorCode, html = fetchTask(driver, t)
                if errorCode == 'ROBOT':
                    print('Robot error, exit')
                    break
                elif errorCode == 'OK':
                    if html != None:
                        rc = sendPage(t, html)
def setUp(self):
    self.driver = webdriver.PhantomJS(
        service_args=['--webdriver-loglevel=DEBUG'])
    self.driver.set_window_size(1400, 1000)
    self.driver.implicitly_wait(2)
    if not os.environ.get('BIRDSEYE_SERVER_RUNNING'):
        Thread(target=lambda: app.run(port=7777)).start()
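# A possible tearDown counterpart (not shown in the original snippet):
# quitting the driver here avoids leaving orphaned phantomjs processes
# behind between test runs.
def tearDown(self):
    self.driver.quit()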
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

# Install our opener (note that this changes the global opener to the one
# we just made, but you can also just call opener.open() if you want)
urllib2.install_opener(opener)

# Build our Request object (supplying 'data' makes it a POST)
req = urllib2.Request(url, postData)

# Make the request and read the response
try:
    resp = urllib2.urlopen(req)
    web_pg = resp.read()
except urllib2.HTTPError, error:
    web_pg = error.read()
else:
    if not dynamicData:
        wd = webdriver.PhantomJS()
        wd.get(url)
        web_pg = wd.page_source
    else:
        wd = webdriver.PhantomJS()
        wd.get(url)
        listOfWebs = []
        ccyPairs = []
        if len(dynamicData) == 3:
            baseList = dynamicData[0]
            counterList = dynamicData[1]
            nameList = dynamicData[2]
            #1st loop for each baseCCY in baseList to all counterCCY in counterList
'''
Test script for iCampus auto-attendance.
For now it goes into Open Lectures - "국제어강의학습전략(인문사회과학도)" and clicks into the course.
The view count goes up by 1; wait 3 seconds and you can confirm that two browser windows are open.
Switch to the other window and take a screenshot, and the lecture player appears in it -
in other words, the script navigated to the lecture and pressed play.
'''
from selenium import webdriver
import time

id = input('아이디 :')
pw = input('패스워드 :')

# Uses PhantomJS - point this at your own PhantomJS location
driver = webdriver.PhantomJS(
    'C:\\Users\\박천욱\\Downloads\\phantomjs-2.1.1-windows\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
driver.get('http://www.icampus.ac.kr/front/main/MainAction.do?method=list')
delay = 3
driver.implicitly_wait(delay)

print('icampus로 로그인합니다.')
driver.find_element_by_name('uid').send_keys(id)
driver.find_element_by_name('pwd').send_keys(pw)
driver.find_element_by_xpath('//*[@id="mlogin01"]/div/a').click()
driver.implicitly_wait(delay)
print('로그인 완료\n\n')

# Move to Open Lectures
driver.find_element_by_xpath('//*[@id="mainmenu"]/li[3]/span/a').click()
# Select the specific lecture
driver.find_element_by_link_text("국제어강의학습전략(인문사회과학도)").click()
# Click to play the lecture
driver.find_element_by_xpath('//a[img/@src="/images/front/ko/icon_test.gif"]').click()
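# The header above describes switching to the newly opened player window and
# taking a screenshot; the snippet stops before that step, so this is a
# hedged sketch of how it could continue with the same driver object.
time.sleep(3)  # give the player window time to open
if len(driver.window_handles) > 1:
    driver.switch_to.window(driver.window_handles[-1])  # jump to the player window
    driver.save_screenshot('lecture_player.png')  # the player should be visible here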
import os

from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.keys import Keys

import config

print(os.path.abspath(config.PHANTOMJS_PATH))
driver = webdriver.PhantomJS(
    executable_path=os.path.abspath(config.PHANTOMJS_PATH))

# WebDriver will wait until the page has fully loaded (that is, the "onload" event has fired)
driver.get("http://www.python.org")
assert "Python" in driver.title

# find element by its class name
elem = driver.find_element_by_class_name(
    'slide-code').find_element_by_tag_name('code')
elem.screenshot('test.png')

# quit() exits the entire browser, whereas close() closes one tab - but if
# only one tab is open, close() ends the session as well
# driver.quit()
# driver.close()
passcode = "passcode"
browser = "firefox"  # phantomjs,firefox,chrome
#url = "http://google.com"
url = "http://172.18.4.1"
#delay = 5 # delay till the browser closes after sign in. The server needs to process your sign in

##########
# import #
from selenium import webdriver
import time
##########

# set your web driver
if browser == "phantomjs":
    driver = webdriver.PhantomJS()  # needs PhantomJS
elif browser == "firefox":
    driver = webdriver.Firefox()  # needs geckodriver
elif browser == "chrome":
    driver = webdriver.Chrome()  # needs N/A not sure if this works

# Open the page
driver.get(url)

# Find the Username box
driver.find_element_by_name("user").send_keys(username)

# Find the password box
driver.find_element_by_name("password").send_keys(passcode)

# Find the check box for terms of use
def setUpClass(cls):
    cls.selenium = webdriver.PhantomJS()
    super(LiveTests, cls).setUpClass()
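# Hedged sketch of the matching class-level teardown (not shown in the
# original): quit the PhantomJS session before the parent class tears down.
@classmethod
def tearDownClass(cls):
    cls.selenium.quit()
    super(LiveTests, cls).tearDownClass()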
    '*/*',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'
}

params = DesiredCapabilities.PHANTOMJS
# Set custom browser headers to get around anti-scraping checks
params['phantomjs.page.customHeaders'] = headers
driver = webdriver.PhantomJS(
    executable_path=
    r'D:\Python2\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe',
    desired_capabilities=params)


def parse_score(page_source):
    page = BeautifulSoup(page_source, 'lxml')
    academic_year = page.find('span', id="lbl_bt").text
    # Running total of credits
    credit_sum = 0
    # Running total of grade points
    grade_point_sum = 0
    # Grab each row of course data
    courses = page.find('table', id="Datagrid1").find('tbody').find_all('tr')
    print('- - - - - - - - - - %s - - - - - - - - - ' % academic_year)
    for i in range(1, len(courses)):
        # Get the semester number
import xlwt
from selenium import webdriver

home = 'CLE'
home_last_game = '201612250CLE'
last_game_qualifier = 'SUN VS GSW'
players = []
player_names_split = []
firstname_whitelist = ['James Michael']

# open phantom js session
browser = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
browser.maximize_window()

# navigate to team site and parse data
browser.get('http://www.basketball-reference.com/teams/' + home + '/2017.html')
player_numbers = browser.find_elements_by_xpath(
    "//*[@id='roster']/tbody/tr/th")
player_numbers = [number.text for number in player_numbers]
player_names = browser.find_elements_by_xpath(
    "//*[@id='roster']/tbody/tr/td[1]")
player_names = [name.text for name in player_names]
player_career = browser.find_elements_by_xpath(
    "//*[@id='roster']/tbody/tr/td[1]/a")
player_career = [career.get_attribute('href') for career in player_career]
player_career = dict(zip(player_names, player_career))

# create roster sheet
for idx, player_name in enumerate(player_names):
    player = {'first': '', 'last': '', 'number': ''}
    for whitelist_name in firstname_whitelist:
        if whitelist_name in player_name:
def crawl_repost(self, weiboid, pages=None):
    """
    Crawl repost information around a weibo.
    :param weiboid: weibo's id itself
    :param pages: pages to crawl
    :return: a list of repost information
    """
    if not self.test_cookies():
        if not self.login():
            return None
    driver = webdriver.PhantomJS()
    for cookie in self.cookies:
        driver.add_cookie(cookie)
    try:
        if pages is None:
            driver.get('https://weibo.cn/repost/' + weiboid)
            try:
                pages = int(
                    driver.find_element_by_xpath(
                        '//input[@name="mp"]').get_attribute('value'))
            except exceptions.NoSuchElementException:
                pages = 1
        reposters = []

        print("======获取原微博======")
        driver.get('https://weibo.cn/repost/' + weiboid)
        nickname = driver.find_element_by_xpath('//div[@id="M_"]//a').text
        uid = driver.find_element_by_xpath(
            '//div[@id="M_"]//a').get_attribute('href').split('/')[-1]
        content = driver.find_element_by_xpath('//div[@id="M_"]//span').text[1:]
        print(content)
        repost_info = {
            'from_weibo_id': None,
            'nickname': nickname,
            'uid': uid,
            'content': content,
            # 'from_uid': [repost_from_uid],
            'weibo_id': weiboid,
        }
        reposters.append(repost_info)

        print("======获取转发======")
        for i in range(1, pages + 1):
            driver.get('https://weibo.cn/repost/' + weiboid + '?page=' + str(i))
            repost_list = driver.find_elements_by_xpath('//div[@class="c"]')
            # print(len(repost_list))
            for repost in repost_list:
                if 'attitude' not in repost.get_attribute('innerHTML'):
                    continue
                reposter_info = repost.find_element_by_xpath('.//a')
                reposter_nickname = reposter_info.text
                reposter_uid = reposter_info.get_attribute('href').split("/")[-1]
                reposter_content = ":".join(repost.text.split(":")[1:])
                reposter_content = reposter_content[:reposter_content.find(
                    u'赞')].split("//@")[0]
                repost_weibo_id = repost.find_element_by_partial_link_text(
                    u'赞').get_attribute('href').split("/")[-2]
                print(repost_weibo_id, end=" ")
                if weiboid is not 'weibo.cn':
                    repost_info = {
                        'from_weibo_id': weiboid,
                        'nickname': reposter_nickname,
                        'uid': reposter_uid,
                        'content': reposter_content,
                        # 'from_uid': [repost_from_uid],
                        'weibo_id': repost_weibo_id,
                    }
                    reposters.append(repost_info)
        print()
    except Exception as e:
        return None
    driver.quit()
    return reposters