Example #1
def virgin_atlantic(origin, dest, searchdate, returndate, searchkey,
                    returnkey):
    driver = webdriver.PhantomJS(service_args=[
        '--ignore-ssl-errors=true', '--ssl-protocol=any', '--load-images=false'
    ])
    driver.set_window_size(1120, 1080)
    # sys.stdout=codecs.getwriter('utf-8')(sys.stdout)
    currentdatetime = datetime.datetime.now()
    stime = currentdatetime.strftime('%Y-%m-%d %H:%M:%S')
    # try:
    if not DEV_LOCAL:
        db = customfunction.dbconnection()
        cursor = db.cursor()
    dt = datetime.datetime.strptime(searchdate.strip(), '%m/%d/%Y')
    date = dt.strftime('%d/%m/%Y')
    if returndate != 'None' and returndate:
        dt1 = datetime.datetime.strptime(returndate, '%m/%d/%Y')
        retdate = dt1.strftime('%d/%m/%Y')
        url = "http://www.virgin-atlantic.com/us/en/book-your-travel/book-your-flight/flight-search-results.html?departure=" + origin + "&arrival=" + dest + "&adult=1&departureDate=" + str(
            date
        ) + "&search_type=redeemMiles&classType=10&classTypeReturn=10&bookingPanelLocation=Undefined&isreturn=yes&returnDate=" + str(
            retdate)
    else:
        url = "http://www.virgin-atlantic.com/us/en/book-your-travel/book-your-flight/flight-search-results.html?departure=" + origin + "&arrival=" + dest + "&adult=1&departureDate=" + str(
            date
        ) + "&search_type=redeemMiles&classType=10&classTypeReturn=10&bookingPanelLocation=BookYourFlight&isreturn=no"

    print url

    driver.get(url)
    time.sleep(2)
    html_page = driver.page_source

    soup = BeautifulSoup(html_page, "lxml")

    # except:
    #     print "before data page"
    #     print "searchkey",searchkey
    #     print "returnkey",returnkey
    #     if not DEV_LOCAL:
    #         storeFlag(searchkey,stime)
    #         if returnkey:
    #             storeFlag(returnkey,stime)
    #             print "return key stored"
    #     driver.quit()
    #     return

    def virgindata(tbody, keyid):
        recordcount = 1
        value_string = []
        try:
            # Direct and connecting flights sit in differently classed rows;
            # fall back to an empty list so the loop below always has input.
            trbody = (tbody.findAll("tr", {"class": "directRoute "})
                      or tbody.findAll("tr", {"class": "indirectRoute "})
                      or [])
        except:
            return keyid

        for row in trbody:
            econo = 0
            econotax = 0
            business = 0
            busstax = 0
            first = 0
            firsttax = 0
            stp = ''
            lyover = ''
            details = row.find("td", {"class": "flightSearchDetails"})  # error
            economy = ''
            #============= price block ================================================================
            if row.find("td",
                        {"class": "cellOption economy  hasLowestCostMessage"}):
                economy = row.find(
                    "td",
                    {"class": "cellOption economy  hasLowestCostMessage"})
            if economy == '' and row.find("td",
                                          {"class": "cellOption economy "}):
                economy = row.find("td", {"class": "cellOption economy "})
            if economy:
                "--------------economy--------------------------------"
                economy_price = economy.find("span", {"class": "price"})
                econprice1 = economy_price.text

                econprice = re.findall("\d+.\d+", econprice1)
                if len(econprice) > 0:
                    econo = econprice[0]
                    if ',' in econo:
                        econo = econo.replace(',', '')
                if len(econprice) > 1:
                    if "USD" not in econprice1:
                        cprice = 0
                        if ',' in econprice[1]:
                            cprice = econprice[1].replace(',', '')
                        else:
                            cprice = econprice[1]
                        currency_symbol = (re.findall("[a-zA-Z]+", econprice1))
                        currencychange = urllib.urlopen(
                            "https://www.exchangerate-api.com/%s/%s/%f?k=e002a7b64cabe2535b57f764"
                            % (currency_symbol[1], "USD", float(cprice)))
                        chaged_result = currencychange.read()
                        econotax = chaged_result
                    else:
                        econotax = econprice[1]
            pre_economy = ''
            if row.find("td", {"class": "cellOption premEconomy "}):
                pre_economy = row.find("td",
                                       {"class": "cellOption premEconomy "})
            if pre_economy == '' and row.find(
                    "td",
                {"class": "cellOption premEconomy  hasLowestCostMessage"}):
                pre_economy = row.find(
                    "td",
                    {"class": "cellOption premEconomy  hasLowestCostMessage"})
            if pre_economy:
                "--------------pre economy--------------------------------"
                pre_economy_price = pre_economy.find("span",
                                                     {"class": "price"})
                pre_economy = pre_economy_price.text
                #print pre_economy
                pre_econo_price = re.findall("\d+.\d+", pre_economy)
                if len(pre_econo_price) > 0:
                    business = pre_econo_price[0]
                    if ',' in business:
                        business = business.replace(',', '')
                if len(pre_econo_price) > 1:
                    if "USD" not in pre_economy:
                        eprice = 0
                        if ',' in pre_econo_price[1]:
                            eprice = pre_econo_price[1].replace(',', '')
                        else:
                            eprice = pre_econo_price[1]
                        currency_symbol = (re.findall("[a-zA-Z]+",
                                                      pre_economy))
                        currencychange = urllib.urlopen(
                            "https://www.exchangerate-api.com/%s/%s/%f?k=e002a7b64cabe2535b57f764"
                            % (currency_symbol[1], "USD", float(eprice)))
                        chaged_result = currencychange.read()
                        busstax = chaged_result
                    else:
                        busstax = pre_econo_price[1]
                    #print "pre_econotax",busstax
            upper_class = ''
            if row.find("td", {"class": "cellOption upperclass  last"}):
                "--------------upper class--------------------------------"
                upper_class = row.find(
                    "td", {"class": "cellOption upperclass  last"})
            else:
                if row.find("td", {
                        "class":
                        "cellOption upperclass  last hasLowestCostMessage"
                }):
                    upper_class = row.find(
                        "td", {
                            "class":
                            "cellOption upperclass  last hasLowestCostMessage"
                        })
            if upper_class:
                upper_class_price = upper_class.find("span",
                                                     {"class": "price"})
                upperclass_price = upper_class_price.text
                upperprice = re.findall("\d+.\d+", upperclass_price)
                if len(upperprice) > 0:
                    first = upperprice[0]
                    if ',' in first:
                        first = first.replace(',', '')
                if len(upperprice) > 1:
                    if "USD" not in upperclass_price:
                        uprice = 0
                        if ',' in upperprice[1]:
                            uprice = upperprice[1].replace(',', '')
                        else:
                            uprice = upperprice[1]
                        currency_symbol = (re.findall("[a-zA-Z]+",
                                                      upperclass_price))
                        currencychange = urllib.urlopen(
                            "https://www.exchangerate-api.com/%s/%s/%f?k=e002a7b64cabe2535b57f764"
                            % (currency_symbol[1], "USD", float(uprice)))
                        chaged_result = currencychange.read()
                        firsttax = chaged_result
                    else:
                        firsttax = upperprice[1]

            #============================= end price block =========================================================
            sourcestn = ''
            destinationstn = ''
            depttime = ''
            arivaltime = ''
            total_duration = ''
            heading = details.find("ul")
            depart = heading.find("li", {"class": "depart"})
            departinfo = depart.findAll("p")
            if len(departinfo) > 0:
                depttime = departinfo[0].text
                departfrom1 = departinfo[1].text
                if 'from' in departfrom1:
                    departfrom = (departfrom1.replace('from', '')).strip()
                    if '(' in departfrom:
                        departfrom1 = departfrom.split('(')
                        sourcestn = departfrom1[1].replace(')', '')
            arive = heading.find("li", {"class": "arrive"})
            ariveinfo = arive.findAll("p")
            if len(ariveinfo) > 0:
                arivaltime = ariveinfo[0].text
                if '+' in arivaltime:
                    arivaltimesplit = arivaltime.split('+')
                    arivaltime = arivaltimesplit[0]
                ariveat1 = ariveinfo[1].text
                if 'at' in ariveat1:
                    ariveat = (ariveat1.replace('at', '')).strip()
                    if '(' in ariveat:
                        ariveat2 = ariveat.split('(')
                        destinationstn = ariveat2[1].replace(')', '')
            stop = heading.find("li", {"class": "stops"})
            durations = heading.find("li", {"class": "duration"})
            stoppage = stop.text
            if '0' in stoppage:
                stp = "NONSTOP"
            elif '1' in stoppage:
                stp = "1 STOP"
            elif '2' in stoppage:
                stp = "2 STOPS"
            elif '3' in stoppage:
                stp = "3 STOPS"
            total_duration = (durations.text).strip()
            if 'Duration' in total_duration:
                total_duration = (total_duration.replace('Duration',
                                                         '')).strip()
            '''
            #print "total_duration",total_duration
            operator = details.find("dl",{"class":"operator"})
            operatedby = (operator.find("dd").text).strip()
            print "operatedby",operatedby
            '''
            #===============================details block====================================================
            details_block = details.find("div", {"class": "tooltip"})
            details_tr = details_block.findAll("tr")
            counter = 0
            departdlist = []
            arivelist = []
            planelist = []
            operatedby = []
            departdetails = ''
            arivedetails = ''
            planedetails = ''
            operatedbytext = ''
            while (counter < len(details_tr)):
                #print "counter",counter
                from_to = details_tr[counter].find("td",
                                                   {"class": "flightDetails"})
                operator = from_to.find("span", {"class": "operator"}).text
                operatedby.append(operator)
                #print "operator",operator
                from_to1 = from_to.find("span", {"class": "flightFromTo"}).text
                departing_from = ''
                ariving_at = ''
                departing_date = ''
                detaildetptime = ''
                detailarivetime = ''
                deptextraday = ''
                ariveextraday = ''
                if 'to' in from_to1:
                    from_to1 = from_to1.split('to')
                    departing_from = from_to1[0]
                    if '\n' in departing_from:
                        departing_from1 = departing_from.split("\n")
                        departing_from = departing_from1[0].strip(
                        ) + " " + departing_from1[1].strip()
                    #print "departing_from",departing_from
                    ariving_at = from_to1[1]
                    if '\n' in ariving_at:
                        ariving_at1 = ariving_at.split("\n")
                        ariving_at = ariving_at1[0].strip(
                        ) + " " + ariving_at1[1].strip()
                    #print "ariving_at",ariving_at
                departing_date = from_to.find("span", {
                    "class": "fullDate"
                }).text
                if 'Departing' in departing_date:
                    departing_date = (departing_date.replace('Departing',
                                                             '')).strip()
                counter = counter + 1
                departtime = details_tr[counter].find("td",
                                                      {"class": "departs"})
                fl_dept_time = departtime.find("span",
                                               {"class": "flightDeparts"})
                detaildetptime = fl_dept_time.text
                if departtime.find("span", {"class": "extraDays"}):
                    extradeptdate = departtime.find("span",
                                                    {"class": "extraDays"})
                    deptextraday = extradeptdate.text
                    nod = re.findall("\d+.\d+", deptextraday)
                    #print "nod",nod
                    if "+1" in deptextraday:
                        deptextraday = "+1 day"
                    elif "+2" in deptextraday:
                        deptextraday = "+2 day"
                    elif "+3" in deptextraday:
                        deptextraday = "+3 day"
                arivetime = details_tr[counter].find("td",
                                                     {"class": "arrives"})
                fl_arive_time = arivetime.find("span",
                                               {"class": "flightArrives"})
                detailarivetime = fl_arive_time.text
                if arivetime.find("span", {"class": "extraDays"}):
                    extra_ariveday = arivetime.find("span",
                                                    {"class": "extraDays"})
                    ariveextraday = extra_ariveday.text

                duration = details_tr[counter].find("td",
                                                    {"class": "duration"})
                fl_duration1 = duration.find("span",
                                             {"class": "flightDuration"})
                fl_duration = (fl_duration1.text).strip()
                fl_flightno = ''
                planeno = ''
                # use the current segment's row (details_tr[1] would always
                # repeat the first segment's flight number)
                flight_no = details_tr[counter].find("td", {"class": "number"})
                fl_flightno1 = flight_no.find("span",
                                              {"class": "flightNumber"})
                planeno = (''.join(fl_flightno1.find('br').next_siblings))
                fl_flightno = (fl_flightno1.text).replace(planeno, '')

                departinfo_time = departing_date + " " + detaildetptime
                departinfo_time = datetime.datetime.strptime(
                    departinfo_time, '%A %d %B %Y %H:%M')
                departinfo_time = departinfo_time.strftime('%Y/%m/%d %H:%M')

                airport_ = customfunction.get_airport_detail(
                    get_airport_code(departing_from)) or departing_from
                deptdetail = departinfo_time + " | from " + airport_
                departdlist.append(deptdetail)

                departinfo_time = departing_date + " " + detailarivetime
                departinfo_time = datetime.datetime.strptime(
                    departinfo_time, '%A %d %B %Y %H:%M')
                departinfo_time = departinfo_time.strftime('%Y/%m/%d %H:%M')

                airport_ = customfunction.get_airport_detail(
                    get_airport_code(ariving_at)) or ariving_at
                arivedetail = departinfo_time + " | at " + airport_
                arivelist.append(arivedetail)
                planetext = fl_flightno + " | " + planeno + " (" + fl_duration + ")"
                planelist.append(planetext)
                counter = counter + 1
            departdetails = '@'.join(departdlist)
            arivedetails = '@'.join(arivelist)
            planedetails = ('@'.join(planelist)).strip()
            operatedbytext = '@'.join(operatedby)

            value_string.append(
                (fl_flightno, str(keyid), stime, stp, lyover, sourcestn,
                 destinationstn, depttime, arivaltime, total_duration,
                 str(econo), str(econotax), str(business), str(busstax),
                 str(first), str(firsttax), "Economy", "Business", "First",
                 "virgin_atlantic", departdetails, arivedetails, planedetails,
                 operatedbytext))
            recordcount = recordcount + 1
            # Flush to the DB in batches of 50 rows; guard on DEV_LOCAL like
            # the final flush below so local runs never touch the cursor.
            if recordcount > 50 and not DEV_LOCAL:
                cursor.executemany(
                    "INSERT INTO pexproject_flightdata (flighno,searchkeyid,scrapetime,stoppage,stoppage_station,origin,destination,departure,arival,duration,maincabin,maintax,firstclass,firsttax,business,businesstax,cabintype1,cabintype2,cabintype3,datasource,departdetails,arivedetails,planedetails,operatedby) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);",
                    value_string)
                db.commit()
                value_string = []
                recordcount = 1
        if len(value_string) > 0:
            if not DEV_LOCAL:
                cursor.executemany(
                    "INSERT INTO pexproject_flightdata (flighno,searchkeyid,scrapetime,stoppage,stoppage_station,origin,destination,departure,arival,duration,maincabin,maintax,firstclass,firsttax,business,businesstax,cabintype1,cabintype2,cabintype3,datasource,departdetails,arivedetails,planedetails,operatedby) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);",
                    value_string)
                db.commit()
            else:
                print value_string
        #driver.quit()

    tbody = soup.findAll("tbody", {"class": "flightStatusTbody"})
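    # tbody[0] holds the outbound leg's results; tbody[1] (when present) the return leg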
    if searchkey:
        if len(tbody) > 0:
            virgindata(tbody[0], searchkey)
        if not DEV_LOCAL:
            storeFlag(searchkey, stime)
    if returnkey:
        if len(tbody) > 1:
            virgindata(tbody[1], returnkey)
        if not DEV_LOCAL:
            storeFlag(returnkey, stime)
    driver.quit()
    return searchkey
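
# A hypothetical invocation (all values are illustrative only: dates use the
# '%m/%d/%Y' format the function parses, keys are database row ids):
#
#   virgin_atlantic('JFK', 'LHR', '04/01/2018', 'None', 101, None)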
Example #2
def __init__(self):
    '''
        !!! FOR WINDOWS USERS
    '''
    # self.driver = webdriver.PhantomJS(executable_path="c:/phantomjs-2.1.1-windows/bin/phantomjs.exe")
    self.driver = webdriver.PhantomJS(executable_path="C:/Users/flipp/phantomjs-2.1.1-windows/bin/phantomjs.exe")
Example #3
    u'康定': u'KGT',
    u'台中': u'RMQ'
}

cityList = [u'北京', u'广州']
dateList = ['2018-04-20', '2018-04-21']

for dept_city in cityList:
    for arv_city in cityList:
        if dept_city != arv_city:
            dept_city_code = cityToCodeList[dept_city]
            arv_city_code = cityToCodeList[arv_city]
            for date in dateList:
                url = 'https://m.ctrip.com/html5/flight/swift/domestic/' + dept_city_code + '/' + arv_city_code + '/' + date
                # url = 'https://m.ctrip.com/html5/flight/swift/index'
                driver = webdriver.PhantomJS()
                driver.maximize_window()
                driver.implicitly_wait(2)
                print('Waiting...')
                driver.get(url)
                print('Waiting...')
                execute(10, driver, 3)
                flight = driver.find_elements_by_css_selector(
                    "div[id^=flight_]")
                name = []

                remote_cookies = driver.get_cookies()

                local_cookies = {}

                for each in remote_cookies:
Example #4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
import requests as rq
import re
import csv

# Using PhantomJS to render the webpage
driver = webdriver.PhantomJS(
    executable_path=
    "/Users/rosegaray/Desktop/phantomjs-2.1.1-macosx/bin/phantomjs")
driver.set_window_size(1920, 1080)

driver.get("http://salaryguide.diamondbacklab.com/#/salGuide?year=2017")
#2017 is 1029
#2016 is 1010
#2015 is 1022
#2014 is 1251
#2013 is 1210

# Wait for site to load
wait = WebDriverWait(driver, 20)

# Stores each page's data
data = {}

for page in range(1029):
    # Grab "tbody" tag
Example #5
def check_flights():
    URL = "https://www.google.com/flights/explore/#explore;f=JFK,EWR,LGA;t=HND,NRT,TPE,HKG,KIX;s=1;li=8;lx=12;d=2018-04-01"

    dcap = dict(DesiredCapabilities.PHANTOMJS)

    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"
    )
    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                 executable_path="/usr/local/bin/phantomjs")
    driver.implicitly_wait(20)
    driver.get(URL)
    wait = WebDriverWait(driver, 20)
    wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "div.CTPFVNB-w-e")))

    s = BeautifulSoup(driver.page_source, "lxml")

    best_price_tags = s.findAll('div', 'CTPFVNB-w-e')

    # check if scrape worked - alert if it fails and shutdown
    if len(best_price_tags) < 4:
        print('Failed to Load Page Data')
        requests.post(
            'https://maker.ifttt.com/trigger/fare_alert/with/key/API_TOKEN',
            data={
                "value1": "script",
                "value2": "failed",
                "value3": ""
            })
        sys.exit(0)
    else:
        print('Successfully Loaded Page Data')

    best_prices = []
    for tag in best_price_tags:
        best_prices.append(int(tag.text.replace('$', '')))

    best_price = best_prices[0]

    best_height_tags = s.findAll('div', 'CTPFVNB-w-f')
    best_heights = []
    for t in best_height_tags:
        best_heights.append(
            float(t.attrs['style'].split('height:')[1].replace('px;', '')))

    best_height = best_heights[0]

    # price per pixel of height
    pph = np.array(best_price) / np.array(best_height)

    cities = s.findAll('div', 'CTPFVNB-w-o')

    hlist = []
    for bar in cities[0]\
            .findAll('div', 'CTPFVNB-w-x'):
        hlist.append(
            float(bar['style'].split('height: ')[1].replace('px;', '')) * pph)

    fares = pd.DataFrame(hlist, columns=['price'])
    px = [x for x in fares['price']]
    ff = pd.DataFrame(px, columns=['fare']).reset_index()

    # begin the clustering
    X = StandardScaler().fit_transform(ff)
    db = DBSCAN(eps=1.5, min_samples=1).fit(X)
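    # With min_samples=1 every point is a core point, so nothing is labeled
    # noise; eps=1.5 is measured in standardized (z-score) units.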

    labels = db.labels_
    clusters = len(set(labels))

    pf = pd.concat([ff, pd.DataFrame(db.labels_, columns=['cluster'])], axis=1)

    rf = pf.groupby('cluster')['fare'].agg(['min', 'count'
                                            ]).sort_values('min',
                                                           ascending=True)

    # set up our rules
    # must have more than one cluster
    # cluster min must be equal to lowest price fare
    # cluster size must be less than 10th percentile
    # cluster must be $100 less the next lowest-priced cluster
    if clusters > 1 and ff['fare'].min() == rf.iloc[0]['min']\
            and rf.iloc[0]['count'] < rf['count'].quantile(.10)\
            and rf.iloc[0]['min'] + 100 < rf.iloc[1]['min']:
        city = s.find('span', 'CTPFVNB-v-c').text
        fare = s.find('div', 'CTPFVNB-w-e').text
        r = requests.post(
            'https://maker.ifttt.com/trigger/fare_alert/with/key/API_TOKEN',
            data={
                "value1": city,
                "value2": fare,
                "value3": ""
            })
    else:
        print('no alert triggered')
Example #6
def run_crawler(url, inst, user, pwd):

    # Initialize Webdriver and Input Login Information
    print("Initializing crawler...")
    browser = webdriver.PhantomJS()

    browser.get(url)

    institution = Select(browser.find_element_by_name("inst"))
    username = browser.find_element_by_name("instkey")
    password = browser.find_element_by_name("perskey")
    submit = browser.find_element_by_xpath("//input[@type='button']")

    sleep(randint(2, 10))
    institution.select_by_value(str(inst))
    username.send_keys(str(user))
    password.send_keys(str(pwd))
    submit.click()
    print("Login complete.")

    # Administration Page

    sleep(randint(2, 10))
    administration = browser.find_element_by_xpath(
        "//input[@value='Administration']")
    administration.click()
    print("Step 1 complete.")

    # Next Page

    sleep(randint(2, 10))
    database = browser.find_element_by_xpath(
        "//input[@value = 'Database Search']")
    database.click()
    print("Step 2 complete.")

    # Database Page

    sleep(randint(2, 10))
    search = browser.find_element_by_name("sf_aq")
    submit_query = browser.find_element_by_xpath(
        "//input[@value='Submit Search']")

    search.send_keys(query.run_query(query.query_dict))
    submit_query.click()
    print("Input complete.")

    # Get Xpath

    get_xpath = browser.page_source
    get_xpath = BeautifulSoup(get_xpath, 'lxml')
    get_xpath = get_xpath.find_all(
        "a", attrs={"href": re.compile("javascript:subViewResult")})

    xpath_list = []

    for item in get_xpath:
        xpath_list.append(item.get_text())

    xpath_no = xpath_list[xpath_list.index('NEXT>') - 1]
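    # The pager link just before 'NEXT>' carries the index of the last
    # results link, which is spliced into the XPath below.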

    print("Found Xpath Key.")

    # Get CT Evaluation

    show_results = browser.find_element_by_xpath(
        '//*[@id="pagedone"]/p[2]/table[1]/tbody/tr[3]/td/center/a[' +
        xpath_no + ']')
    show_results.click()

    html = browser.page_source
    get_data = BeautifulSoup(html, 'lxml')

    table = get_data.find("div", attrs={'id': 'pagedone'})
    table = table.find("tbody")
    text_list = []

    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [element.text.strip() for element in cols]
        text_list.append([element for element in cols
                          if element])  # Get rid of empty values

    text_list = [item for item in text_list if len(item) > 1]
    text_list = text_list[2:]
    text_list = [[word.replace('-', '') for word in item]
                 for item in text_list]
    text_list = [item for item in text_list if len(item) > 2]

    headers = [
        "Study ID", "SR", "Last Name", "First Name", "MRN", "Intake", "IR",
        "CT Evaluation", "CR"
    ]
    df = pd.DataFrame(text_list, columns=headers)
    df.drop(["SR", "IR", "CR"], axis=1, inplace=True)

    df[df == ''] = np.NaN
    df = df.fillna(method='ffill')

    df["Intake"] = pd.to_datetime(df["Intake"])
    df["CT Evaluation"] = pd.to_datetime(df["CT Evaluation"])

    df = df.sort_values(by="CT Evaluation", ascending=False)

    df.to_csv("update.csv")

    print("CT Evaluations scraped.")

    # Download Data

    sleep(randint(2, 10))
    download = browser.find_element_by_xpath("//input[@value='Download Data']")
    download.click()
    print("Download complete.")

    # Close Crawler
    sleep(randint(2, 10))
    if os.path.exists(credentials.path_cwd + "/update.csv"):
        browser.quit()
        print("Crawling complete.")

    return
Example #7
class FlightCard:
    def __init__(self, airline, departure_time, arrival_time):
        self.airline = airline
        self.departure_time = departure_time
        self.arrival_time = arrival_time


def noti_slack(message):
    token = WHALE_BOT_TOKEN
    slack = Slacker(token)
    slack.chat.post_message(CHANNEL, message)


# Choose a driver
# driver = webdriver.Chrome('/Users/deplax/Downloads/chromedriver')
driver = webdriver.PhantomJS('/Users/whale/Downloads/phantomjs')

# Driver settings
driver.set_page_load_timeout(30)

# Fetch the page.
driver.get(
    'https://store.naver.com/flights/results/domestic?trip=OW&scity1=CJU&ecity1=GMP&sdate1=2018.10.19.&adult=1&child=0&infant=0&fareType=YC&airlineCode=&nxQuery=항공권'
)
time.sleep(10)

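# Scroll to the bottom repeatedly so the lazily-loaded flight results render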
for x in range(15):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)

contents = driver.page_source.encode('utf-8')
Example #8
#!/usr/bin/env python3
# coding:utf-8

# ********** Required modules ************
# pip install selenium
# Download PhantomJS from http://phantomjs.org/download.html
# ****************************************

from selenium import webdriver
from selenium.webdriver.common.by import By
import json
import time

# Scraper for shuangseqiu (Double Color Ball) lottery draws
driver = webdriver.PhantomJS(
    executable_path='D:/Programs/phantomjs/bin/phantomjs')

if __name__ == '__main__':
    urls = []
    items = []
    # Starting page
    start_url = "http://kaijiang.500.com/shtml/ssq/03001.shtml"
    driver.get(start_url)
    links = driver.find_elements_by_css_selector(
        'body > div.wrap > div.kj_main01 > div.kj_main01_right > div.kjxq_box02 > div.kjxq_box02_title > div.kjxq_box02_title_right > span > div > a'
    )
    for a in links:
        url = a.get_attribute("href")
        urls.append(url)
        print(url)
    print("URLs loaded...")
from selenium import webdriver

driver = webdriver.PhantomJS(executable_path='/opt/phantomjs/bin/phantomjs')
driver.get('http://pythonscraping.com')
driver.implicitly_wait(1)
print(driver.get_cookies())

savedCookies = driver.get_cookies()

driver2 = webdriver.PhantomJS(executable_path='/opt/phantomjs/bin/phantomjs')
driver2.get('http://pythonscraping.com')
driver2.delete_all_cookies()
for cookie in savedCookies:
    driver2.add_cookie(cookie)

driver2.get('http://pythonscraping.com')
driver2.implicitly_wait(1)
print(driver2.get_cookies())
print(savedCookies == driver2.get_cookies())
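# Note: add_cookie only accepts cookies for the domain currently loaded, which
# is why driver2 first visits the site before wiping its cookies and restoring
# the saved ones; the final print is True when the round trip preserved them all.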
Example #10
def get_phantomjs_path(phantomjs_path=None):
    # if phantomjs_path is provided, use it as PATH
    if phantomjs_path:
        return phantomjs_path

    # Download Phantomjs Binary if not exist
    def download_phantomjs(filename):
        # Download PhantomJS Binary
        file_path = os.path.join(TMP_DIR, filename)
        if not os.path.exists(file_path):
            print("::Download PhantomJS::")
            response = requests.get(phantomjs_url, stream=True)
            f = open(file_path, "wb+")
            for chunk in tqdm(response.iter_content(chunk_size=1024)):
                if chunk:
                    f.write(chunk)
            f.close()
            print("::Download Finish::")
        return file_path

    if not os.path.exists(os.path.join(TMP_DIR, 'tmp')):
        os.makedirs(os.path.join(TMP_DIR, 'tmp'))
    try:
        # Check 'phantomjs' in Executable PATH
        webdriver.PhantomJS()
        return 'phantomjs'
    except WebDriverException as e:
        # No 'phantomjs' in PATH
        if 'PATH' not in str(e):
            raise e
        os_name = platform.system()
        if os_name.lower() == 'windows':
            print("::OS Detected - Windows::")
            phantomjs_url = 'https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-windows.zip'
            filename = 'phantomjs-2.1.1-windows.zip'
            file_path = download_phantomjs(filename)
        elif os_name.lower() == 'linux':
            print("::OS Detected - Linux::")
            phantomjs_url = 'https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2'
            filename = 'phantomjs-2.1.1-linux-x86_64.tar.bz2'
            file_path = download_phantomjs(filename)
        elif os_name.lower() == 'darwin':
            print("::OS Detected - macOS::")
            phantomjs_url = 'https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-macosx.zip'
            filename = 'phantomjs-2.1.1-macosx.zip'
            file_path = download_phantomjs(filename)
        else:
            raise Exception('Automatic PhantomJS download is not supported on "{}".\n'
                            'You can download phantomjs and add it to PATH yourself.\n'
                            'Download link: http://phantomjs.org/download.html'.format(os_name))

        if filename.endswith('zip'):
            folder_name = filename.replace('.zip', '')
            file = zipfile.ZipFile(file_path)
            try:
                file.extract(folder_name + '/bin/phantomjs', TMP_DIR)
                phantom_path = os.path.join(TMP_DIR, folder_name + '/bin/phantomjs')
                os.chmod(phantom_path, 0o755)  # Fix permission (octal mode)
                return phantom_path
            except KeyError as e:
                if 'windows' not in str(e):
                    raise e
                file.extract(folder_name + '/bin/phantomjs.exe', TMP_DIR)
                return os.path.join(TMP_DIR, folder_name + '/bin/phantomjs.exe')
        elif filename.endswith('tar.bz2'):
            folder_name = filename.replace('.tar.bz2', '')
            file = tarfile.open(file_path, 'r:bz2')
            file.extract(folder_name + '/bin/phantomjs', TMP_DIR)
            phantom_path = os.path.join(TMP_DIR, folder_name + '/bin/phantomjs')
            os.chmod(phantom_path, 0o755)  # Fix permission (octal mode)
            return phantom_path
        else:
            raise Exception('File Name is not zip or tar.bz2')
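
# A hypothetical usage sketch (assumes webdriver is imported in this module):
#
#   path = get_phantomjs_path()
#   driver = webdriver.PhantomJS(executable_path=path)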
Example #11
# -*- coding:utf-8 -*-
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pymongo
import re
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup

from lxml import etree
# Create a browser
# browser = webdriver.Chrome()
# Create a headless browser instead
browser = webdriver.PhantomJS(
    service_args=['--load-images=false', '--disk-cache=true'])
# Give up after 50 s with no response
wait = WebDriverWait(browser, 50)
# Set a window size (needed even for a headless browser)
browser.set_window_size(1400, 900)


def search():
    """
    此函数的作用为完成首页点击搜索的功能,替换标签可用于其他网页使用
    :return:
    """
    #访问页面
    browser.get('https://www.jd.com/')
    try:
        #选择到京东首页的输入框
Example #12
    def get_mkdir(self):
        jsonobj = json.loads(self.get_html().decode('utf-8'))
        # Listing page - image
        imgList = jsonpath.jsonpath(jsonobj, '$..img')
        # Listing page - price
        pricelist = jsonpath.jsonpath(jsonobj, '$..price')
        # Listing page - product title
        titleList = jsonpath.jsonpath(jsonobj, '$..title')
        # Listing page - product id (skuId)
        skuIdList = jsonpath.jsonpath(jsonobj, '$..promotionInfo.skuId')
        # Product price
        priceList = jsonpath.jsonpath(jsonobj, '$..price')
        # Product brand
        brandList = jsonpath.jsonpath(jsonobj, '$..brandName')
        # Product category
        categoryList = jsonpath.jsonpath(jsonobj, '$..thirdCatName')
        listdata = zip(titleList, imgList, pricelist, skuIdList, priceList,
                       brandList, categoryList)

        for item in listdata:

            print(item)

            # Replace '/' so the title can be used as a directory name
            import re
            strinfo = re.compile('/')
            itemdir = strinfo.sub('-', item[0])
            print(itemdir)
            time.sleep(1)
            # Directory named after the product
            if not os.path.exists(itemdir):
                os.makedirs(itemdir)
            else:
                print(itemdir + ' -- directory already exists!')
            self.dataurl = ''
            # Local link for the main listing image
            self.pimg = ''
            # Listing page - image

            # File and folder names must not contain these 9 characters: / \ : * " < > | ?

            if os.path.exists(itemdir + '/' + item[1][-20:].replace(
                    '/', '-').replace('\\', '-').replace(':', '-').replace(
                        '*', '-').replace('"', '-').replace('<', '-').replace(
                            '>', '-').replace('|', '-').replace('?', '-') +
                              '.webp'):
                print('File already exists!')
                # return 0
            else:

                if item[1].startswith('//'):
                    self.dataurl = "http:" + item[1]
                else:
                    self.dataurl = item[1]
                try:
                    req = request.Request(self.dataurl, headers=self.headers)
                    reponse = request.urlopen(req)
                    get_img = reponse.read()
                    self.pimg = '/pimgs/' + itemdir + '/' + self.dataurl[
                        -20:].replace('/', '-').replace('\\', '-').replace(
                            ':', '-').replace('*', '-').replace(
                                '"', '-').replace('<', '-').replace(
                                    '>', '-').replace('|', '-').replace(
                                        '?', '-') + '.webp'
                    with open(
                            itemdir + '/' +
                            self.dataurl[-20:].replace('/', '-').replace(
                                '\\', '-').replace(':', '-').replace(
                                    '*', '-').replace('"', '-').replace(
                                        '<', '-').replace('>', '-').replace(
                                            '|', '-').replace('?', '-') +
                            '.webp', 'wb') as fp:
                        fp.write(get_img)
                except Exception as e:
                    print(e)
            # Detail-image directory
            if not os.path.exists(itemdir + '/详情'):
                os.makedirs(itemdir + '/详情')
            else:
                print('详情' + ' -- directory already exists!')
            driver = webdriver.PhantomJS(
                executable_path='./phantomjs-2.1.1-macosx/bin/phantomjs')
            time.sleep(5)
            driver.get(self.detailurl + str(item[3]))
            time.sleep(5)
            driver.find_element_by_class_name('tipinfo').click()
            time.sleep(5)
            html = etree.HTML(driver.page_source)
            imglist = html.xpath('//img/@src')
            print(self.detailurl + str(item[3]))
            # Carousel images
            lunfantu = html.xpath('//img[@class="detail-img"]/@src')
            # "You may also like" recommendations
            # like = html.xpath('//img[@class="J_ItemImage recommend-img"]/@src')
            # Product promo images
            xuanchuan = html.xpath(
                '//div[@class="J_descriptionDetail parameter"]//img/@src')
            # Specs
            # Parameter names (left column)
            leftspec = html.xpath(
                '//div[@class="left attr_key border-1px border-r border-b"]/text()'
            )
            # Parameter values (right column)
            rightspec = html.xpath(
                '//div[@class="left attr_value border-1px border-b"]/span/text()'
            )
            spec = zip(leftspec, rightspec)
            # time.sleep(5)
            # print(driver.page_source)
            print(str(item[3]))
            print(
                "-------------------------- carousel images --------------------------------"
            )
            print(lunfantu)
            print(
                "--------------------------- specs ---------------------------------"
            )
            print(spec)
            print(
                "-------------------------- promo images ---------------------------------"
            )
            print(xuanchuan)
            print(
                "-------------------------- main image ---------------------------------"
            )
            print(self.dataurl)

            for simple in imglist:
                if not os.path.exists(
                        itemdir + '/详情/' + simple[-20:].replace('/', '-').
                        replace('\\', '-').replace(':', '-').replace('*', '-').
                        replace('"', '-').replace('<', '-').replace('>', '-').
                        replace('|', '-').replace('?', '-') + '.webp'):
                    request.urlretrieve(
                        simple, itemdir + '/详情' +
                        '/' + simple[-20:].replace('/', '-').replace(
                            '\\', '-').replace(':', '-').replace(
                                '*', '-').replace('"', '-').replace(
                                    '<', '-').replace('>', '-').replace(
                                        '|', '-').replace('?', '-') + ".webp")
                    print("正在下载......")
                else:
                    print('文件已存在!')

                #   table schema fragment: id INT NOT NULL AUTO_INCREMENT,
                #   title VARCHAR(1000), img VARCHAR(1000), lunfanimg VARCHAR(1000),
                #   spec VARCHAR(1000), xcimg VARCHAR(1000), ...
            # Insert into the database
        # Skip the insert if the skuId is already in the database; otherwise insert it

            result = self.cur.execute(
                "select skuid from duodian WHERE skuid=" + str(item[3]))
            print(str(result) + '-----------------------')

            if result:
                print("数据库里面存在此数据")
            else:
                # 不存在,存数据
                lunfantu1 = {}
                specpagram = {}
                xuanchuan1 = {}
                # 轮番图
                for index1, item1 in enumerate(lunfantu):
                    lunfantu1[index1] = item1
                # 规格
                speckey = 0
                for itemspec in spec:
                    specvalue = str(itemspec[0]) + '-' + str(itemspec[1])
                    specpagram[str(speckey)] = specvalue
                    speckey += 1
                # 介绍图
                for index3, item3 in enumerate(xuanchuan):
                    xuanchuan1[index3] = item3
                # 存储本地图片链接地址
                plunfantu = {}
                pxuanchuan = {}
                for pindex1, pitem1 in enumerate(lunfantu):
                    plunfantu[pindex1] = '/pimgs/' + itemdir + '/详情/' + pitem1[
                        -20:].replace('/', '-').replace('\\', '-').replace(
                            ':', '-').replace('*', '-').replace(
                                '"', '-').replace('<', '-').replace(
                                    '>', '-').replace('|', '-').replace(
                                        '?', '-') + '.webp'
                for pindex2, pitem2 in enumerate(xuanchuan):
                    pxuanchuan[
                        pindex2] = '/pimgs/' + itemdir + '/详情/' + pitem2[
                            -20:].replace('/', '-').replace('\\', '-').replace(
                                ':', '-').replace('*', '-').replace(
                                    '"', '-').replace('<', '-').replace(
                                        '>', '-').replace('|', '-').replace(
                                            '?', '-') + '.webp'
                self.cur.execute(
                    'INSERT INTO ' + self.tablename +
                    ' (title, img, lunfanimg, spec, xcimg,skuid,pimg, plunfanimg, pxcimg,categoryid,price,brandname,categoryname) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s,%s, %s, %s,%s)',
                    (itemdir, self.dataurl,
                     json.dumps(lunfantu1, ensure_ascii=False),
                     json.dumps(specpagram, ensure_ascii=False),
                     json.dumps(xuanchuan1, ensure_ascii=False), str(item[3]),
                     self.pimg, json.dumps(plunfantu, ensure_ascii=False),
                     json.dumps(pxuanchuan, ensure_ascii=False), '11386',
                     '%.2f' % (item[4] / 100), str(item[5]), str(item[6])))
                self.cur.connection.commit()
                print(
                    "------------------------  insert succeeded  ----------------------------------"
                )
Example #13
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
import time

url = "http://www.indiavotes.com/pc/info?eid=16&state=0"
driver = webdriver.PhantomJS(
    r"G:\study\software\phantomjs-2.1.1-windows\phantomjs-2.1.1-windows\bin\phantomjs.exe"
)
driver.get(url)

# This will get the initial html - before javascript
html1 = driver.page_source
soup = BeautifulSoup(html1, "lxml")
a_tag = soup.find_all('a')
c = soup.find_all('td', attrs={'class': 'tal sorting_1'})
c = c[:-13]
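# the slice above drops the table's last 13 entries (presumably trailing
# non-constituency rows)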
print(c)
#z=str(c[1].find_all('a')[0])
#z
#z=z[9:55]

#z.replace("'",'')

#for i in c:
#   z=''
#   z=str(i.find_all('a')[0])
#  z=z[9:55]
# z=z.replace('"','')
# print(z)
Example #14
async def getBukkensFromYamlInPage(yaml, pageUrl):
    # Create a DB I/O instance for property records
    dbio = dbiomaker()
    # Holds the property listings fetched from the website
    bukkens = []
    # PhantomJS is invoked differently in dev and production, so branch on hostname
    if os.uname()[1] == "kira-no-MacBook-Air.local":
        driver = webdriver.PhantomJS(
            executable_path='/Applications/phantomjs-1.9.2-macosx/bin/phantomjs'
        )
    else:
        driver = webdriver.PhantomJS()

    # Key chord that opens a new tab
    newtab = Keys.CONTROL + 't'
    # On macOS the key chord differs
    if sys.platform == 'darwin':
        newtab = Keys.COMMAND + 't'

    # Fetch data from the website
    print("start driver")
    #open tab
    #driver.find_element_by_tag_name('body').send_keys(newtab)
    driver.get(pageUrl)
    print("end driver")
    # HTML looks unused, but eval() below references it by name inside a string
    HTML = lxml.html.fromstring(driver.page_source)

    # Property dict used for registration
    bukkenDic = {}
    bukkenSetter = BukkenSetter()

    # main routine
    # g is GROUP
    # u is UNIT
    # pcs is UNIT item
    # Common fields
    yamlid = "website"
    bukkenDic.update({yamlid: yaml[yamlid]})
    yamlid = "websiteURL"
    bukkenDic.update({yamlid: yaml[yamlid]})
    #print("G1 --YAML[GROUPS] => YAML[GROUP]--:YAMLファイルからGROUPの検索条件を取得")
    for g in yaml:
        if g == "GROUP":
            gp = yaml[g]["PROTOCOL"]
            gc = yaml[g]["COMMAND"]
            gs = yaml[g]["SELECTOR"]
            #print("G2 --YAML[GROUP] => HTML[GROUPS]--:GROUP検索条件よりHTMLのGROUP群を抽出")
            groups = eval("HTML" + "." + gp + '("' + gc + '")' + gs)
            #print("G3 --HTML[GROUPS] => HTML[GROUP]--:HTMLのGROUP群を1つづつループ処理")
            for group in groups:
                #print("U1 --YAML[GROUP] => YAML[UNIT]--:YAMLファイルからUNITの検索条件を取得")
                for u in yaml[g]:
                    if u == "UNIT":
                        up = yaml[g][u]["PROTOCOL"]
                        uc = yaml[g][u]["COMMAND"]
                        us = yaml[g][u]["SELECTOR"]
                        #print("U2 --YAML[UNIT] => HTML[UNITS]--:UNIT検索条件よりHTMLのUNIT群を抽出")
                        #<div class="article-box clearfix">
                        units = eval("group" + "." + up + '("' + uc + '")' +
                                     us)
                        #print("U3 --HTML[UNITS] => HTML[UNIT]--:HTMLのUNIT群を1つづつループ処理")
                        for unit in units:
                            #print("UI1--YAML[UNIT] => YAML[UNITITEMS]--:YAMLファイルからUNITITEM群の検索条件を取得")
                            for uis in yaml[g][u]:
                                if uis == "UNITITEMS":
                                    #print("UI2--YAML[UNITITEMS] => YAML[UNITITEM]--:YAMLファイルからUNITITEMの検索条件を取得")
                                    for ui in yaml[g][u][uis]:
                                        if ui != "IGNORE":
                                            p = yaml[g][u][uis][ui]["PROTOCOL"]
                                            c = yaml[g][u][uis][ui]["COMMAND"]
                                            s = yaml[g][u][uis][ui]["SELECTOR"]
                                            h = yaml[g][u][uis][ui]["HEADER"]
                                            #print("UI3 --YAML[UNITITEM] => HTML[UNITITEM]--:UNITITEM検索条件よりHTMLのUNITITEM情報を抽出")
                                            #print(ui+":"+htmlItemSelector(unit,p,c,s))
                                            #登録用物件辞書に追加
                                            bukkenDic.update({
                                                ui:
                                                htmlItemSelector(
                                                    unit, p, c, s, h)
                                            })
                                    # Assemble the property record
                                    bukkeninfo = bukkenSetter.getBukkenInfoByDic(
                                        bukkenDic)
                                    bukkens.append(bukkeninfo)
    # Store in the DB
    dbio.insert(bukkens)
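
# Note on the pattern above: the YAML supplies PROTOCOL/COMMAND/SELECTOR
# strings that are concatenated and run through eval() against the lxml tree,
# so the YAML file fully drives which nodes get scraped. Flexible, but eval()
# executes whatever the YAML contains, so the YAML must be trusted input.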
Example #15
'''
Simulates scraping an AJAX page with selenium and PhantomJS.
The AJAX calls modify certain page elements after load, so reading those
elements directly does not work -- the approach below is needed.
'''
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Build the driver
driver = webdriver.PhantomJS(
    executable_path='/home/chin/company/program/phantomjs/bin/phantomjs')
driver.get('http://pvp.qq.com/web201605/herolist.shtml')
try:
    # 10 is the timeout in seconds
    # element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'loadedButton')))
    heros_node = driver.find_elements_by_xpath(
        '/html/body/div[3]/div/div/div[2]/div[2]/ul/li/a')

    hero_list = []
    url_list = []
    for node in heros_node:
        hero_list.append(node.text)
        url_list.append(node.get_attribute('href'))

    #17173 pvp
    data = pd.DataFrame({'hero': hero_list, 'url1': url_list})
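
# A minimal sketch of the explicit-wait pattern the docstring above recommends,
# based on the commented-out WebDriverWait line (the id 'loadedButton' is a
# placeholder from that line, not a real id on this page):
#
#   element = WebDriverWait(driver, 10).until(
#       EC.presence_of_element_located((By.ID, 'loadedButton')))
#   print(element.text)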
Example #16
def get_issues_xml(web_url):
    driver = webdriver.PhantomJS(
        executable_path="C:/Users/khavaninzadeh/Desktop/phantomjs-2.1.1-windows/bin/phantomjs.exe")
    driver.get(web_url)
    if not get_webpage_lang(driver.find_element_by_tag_name('body').text):
        print("this journal is not in persian !!\n passing it ....")
        return False
    Issues = []

    driver.maximize_window()
    links_total = len(driver.find_elements_by_xpath("//a[contains(@onclick, 'loadIssues')]"))

    if not links_total:
        return False
    print("all Volumes in this web page = ", str(links_total))
    i = 1

    for plus in driver.find_elements_by_xpath(
            "//a[contains(@onclick, 'loadIssues')]"):  # browser.find_elements_by_class_name("dv_archive_vol"):
        try:
            if i == 1:
                print("Volume number  ", i, "is found ")
                print("plus number ", i, " is NOT clicked cause it is a minus !  ")
                i += 1
                time.sleep(10)
            else:
                print(' i = ', i)
                element = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[contains(@onclick, 'loadIssues')]")))
                print("Volume number  ", i, "is found ")
                # print(plus)
                plus.click()
                print("plus number ", i, " is clicked")
                time.sleep(5)  # plus.find_element_by_tag_name("a").click()
                i += 1
        except Exception as exc:
            print("something went wrong ! in Volume number : ", i)

            print(exc)
            i += 1

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # with open('PhJs_out.txt', 'w', encoding='utf8') as out:
    #     for line in soup.prettify():
    #         out.write(line)
    # print("the page source is now in the file !")
    # print(soup)
    # driver.implicitly_wait(time_to_wait=5)

    c = 0
    link_len = len(soup.findAll('div', {"class": "issue_dv"}))

    for ana in soup.findAll('div', {"class": "issue_dv"}):
        # print(ana)
        c = c + 1
        Issues.append(ana)

    # for issue in Issues:
    #     print("link = ", issue.find('a').get('href'))

    # print("all issues count =  ", c)
    # print("web url is ... ", web_url)
    # print("issue link is ... ", issue_link)
    #
    # correct_url = urljoin(web_url, issue_link)
    #
    # print(correct_url)

    # name = ""
    # path_to_save = "./"
    # name = "Sweb_Volume_"
    # final_save_loc = ""
    # parse_object = urlparse(web_url)
    # base_url = "http://" + parse_object.netloc
    # soup2 = BeautifulSoup(request.urlopen(correct_url), 'html.parser')

    issue_number = 1
    for issue in Issues:
        try:
            time.sleep(20)
            path = "./"
            issue_link = issue.find('a').get('href')
            parse_object = urlparse(web_url)
            base_url = "http://" + parse_object.netloc
            corrected_url = urljoin(web_url, issue_link)

            issue_soup = BeautifulSoup(request.urlopen(corrected_url), 'html.parser')

            issue_xml = issue_soup.findAll("a", attrs={"title": "XML"}, href=True)  # finds the xml file
            print("Going to get : ", corrected_url)
            href = issue_xml[0].get('href')
            get_xml_url = urljoin(base_url, href)
            print('Directed to = > ', get_xml_url)
        except Exception as exp:
            print("an error occured : ", exp)
            with open("missed_journals.txt", 'a') as file:
                file.write(web_url + '\n')
                return False
        with open(path + str(issue_number) + '.xml', 'wb') as file:
            try:
                file.write(requests.get(get_xml_url).content)
                print("xml file ", issue_number, " is downloaded")
            except Exception as exp:
                print("there was a problem geting : ", issue_number, ".xml")
                print("going to save it in missed_xmls")
                missed_xmls.append(get_xml_url)
                with open('missed_xmls.txt', 'a') as missed_file:
                    missed_file.writelines(get_xml_url)
                    missed_file.write('\n')
                print(exp)

        issue_number += 1
        print(
            "__________________________________________________________________________________________________________")

    return issue_number
Example #17
def main():
    start_time = datetime.now()
    # Configuration
    service_args = [
        '--proxy=http://127.0.0.1:1087',  # proxy IP:port (e.g. 192.168.0.28:808)
        '--proxy-type=https',  # proxy type: http/https
        '--load-images=no',  # disable image loading (optional)
        '--disk-cache=yes',  # enable the disk cache (optional)
        '--ignore-ssl-errors=true'  # ignore HTTPS errors (optional)
    ]

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
    )

    # Log in and connect to the database
    global browser
    browser = webdriver.PhantomJS(service_args=service_args,
                                  desired_capabilities=dcap)

    global photo_query_hash
    photo_query_hash = 'bd0d6d184eefd4d0ce7036c11ae58ed9'
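    # query_hash identifies a specific GraphQL query in Instagram's frontend
    # bundle; such hashes change when the site is updated.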

    login()
    cursor = connect_db()
    #user=Follower('luojunjie20','8091752170','false')
    toParseList = select_to_Parse(cursor)
    for toParse_user in toParseList:
        #print(toParse_user.name)
        if toParse_user.is_private == 'True':
            modify_flag = 'update followers set parsed="1" where follower_name="%s"' % toParse_user.name
            cursor.execute(modify_flag)
        else:
            try:
                person = get_followers(toParse_user)
            except:
                continue

            modify_flag_add(cursor, toParse_user, person)

    toParsePhotoList = get_user_to_parse_photos(cursor)

    global first_count
    first_count = 12
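    # page size for the GraphQL media query (12 posts per request)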

    global photo_count
    photo_count = 0
    base_path = '/Users/junjieluo/MyGit/instagram/instagram_photos'
    for user in toParsePhotoList:
        if user.is_private == 'True':
            modify_sen = 'update followers set photo_parsed="1" where follower_name="%s"' % user.name
            cursor.execute(modify_sen)
        else:
            url = 'https://www.instagram.com/graphql/query/?query_hash={}&variables=%7B%22id%22%3A%22{}%22%2C%22first%22%3A{}%7D'.format(
                photo_query_hash, user.id, first_count)
            try:
                urls = get_photo_urls(url, user, [])
            except:
                continue
            save_photo_in_database(cursor, user, urls)
            real_dir = create_dir(base_path, user)
            p = Pool()
            for url in urls:
                p.apply_async(save_photos, args=(url, real_dir))
            p.close()
            p.join()

    browser.quit()
    print('selenium has quit')
    end_time = datetime.now()
    print('This program started at {} and finished at {}'.format(start_time, end_time))
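The per-user download fan-out in main() uses a multiprocessing Pool; in isolation the pattern looks like this sketch (save_photos here is a stand-in for the author's helper, which is not shown):

from multiprocessing import Pool

def save_photos(url, real_dir):
    # stand-in: download url into real_dir
    print('would download', url, 'into', real_dir)

if __name__ == '__main__':
    p = Pool()
    for url in ['http://example.com/a.jpg', 'http://example.com/b.jpg']:
        p.apply_async(save_photos, args=(url, '/tmp'))
    p.close()  # no more tasks
    p.join()   # wait for all downloads to finish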
Esempio n. 18
0
#!/usr/bin/python

import os
import getpass
from selenium import webdriver
import login
import training
import squad

########################################################################################
driver = webdriver.PhantomJS("lib/phantomjs")
########################################################################################

for d in ("lib", "config", "squad", "training_reports", "errors",
          "upload", "upload/training_reports", "upload/squad"):
    if not os.path.exists(d):
        os.makedirs(d)

if login.login(driver):
Esempio n. 19
0
 def open_pjs_wd(self):
     wd = webdriver.PhantomJS()
     wd.set_window_size(1280, 900)
     return wd
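A minimal sketch of how such a helper is typically used, assuming PhantomJS is on PATH; the surrounding class is not shown in the snippet, so the helper is inlined here as a plain function:

from selenium import webdriver

def open_pjs_wd():
    wd = webdriver.PhantomJS()
    wd.set_window_size(1280, 900)
    return wd

wd = open_pjs_wd()
wd.get('http://example.com')
print(wd.title)
wd.quit()  # release the PhantomJS process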
Esempio n. 20
0
  def __init__(self, *args, **kwargs):
      self.driver = webdriver.PhantomJS(executable_path='/usr/local/lib/node_modules/phantomjs-prebuilt/bin/phantomjs')
  
      super(JobspiderSpider, self).__init__(*args, **kwargs)
 
      self.start_urls = ['https://www.jobs.gov.hk']
Esempio n. 21
0
            print("complete task ok, task=" + json.dumps(task))
        else:
            print("complete task error, rsp=" + json.dumps(rsp))
    except Exception as err:
        print(err)


if __name__ == "__main__":
    print("main")
    try:
        runCount = 1
        if len(sys.argv) == 2:
            runCount = int(sys.argv[1])
        print("##### RunCount={0}".format(runCount))

        driver = webdriver.PhantomJS('phantomjs.exe')

        for index in range(0, runCount, 1):
            print("##### Run: {0} / {1}".format(index, runCount))

            errorCode, t = getTask()
            if errorCode == "NO_MORE_TASK":
                break
            elif errorCode == "OK" and t != None:
                errorCode, html = fetchTask(driver, t)
                if errorCode == 'ROBOT':
                    print('Robot error, exit')
                    break
                elif errorCode == 'OK':
                    if html is not None:
                        rc = sendPage(t, html)
Esempio n. 22
0
 def setUp(self):
     self.driver = webdriver.PhantomJS(service_args=['--webdriver-loglevel=DEBUG'])
     self.driver.set_window_size(1400, 1000)
     self.driver.implicitly_wait(2)
     if not os.environ.get('BIRDSEYE_SERVER_RUNNING'):
         Thread(target=lambda: app.run(port=7777)).start()
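The setUp above starts the app in a background thread once per process; a matching tearDown, assumed to follow the usual unittest convention, would release the browser:

def tearDown(self):
    # close the PhantomJS session opened in setUp
    self.driver.quit()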
Esempio n. 23
0
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            # Install our opener (note that this changes the global opener to the one
            # we just made, but you can also just call opener.open() if you want)
            urllib2.install_opener(opener)
            # Build our Request object (supplying 'data' makes it a POST)
            req = urllib2.Request(url, postData)

            # Make the request and read the response
            try:
                resp = urllib2.urlopen(req)
                web_pg = resp.read()
            except urllib2.HTTPError as error:
                web_pg = error.read()
    else:
        if not dynamicData:
            wd = webdriver.PhantomJS()
            wd.get(url)
            web_pg = wd.page_source
        else:
            wd = webdriver.PhantomJS()
            wd.get(url)

            listOfWebs = []
            ccyPairs = []

            if len(dynamicData) == 3:
                baseList = dynamicData[0]
                counterList = dynamicData[1]
                nameList = dynamicData[2]

                #1st loop for each baseCCY in baseList to all counterCCY in counterList
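The loop announced by the last comment is truncated, but it pairs every base currency with every counter currency; a sketch of that cross product, reusing the baseList/counterList/ccyPairs names from the snippet (the list values are illustrative):

baseList = ['USD', 'EUR']
counterList = ['JPY', 'GBP']
ccyPairs = [base + counter
            for base in baseList
            for counter in counterList
            if base != counter]
print(ccyPairs)  # ['USDJPY', 'USDGBP', 'EURJPY', 'EURGBP']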
Esempio n. 24
0
'''
Test for automatic iCampus attendance.
For now this one goes into Open Lectures - "국제어강의학습전략(인문사회과학도)"
and clicks the course.

The view count goes up by 1, and
if you wait 3 seconds and check, you can see that two windows are open.
Switch windows and take a screenshot, and the lecture player shows up in it,
i.e. the script found the lecture and pressed play.
'''

from selenium import webdriver
import time

id = input('ID: ')
pw = input('Password: ')
driver = webdriver.PhantomJS('C:\\Users\\박천욱\\Downloads\\phantomjs-2.1.1-windows\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')    # using PhantomJS; set this to your own PhantomJS location

driver.get('http://www.icampus.ac.kr/front/main/MainAction.do?method=list')
delay=3
driver.implicitly_wait(delay)

print('Logging in to icampus.')
driver.find_element_by_name('uid').send_keys(id)
driver.find_element_by_name('pwd').send_keys(pw)
driver.find_element_by_xpath('//*[@id="mlogin01"]/div/a').click()
driver.implicitly_wait(delay)
print('Login complete\n\n')

driver.find_element_by_xpath('//*[@id="mainmenu"]/li[3]/span/a').click()    #공개강의로 이동
driver.find_element_by_link_text("국제어강의학습전략(인문사회과학도)").click()  #세부 강의 선택
driver.find_element_by_xpath('//a[img/@src="/images/front/ko/icon_test.gif"]').click()  #강의 재생 클릭
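The docstring says the player opens in a second window that a screenshot can confirm; the script stops before that step, so here is a hedged sketch of it (the file name is illustrative):

time.sleep(3)  # give the player window time to open
driver.switch_to.window(driver.window_handles[-1])  # jump to the newest window
driver.save_screenshot('lecture_player.png')
driver.quit()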
Esempio n. 25
0
import os

from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.keys import Keys

import config

print(os.path.abspath(config.PHANTOMJS_PATH))
driver = webdriver.PhantomJS(
    executable_path=os.path.abspath(config.PHANTOMJS_PATH))
#  WebDriver will wait until the page has fully loaded (that is, the “onload” event has fired)
driver.get("http://www.python.org")
assert "Python" in driver.title
# find element by its class name
elem = driver.find_element_by_class_name(
    'slide-code').find_element_by_tag_name('code')
elem.screenshot('test.png')
# quit() exits the entire browser, whereas close() only closes the current tab;
# with a single tab open, close() exits the browser as well.
# driver.quit()
# driver.close()
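Since both teardown calls are commented out above, PhantomJS keeps running after the script ends; a minimal sketch of the usual guard around the same navigation, reusing the driver created above:

try:
    driver.get("http://www.python.org")
    assert "Python" in driver.title
finally:
    driver.quit()  # always release the PhantomJS process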
Esempio n. 26
0
username = "username"
passcode = "passcode"

browser = "firefox"  # phantomjs,firefox,chrome
#url = "http://google.com"
url = "http://172.18.4.1"
#delay = 5 # delay till the browser closes after sign in. The server needs to process your sign in
##########

# import #
from selenium import webdriver
import time
##########

#set your web driver
if browser == "phantomjs":
    driver = webdriver.PhantomJS()  #needs PhantomJS
elif browser == "firefox":
    driver = webdriver.Firefox()  #needs geckodriver
elif browser == "chrome":
    driver = webdriver.Chrome()  #needs chromedriver; not sure if this works

#Open the page
driver.get(url)

#Find the Username box
driver.find_element_by_name("user").send_keys(username)

#Find the password box
driver.find_element_by_name("password").send_keys(passcode)

#Find the check box for terms of use
Esempio n. 27
0
 def setUpClass(cls):
     cls.selenium = webdriver.PhantomJS()
     super(LiveTests, cls).setUpClass()
Esempio n. 28
0
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'
}
params = DesiredCapabilities.PHANTOMJS

# set custom request headers on the browser to evade anti-crawler checks
params['phantomjs.page.customHeaders'] = headers
driver = webdriver.PhantomJS(
    executable_path=
    r'D:\Python2\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe',
    desired_capabilities=params)


def parse_score(page_source):
    page = BeautifulSoup(page_source, 'lxml')
    academic_year = page.find('span', id="lbl_bt").text
    # initialize the total credits
    credit_sum = 0
    # initialize the total grade points
    grade_point_sum = 0
    # grab each course row from the table
    courses = page.find('table', id="Datagrid1").find('tbody').find_all('tr')
    print('- - - - - - - - - - %s - - - - - - - - - ' % academic_year)
    for i in range(1, len(courses)):
        # get the semester number
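parse_score is cut off here, but its counters point at the standard credit-weighted GPA; a sketch of that arithmetic, under the assumption that each parsed row yields a (credit, grade_point) pair:

def weighted_gpa(rows):
    # rows: iterable of (credit, grade_point) pairs, one per course
    credit_sum = sum(credit for credit, _ in rows)
    grade_point_sum = sum(credit * gp for credit, gp in rows)
    return grade_point_sum / credit_sum if credit_sum else 0.0

print(weighted_gpa([(3, 4.0), (2, 3.5)]))  # (12.0 + 7.0) / 5 = 3.8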
Esempio n. 29
0
import xlwt
from selenium import webdriver

home = 'CLE'
home_last_game = '201612250CLE'
last_game_qualifier = 'SUN VS GSW'
players = []
player_names_split = []
firstname_whitelist = ['James Michael']

# open phantom js session
browser = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
browser.maximize_window()

# navigate to team site and parse data
browser.get('http://www.basketball-reference.com/teams/' + home + '/2017.html')
player_numbers = browser.find_elements_by_xpath(
    "//*[@id='roster']/tbody/tr/th")
player_numbers = [number.text for number in player_numbers]
player_names = browser.find_elements_by_xpath(
    "//*[@id='roster']/tbody/tr/td[1]")
player_names = [name.text for name in player_names]
player_career = browser.find_elements_by_xpath(
    "//*[@id='roster']/tbody/tr/td[1]/a")
player_career = [career.get_attribute('href') for career in player_career]
player_career = dict(zip(player_names, player_career))
# create roster sheet
for idx, player_name in enumerate(player_names):
    player = {'first': '', 'last': '', 'number': ''}
    for whitelist_name in firstname_whitelist:
        if whitelist_name in player_name:
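The roster loop is truncated, but the whitelist hints at its job: two-word first names like 'James Michael' must not be split at the first space. A sketch of that split logic:

def split_name(player_name, whitelist):
    # keep whitelisted two-word first names intact
    for first in whitelist:
        if player_name.startswith(first):
            return first, player_name[len(first):].strip()
    first, _, last = player_name.partition(' ')
    return first, last

print(split_name('James Michael McAdoo', ['James Michael']))  # ('James Michael', 'McAdoo')
print(split_name('LeBron James', ['James Michael']))          # ('LeBron', 'James')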
Esempio n. 30
0
    def crawl_repost(self, weiboid, pages=None):
        """
        Crawl repost information around a weibo.

        :param weiboid: weibo's id itself
        :param pages: pages to crawl
        :return: a list of repost information
        """

        if not self.test_cookies():
            if not self.login():
                return None

        driver = webdriver.PhantomJS()
        for cookie in self.cookies:
            driver.add_cookie(cookie)

        try:
            if pages is None:
                driver.get('https://weibo.cn/repost/' + weiboid)
                try:
                    pages = int(driver.find_element_by_xpath('//input[@name="mp"]').get_attribute('value'))
                except exceptions.NoSuchElementException:
                    pages = 1

            reposters = []
            print("======获取原微博======")
            driver.get('https://weibo.cn/repost/' + weiboid)
            nickname = driver.find_element_by_xpath('//div[@id="M_"]//a').text
            uid = driver.find_element_by_xpath('//div[@id="M_"]//a').get_attribute('href').split('/')[-1]
            content = driver.find_element_by_xpath('//div[@id="M_"]//span').text[1:]
            print(content)
            repost_info = {
                'from_weibo_id': None,
                'nickname': nickname,
                'uid': uid,
                'content': content,
                # 'from_uid': [repost_from_uid],
                'weibo_id': weiboid,
            }
            reposters.append(repost_info)

            print("======获取转发======")
            for i in range(1, pages + 1):
                driver.get('https://weibo.cn/repost/' + weiboid + '?page=' + str(i))
                repost_list = driver.find_elements_by_xpath('//div[@class="c"]')
                # print(len(repost_list))
                for repost in repost_list:
                    if 'attitude' not in repost.get_attribute('innerHTML'):
                        continue

                    reposter_info = repost.find_element_by_xpath('.//a')
                    reposter_nickname = reposter_info.text
                    reposter_uid = reposter_info.get_attribute('href').split("/")[-1]
                    reposter_content = ":".join(repost.text.split(":")[1:])
                    reposter_content = reposter_content[:reposter_content.find(u'赞')].split("//@")[0]
                    repost_weibo_id = repost.find_element_by_partial_link_text(u'赞').get_attribute('href').split("/")[-2]
                    print(repost_weibo_id, end=" ")

                    if weiboid != 'weibo.cn':
                        repost_info = {
                            'from_weibo_id': weiboid,
                            'nickname': reposter_nickname,
                            'uid': reposter_uid,
                            'content': reposter_content,
                            # 'from_uid': [repost_from_uid],
                            'weibo_id': repost_weibo_id,
                        }
                        reposters.append(repost_info)
            print()
        except Exception:
            # quit the driver even on failure so PhantomJS does not leak
            driver.quit()
            return None
        driver.quit()
        return reposters
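A hypothetical call site for crawl_repost, kept as comments because the enclosing crawler class is not shown and the weibo id is a placeholder:

# crawler = WeiboCrawler()                  # hypothetical enclosing class
# reposts = crawler.crawl_repost('Abc123')  # 'Abc123' is a placeholder weibo id
# if reposts:
#     for r in reposts:
#         print(r['nickname'], r['weibo_id'])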