def _get_ff_profile(self, ff_profile_dir):
    """Resolve ``ff_profile_dir`` into a Firefox profile object.

    A value that is already a ``FirefoxProfile`` is returned unchanged. A
    value that ``is_falsy`` reports as falsy yields a fresh default profile.
    Anything else is passed to ``webdriver.FirefoxProfile`` as its argument.
    """
    if not isinstance(ff_profile_dir, FirefoxProfile):
        if is_falsy(ff_profile_dir):
            ff_profile_dir = webdriver.FirefoxProfile()
        else:
            ff_profile_dir = webdriver.FirefoxProfile(ff_profile_dir)
    return ff_profile_dir
#!/usr/bin/env python

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import time, sys, os

# NOTE: the Firefox user profile path varies from computer to computer.
profile = webdriver.FirefoxProfile("/Users/js/Library/Application Support/Firefox/Profiles/l40nl96j.default")
driver = webdriver.Firefox(profile)
# Let element lookups keep retrying for up to 30 seconds before failing.
driver.implicitly_wait(30)
base_url = "http://irsa.ipac.caltech.edu/"

# One query string per line of the file named by the last command-line
# argument; tmp[:-1] drops the final character of each line (normally the
# trailing newline).
lines=[tmp[:-1] for tmp in open(sys.argv[-1],'r').readlines()]

# Submit each query string to the search form, one page load per query.
for gal in lines:

  # Python 2 print statement; the trailing comma suppresses the newline.
  print gal,
  driver.get(base_url + "/data/SPITZER/Enhanced/SEIP/")
  window_start = driver.window_handles[0]
  # Replace the contents of the "locstr" field with the query, then click
  # the element named "region".
  driver.find_element_by_name("locstr").clear()
  driver.find_element_by_name("locstr").send_keys(gal)
  driver.find_element_by_name("region").click()
#  time.sleep(30)
  # NOTE(review): index 1 assumes the click opened a second window that is
  # already listed here -- confirm; otherwise this raises IndexError.
  window_after = driver.window_handles[1]
  driver.switch_to_window(window_after)
  page=driver.page_source.encode("utf-8")
  # NOTE(review): the body of this branch is missing from this excerpt.
  if 'NOTIFICATION' in page:
'''
使用Chrome“检查”功能找到源地址还十分容易。但是有一些网站非常复杂,例如前面的天猫产品评论,使用“检查”功能很难找到调用的网页地址。
除此之外,有一些数据真实地址的URL也十分冗长和复杂,有些网站为了规避这些抓取会对地址进行加密,造成其中的一些变量让人摸不着头脑。
'''
'''
动态网页抓取——方法二:使用浏览器渲染引擎。直接用浏览器在显示网页时解析HTML,应用CSS样式并执行JavaScript的语句
'''

from selenium import webdriver

# Machine-specific Firefox profile directory (raw string, so the backslashes
# are kept literally).
profile_directory = R"C:\Users\R\AppData\Roaming\Mozilla\Firefox\Profiles\yjdic0n5.default"
profile = webdriver.FirefoxProfile(profile_directory)
driver = webdriver.Firefox(profile)

driver.get('http://www.santostang.com/2018/07/04/hello-world/')
# The page's JavaScript renders the comments into an iframe
# (<iframe title="livere" scrolling="no"...>). All comments live inside that
# frame and are not part of the outer document, which is why
# div.reply-content cannot be found there. We have to switch into the iframe
# first. Ways to select a frame:
# driver.switch_to.frame(0)  # 1. by frame index; the first frame is 0
# driver.switch_to.frame("frame1")  # 2. by id
# driver.switch_to.frame("myframe")  # 3. by name
# driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))  # 4. by WebElement

driver.switch_to.frame(
    driver.find_element_by_css_selector("iframe[title='livere']"))

comments = driver.find_elements_by_css_selector('div.reply-content')
# Print the text of the <p> element inside each comment container.
for each in comments:
    content = each.find_element_by_tag_name('p')
    print(content.text)
'''
Example #4
0
'''

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Status flags; nothing in the visible part of this script reads them.
driver_status = "good"
test_status = "good"

########################## SETUP FIREFOX PROFILE ########################################

# Machine-specific path to an existing Firefox profile.
abrahamAzam = webdriver.FirefoxProfile(
    "C:/Users/jared/AppData/Roaming/Mozilla/Firefox/Profiles/4rv3vvf2.AbrahamAzam"
)

abrahamAzam.set_preference("webdriver_accept_untrusted_certs", True)
#driver = webdriver.Firefox(firefox_profile = abrahamAzam)
driver = webdriver.Firefox(abrahamAzam)
driver.get("https://test.dontracker.navy.mil")
# NOTE(review): exit() ends the script here, so everything below is
# unreachable -- presumably a leftover debugging stop; confirm before removing.
exit()

########################## LOG INTO DON TRACKER #########################################

# NOTE(review): the except/finally clause for this try is missing from this
# excerpt.
try:
    #driver = webdriver.Chrome()

    driver.get("https://test.dontracker.navy.mil")
    driver.find_element_by_id("button-1005-btnIconEl").click()
Example #5
0
def main():
    """Load one page in headless Firefox with a chosen DNS transport and emit its HAR.

    Command-line arguments:
        website: URL to load.
        dns_type: one of ``dns``, ``doh`` or ``dot``.
        trr_resolver_ip: resolver address. For ``dot`` it selects the Stubby
            configuration; for ``doh`` it is the TRR bootstrap address.
        trr_resolver_uri: resolver URI used for ``doh``.
        --timeout: seconds (counted from just before the page load) to wait
            for the ``har.json.ready`` marker file; default 30.

    If the marker file appears in time, the bytes of ``har.json`` are written
    to stdout; otherwise nothing is written. The browser is always shut
    down, even when installing the add-on or loading the page raises.
    """
    # Parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('website')
    parser.add_argument('dns_type', choices=['dns', 'doh', 'dot'])
    parser.add_argument('trr_resolver_ip')
    parser.add_argument('trr_resolver_uri')
    parser.add_argument('--timeout', type=int, default=30)
    args = parser.parse_args()

    # Enable devtools in Firefox
    options = Options()
    options.headless = True
    options.add_argument('-devtools')

    # Enable the netmonitor toolbox in devtools so we can save HARs
    profile = webdriver.FirefoxProfile()
    profile.set_preference('devtools.toolbox.selectedTool', 'netmonitor')

    # If we're running a DoT measurement, turn on Stubby with the config that
    # matches the resolver. Unknown resolver addresses start nothing.
    if args.dns_type == 'dot':
        stubby_config = {
            '1.1.1.1': 'stubby-cf.yml',
            '9.9.9.9': 'stubby-quad9.yml',
            '8.8.8.8': 'stubby-google.yml',
        }.get(args.trr_resolver_ip)
        if stubby_config is not None:
            subprocess.run(["sudo", "/usr/local/bin/bin/stubby", "-C", stubby_config, "-g"])
            subprocess.run(["sudo", "cp", "resolv.conf", "/etc/resolv.conf"])

    # Configure the DNS settings in Firefox
    if args.dns_type in ('dns', 'dot'):
        options.set_preference('network.trr.mode', 0)
    elif args.dns_type == 'doh':
        options.set_preference('network.trr.mode', 3)
        options.set_preference('network.trr.request-timeout', 1500)
        options.set_preference('network.trr.max-fails', 5)
        if args.trr_resolver_ip:
            options.set_preference('network.trr.bootstrapAddress', args.trr_resolver_ip)
        if args.trr_resolver_uri:
            options.set_preference('network.trr.uri', args.trr_resolver_uri)

    # Launch Firefox and install our extension for getting HARs
    driver = webdriver.Firefox(options=options,
                               firefox_profile=profile,
                               firefox_binary="/opt/firefox/firefox-bin")
    try:
        driver.install_addon("/home/seluser/measure/harexporttrigger-0.6.2-fx.xpi")
        driver.set_page_load_timeout(30)

        # Make a page load
        started = datetime.now()
        driver.get(args.website)

        # Once the HAR is on disk in the container, write it to stdout so the
        # host machine can get it. The ".ready" marker signals completion.
        har_file = "/home/seluser/measure/har.json"
        ready_marker = har_file + ".ready"

        while (datetime.now() - started).total_seconds() < args.timeout \
                and not os.path.exists(ready_marker):
            time.sleep(1)

        if os.path.exists(ready_marker):
            with open(har_file, 'rb') as f:
                sys.stdout.buffer.write(f.read())
    finally:
        # Always release the browser process, even if the page load failed.
        driver.quit()
def get_firefox_profile() -> webdriver.FirefoxProfile:
    """Return a new Firefox profile with image loading and the Flash plugin turned off."""
    firefox_profile: webdriver.FirefoxProfile = webdriver.FirefoxProfile()
    # 2 tells Firefox not to load images.
    firefox_profile.set_preference('permissions.default.image', 2)
    # This preference is a boolean. The string 'false' would be written out as
    # a string-typed value that does not match the preference's type, so pass
    # a real bool.
    firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
    return firefox_profile
Example #7
0
def symmapzong(herbs):
    """Scrape herb -> ingredient -> target tables from symmap.org.

    For every name in ``herbs`` a fresh Firefox session:

    1. searches the herb and opens the detail page of the first result,
    2. downloads the herb's ingredient table (``data.csv``) and renames it to
       ``A<herb name> symmap.csv`` in the download directory,
    3. for every ingredient row, opens that ingredient's detail page,
       downloads its target table and renames it to
       ``<herb name> <ingredient name> symmap.csv``,
    4. appends one ``$``-separated line per target row to ``symmapct.txt``
       (relative to the current working directory).

    The browser is closed after each herb, including when a step raises.

    NOTE(review): the downloaded file is renamed immediately after the
    download button is clicked; nothing waits for the download to finish.
    """
    download_dir = '/Users/huanjiaming/webscraping/'
    downloaded_csv = download_dir + 'data.csv'
    detail_url = 'http://www.symmap.org/detail/'

    for herb in herbs:
        # Profile that saves text/csv downloads to download_dir without asking.
        profile = webdriver.FirefoxProfile()
        profile.set_preference('browser.download.folderList', 2)
        profile.set_preference('browser.download.dir', download_dir)
        profile.set_preference('browser.download.manager.showWhenStarting', False)
        profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'text/csv')
        browsersymmap = webdriver.Firefox(firefox_profile=profile)

        try:
            # Search for the herb; the first <td> of the result holds its id.
            browsersymmap.get('http://www.symmap.org/search/')
            browsersymmap.find_element_by_id('herb_ipt').send_keys(herb)
            browsersymmap.find_element_by_id('herb_search').click()
            herb_id = browsersymmap.find_element_by_tag_name('td').text

            browsersymmap.get(detail_url + herb_id)
            sherbname = browsersymmap.find_element_by_xpath('/html/body/div/section/div/div/div/div[1]/table/tbody/tr[1]/td[2]').text
            print(sherbname)

            # Open the dropdown, pick its third entry, then download the table.
            browsersymmap.find_element_by_xpath('/html/body/div/section/div/div/div/div[3]/div[2]/form/div/div[1]/div/button/span[2]/span').click()
            browsersymmap.find_element_by_xpath('/html/body/div/section/div/div/div/div[3]/div[2]/form/div/div[1]/div/div/ul/li[3]/a').click()
            browsersymmap.find_element_by_id('dl-btn').click()
            herb_csv = download_dir + 'A' + sherbname + ' ' + 'symmap.csv'
            os.rename(downloaded_csv, herb_csv)

            # Read the file from where it was just moved to, so this works
            # regardless of the current working directory.
            with open(herb_csv) as herbfile:
                herbreader = csv.reader(herbfile)
                next(herbreader, None)  # skip the header row
                for row in herbreader:
                    compoundnum = row[0]
                    compoundname = row[1]
                    browsersymmap.get(detail_url + compoundnum)

                    # Open the dropdown, pick its fourth entry, then download.
                    browsersymmap.find_element_by_xpath('/html/body/div[1]/section/div/div/div/div[3]/div[2]/form/div/div[1]/div/button').click()
                    browsersymmap.find_element_by_xpath('/html/body/div[1]/section/div/div/div/div[3]/div[2]/form/div/div[1]/div/div/ul/li[4]/a').click()
                    browsersymmap.find_element_by_id('dl-btn').click()
                    target_csv = download_dir + sherbname + ' ' + compoundname + ' ' + 'symmap.csv'
                    os.rename(downloaded_csv, target_csv)

                    with open(target_csv) as targetfile:
                        targetreader = csv.reader(targetfile)
                        next(targetreader, None)  # skip the header row
                        records = [
                            '$'.join([herb, sherbname, 'symmap', compoundname,
                                      targetrow[1], targetrow[3], targetrow[6]]) + '\n'
                            for targetrow in targetreader
                        ]

                    # Append all rows for this ingredient in one go.
                    if records:
                        with open('symmapct.txt', 'a+') as symmapct:
                            symmapct.writelines(records)
        finally:
            # Release the browser even if a lookup or file operation failed.
            browsersymmap.quit()
Example #8
0
def get_historical_data(name):
    """Drive Firefox through Yahoo Finance to fetch historical data for a ticker.

    Opens the quote page for ``name``, clicks the "Historical Data" tab, sets
    the date range to 6/25/2012 - 6/25/2015, clicks the "Done" and "Apply"
    buttons, and finally navigates to the href of the download link. If no
    download link is found, the quote page URL is loaded again instead.

    Elements are matched by their exact ``class`` attribute strings, so the
    function is tied to one specific version of the page markup.

    Returns nothing. The browser is left open.
    """
    import os  # local import: only needed to expand "~" in the download dir

    #    url = "https://finance.yahoo.com/quote/AMZN?p=AMZN&.tsrc=fin-srch"
    url = "https://finance.yahoo.com/quote/" + name + "?p=" + name + "&.tsrc=fin-srch"

    # Build ONE profile, set the download preferences on it, and hand it to
    # the driver. Setting preferences on throwaway FirefoxProfile() objects
    # has no effect on the browser that is actually launched.
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.download.manager.showAlertOnComplete",
                           False)
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                           "text/csv")
    # folderList=2 makes Firefox use browser.download.dir, which must be a
    # real path: Firefox does not expand "~".
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.dir",
                           os.path.expanduser("~/Downloads"))
    driver = webdriver.Firefox(firefox_profile=profile,
                               executable_path="/usr/bin/geckodriver")

    # Exact class strings of the controls; whitespace is significant.
    date_input_class = "C(t) O(n):f Tsh($actionBlueTextShadow) Bd(n) Bgc(t) Fz(14px) Pos(r) T(-1px) Bd(n):f Bxsh(n):f Cur(p) W(190px)"
    done_button_class = " Bgc($c-fuji-blue-1-b) Bdrs(3px) Px(20px) Miw(100px) Whs(nw) Fz(s) Fw(500) C(white) Bgc($actionBlueHover):h Bd(0) D(ib) Cur(p) Td(n)  Py(9px) Miw(80px)! Fl(start)"
    apply_button_class = " Bgc($c-fuji-blue-1-b) Bdrs(3px) Px(20px) Miw(100px) Whs(nw) Fz(s) Fw(500) C(white) Bgc($actionBlueHover):h Bd(0) D(ib) Cur(p) Td(n)  Py(9px) Fl(end)"

    #    url = "http://finance.yahoo.com/quote/AMZN/history?p=AMZN"
    try:
        driver.get(url)
        print("Page is ready!")
    except TimeoutException:
        print("Loading took too much time!")
        print("Page loading is done")
    time.sleep(.5)
    print("Finding tag span Done")
    elm_lists = driver.find_elements_by_tag_name("span")

    for elm in elm_lists:
        try:
            if elm.text == "Historical Data":
                print("Found!!")
                print(elm.text)
                elm.click()
                time.sleep(2.5)
                # Poll until the page shows at least five <input> elements.
                # NOTE(review): this loop has no timeout and no sleep.
                len_of_input_elm = 0
                while len_of_input_elm < 5:
                    input_elm_lists = driver.find_elements_by_tag_name("input")
                    len_of_input_elm = len(input_elm_lists)
                print(len(input_elm_lists))
                for input_elm in input_elm_lists:
                    if input_elm.get_attribute("class") == date_input_class:
                        print("find right input tag")
                        print(input_elm.get_attribute("data-test"))
                        input_elm.click()
                        time.sleep(2.5)
                        date_input = driver.find_element_by_name("startDate")
                        print("Found startDate")
                        date_input.clear()
                        date_input.send_keys("6/25/2012")
                        date_input = driver.find_element_by_name("endDate")
                        print("Found endDate")
                        date_input.clear()
                        date_input.send_keys("6/25/2015")
                        break
                button_elm_lists = driver.find_elements_by_tag_name("button")
                print(len(button_elm_lists))
                for button_elm in button_elm_lists:
                    if button_elm.get_attribute("class") == done_button_class:
                        print("Found Done")
                        button_elm.click()
                        time.sleep(5.5)
                        break
        except Exception:
            # Best effort: after the tab click the remaining <span> handles
            # may no longer be valid. Skip them, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare except would.
            pass

    button_elm_lists = driver.find_elements_by_tag_name("button")
    for button_elm in button_elm_lists:
        if button_elm.get_attribute("class") == apply_button_class:
            print("Found Apply")
            button_elm.click()
            time.sleep(5.5)
            break

    a_elm_lists = driver.find_elements_by_tag_name("a")
    for a_elm in a_elm_lists:
        if a_elm.get_attribute("class") == "Fl(end) Mt(3px) Cur(p)":
            print("Found download")
            url = a_elm.get_attribute('href')
            print(url)
            break
    driver.get(url)
Example #9
0
def download(cfg):
    """Download a price file with Firefox and install it as ``filename_new``.

    Reads ``filename_new``, ``filename_old``, ``login``, ``password``,
    ``url_lk`` and ``url_file`` from the ``download`` section of ``cfg``
    (``cfg.get(section, key)``). The ``tmp`` directory under the current
    working directory is emptied, Firefox visits ``url_lk`` and then
    ``url_file``, and whatever new file appears in ``tmp`` is examined:

    * a spreadsheet/CSV file is copied to ``filename_new`` after the previous
      ``filename_new`` (if any) has been moved to ``filename_old``;
    * a ``.zip`` file is unpacked with ``unzip -oj`` and, if it yields
      exactly one file, that file is installed the same way.

    Returns True when a file was installed, False otherwise. Browser errors
    are logged at debug level and result in False, because no new file is
    then found.
    """
    import subprocess  # local import: used only for the zip branch

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.remote.remote_connection import LOGGER
    LOGGER.setLevel(logging.WARNING)

    retCode = False
    filename_new = cfg.get('download', 'filename_new')
    filename_old = cfg.get('download', 'filename_old')
    login = cfg.get('download', 'login')
    password = cfg.get('download', 'password')
    url_lk = cfg.get('download', 'url_lk')
    url_file = cfg.get('download', 'url_file')

    download_path = os.path.join(os.getcwd(), 'tmp')
    if not os.path.exists(download_path):
        os.mkdir(download_path)

    # Start from an empty download directory so any file found afterwards
    # must come from this run.
    for fName in os.listdir(download_path):
        os.remove(os.path.join(download_path, fName))
    dir_befo_download = set(os.listdir(download_path))

    if os.path.exists('geckodriver.log'): os.remove('geckodriver.log')
    driver = None
    try:
        ffprofile = webdriver.FirefoxProfile()
        ffprofile.set_preference("browser.download.dir", download_path)
        ffprofile.set_preference("browser.download.folderList", 2)
        # MIME types Firefox should save without showing a dialog.
        ffprofile.set_preference(
            "browser.helperApps.neverAsk.saveToDisk",
            ",application/octet-stream" + ",application/vnd.ms-excel" +
            ",application/vnd.msexcel" + ",application/x-excel" +
            ",application/x-msexcel" + ",application/zip" +
            ",application/xls" + ",application/vnd.ms-excel" +
            ",application/vnd.ms-excel.addin.macroenabled.12" +
            ",application/vnd.ms-excel.sheet.macroenabled.12" +
            ",application/vnd.ms-excel.template.macroenabled.12" +
            ",application/vnd.ms-excelsheet.binary.macroenabled.12" +
            ",application/vnd.ms-fontobject" + ",application/vnd.ms-htmlhelp" +
            ",application/vnd.ms-ims" + ",application/vnd.ms-lrm" +
            ",application/vnd.ms-officetheme" +
            ",application/vnd.ms-pki.seccat" + ",application/vnd.ms-pki.stl" +
            ",application/vnd.ms-word.document.macroenabled.12" +
            ",application/vnd.ms-word.template.macroenabed.12" +
            ",application/vnd.ms-works" + ",application/vnd.ms-wpl" +
            ",application/vnd.ms-xpsdocument" +
            ",application/vnd.openofficeorg.extension" +
            ",application/vnd.openxmformats-officedocument.wordprocessingml.document"
            +
            ",application/vnd.openxmlformats-officedocument.presentationml.presentation"
            +
            ",application/vnd.openxmlformats-officedocument.presentationml.slide"
            +
            ",application/vnd.openxmlformats-officedocument.presentationml.slideshw"
            +
            ",application/vnd.openxmlformats-officedocument.presentationml.template"
            +
            ",application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            +
            ",application/vnd.openxmlformats-officedocument.spreadsheetml.template"
            +
            ",application/vnd.openxmlformats-officedocument.wordprocessingml.template"
            + ",application/x-ms-application" + ",application/x-ms-wmd" +
            ",application/x-ms-wmz" + ",application/x-ms-xbap" +
            ",application/x-msaccess" + ",application/x-msbinder" +
            ",application/x-mscardfile" + ",application/x-msclip" +
            ",application/x-msdownload" + ",application/x-msmediaview" +
            ",application/x-msmetafile" + ",application/x-mspublisher" +
            ",application/x-msschedule" + ",application/x-msterminal" +
            ",application/x-mswrite" + ",application/xml" +
            ",application/xml-dtd" + ",application/xop+xml" +
            ",application/xslt+xml" + ",application/xspf+xml" +
            ",application/xv+xml" + ",application/excel")
        if os.name == 'posix':
            driver = webdriver.Firefox(
                ffprofile, executable_path=r'/usr/local/bin/geckodriver')
        else:
            # 'nt' and any other platform: rely on geckodriver being on PATH.
            driver = webdriver.Firefox(ffprofile)
        driver.implicitly_wait(10)

        driver.get(url_lk)
        time.sleep(2)
        driver.get(url_file)
        time.sleep(2)

    except Exception as e:
        log.debug('Exception: <' + str(e) + '>')
    finally:
        # Always shut the browser down, even when a page load failed.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                log.debug('Exception: <' + str(e) + '>')

    dir_afte_download = set(os.listdir(download_path))
    new_files = list(dir_afte_download.difference(dir_befo_download))
    print(new_files)
    if len(new_files) == 0:
        log.error('Не удалось скачать файл прайса ')
        retCode = False
    elif len(new_files) > 1:
        log.error('Скачалось несколько файлов. Надо разбираться ...')
        retCode = False
    else:
        new_file = new_files[0]  # exactly one file was downloaded
        new_ext = os.path.splitext(new_file)[-1].lower()
        DnewFile = os.path.join(download_path, new_file)
        new_file_date = os.path.getmtime(DnewFile)
        log.info(
            'Скачанный файл ' + new_file + ' имеет дату ' +
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(new_file_date)))

        print(new_ext)
        if new_ext in ('.xls', '.xlsx', '.xlsb', '.xlsm', '.csv'):
            _install_price_file(DnewFile, filename_new, filename_old)
            retCode = True

        elif new_ext == '.zip':
            # Legacy branch; the original author marked it as untested and
            # the archive handling as unfinished.
            log.debug('Zip-архив. Разархивируем.')
            dir_befo_download = set(os.listdir(download_path))
            # Pass the file name as a separate argument (no shell) because it
            # comes from the server; run in download_path instead of chdir so
            # the working directory is never left changed.
            try:
                subprocess.call(['unzip', '-oj', new_file], cwd=download_path)
            except OSError as e:
                log.debug('Exception: <' + str(e) + '>')
            os.remove(DnewFile)
            dir_afte_download = set(os.listdir(download_path))
            new_files = list(dir_afte_download.difference(dir_befo_download))
            if len(new_files) == 1:
                new_file = new_files[0]  # exactly one file was extracted
                new_ext = os.path.splitext(new_file)[-1]
                DnewFile = os.path.join(download_path, new_file)
                new_file_date = os.path.getmtime(DnewFile)
                log.debug('Файл из архива ' + DnewFile + ' имеет дату ' +
                          time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime(new_file_date)))
                _install_price_file(DnewFile, filename_new, filename_old)
                retCode = True

            elif len(new_files) > 1:
                log.debug('В архиве не единственный файл. Надо разбираться.')
                retCode = False
            else:
                log.debug(
                    'Нет новых файлов после разархивации. Загляни в папку юниттеста поставщика.'
                )
                retCode = False
    return retCode


def _install_price_file(source, filename_new, filename_old):
    """Move the current ``filename_new`` to ``filename_old``, then copy ``source`` in its place."""
    if os.path.exists(filename_new):
        # Remove the old backup first: os.rename does not overwrite an
        # existing target on every platform.
        if os.path.exists(filename_old):
            os.remove(filename_old)
        os.rename(filename_new, filename_old)
    shutil.copy2(source, filename_new)
Example #10
0
    def picture_screenshot_html(self, keyword, ckurl, searchDevice, spidertype,
                                searchPage, returnType):
        """Search Baidu for ``keyword`` and capture ranking data for ``ckurl``.

        ``searchDevice == 1`` uses the desktop site; any other value uses the
        mobile site with a mobile user agent. Up to ``searchPage`` result
        pages (never past page 5) are scanned while ``returnType > 0``; the
        scan stops at the first page on which the rank helper reports a hit.

        Returns a JSON string built from the picture data (with ``html`` and
        ``page`` added) when a hit was found, ``-2`` when none was found, and
        ``None`` when an exception occurred (the exception is printed).
        """
        starttime = datetime.datetime.now()
        picturedata = None
        # Bound up front so the except handler can tell whether a browser was
        # ever started.
        browser = None
        try:
            if int(searchDevice) == 1:
                browser = webdriver.Firefox(
                    executable_path=sxconfig.geckodriverPath)
                browser.set_page_load_timeout(sxconfig.page_load_timeout)
                browser.set_script_timeout(sxconfig.script_timeout)

                browser.maximize_window()
                browser.get(sxconfig.baiduPcUrl)  # Load page
                browser.find_element_by_id('kw').clear()  # clear the search box
                browser.find_element_by_id('kw').send_keys(u'' +
                                                           keyword)  # type the keyword
                browser.find_element_by_id('su').click()  # click the search button
                browser.find_element_by_id('su').submit()  # submit the form

                # browser.find_element_by_name("")

                self.util.fullloaded(browser)

                for currentpage in xrange(1, searchPage + 1):
                    # print currentpage
                    # print browser.current_url
                    jsClientWidth = '''return document.body.clientWidth'''
                    tatalWidth = browser.execute_script(jsClientWidth)
                    # print tatalWidth
                    jsScrollHeight = '''return  document.body.parentNode.scrollHeight'''
                    tatalHeight = browser.execute_script(jsScrollHeight)
                    # print tatalHeight

                    html_source = browser.page_source  # page HTML
                    if int(returnType) > 0:
                        # Extract the rank list from the page HTML
                        rankitem = self.baiduPc.getRankListByHtmlPc(
                            html_source, ckurl, spidertype)

                        ranklist = rankitem['rankList']
                        nextPageUrl = rankitem['nextPageUrl']

                        if len(ranklist) > 0:
                            picturedata = self.baiduPc.getPictureAndScreenPc(
                                browser, ranklist, tatalWidth, tatalHeight,
                                returnType)
                            break
                        else:
                            if currentpage == 5:
                                break
                            if nextPageUrl is None:
                                break
                            browser.get(sxconfig.baiduPcUrl + nextPageUrl)
                            self.util.fullloaded(browser)
                    else:
                        break
            else:
                # mobile
                firefoxProfile = webdriver.FirefoxProfile()
                # Override the user agent
                firefoxProfile.set_preference("general.useragent.override",
                                              sxconfig.mobileUserAgent)
                browser = webdriver.Firefox(
                    firefox_profile=firefoxProfile,
                    executable_path=sxconfig.geckodriverPath)

                browser.set_page_load_timeout(sxconfig.page_load_timeout)
                browser.set_script_timeout(sxconfig.script_timeout)

                browser.set_window_size(sxconfig.baiduMobileWidth,
                                        sxconfig.baiduMobileHeight)
                browser.get(sxconfig.baiduMobileUrl)  # Load page
                browser.find_element_by_id('index-kw').clear()  # clear the search box
                browser.find_element_by_id('index-kw').send_keys(
                    u'' + keyword)  # type the keyword
                browser.find_element_by_id('index-bn').click()  # click the search button

                self.util.fullloaded(browser)
                for currentpage in xrange(1, searchPage + 1):
                    html_source = browser.page_source  # page HTML
                    if int(returnType) > 0:
                        # Extract the rank list from the page HTML
                        rankitem = self.baiduMobile.getRankListByHtmlMobile(
                            html_source, ckurl, spidertype)
                        ranklist = rankitem['rankList']
                        nextPageUrl = rankitem['nextPageUrl']
                        if len(ranklist) > 0:
                            picturedata = self.baiduMobile.getPictureAndScreenMobile(
                                browser, ranklist, sxconfig.baiduMobileWidth,
                                sxconfig.baiduMobileHeight, returnType)
                            break
                        else:
                            if currentpage == 5:
                                break
                            if nextPageUrl is None:
                                break
                            browser.get(nextPageUrl)
                            self.util.fullloaded(browser)
                    else:
                        break
            browser.delete_all_cookies()
            browser.close()
            # browser.quit()

            endtime = datetime.datetime.now()
            print((endtime - starttime).seconds)
            if picturedata:
                picturedata["html"] = html_source
                picturedata["page"] = currentpage
                return json.dumps(picturedata)
            else:
                return -2
        except Exception as e:
            print(e)
            # Only close a browser that was actually started; otherwise the
            # original error would be replaced by a NameError.
            if browser is not None:
                browser.close()
Example #11
0
    def InitDriver(self):
        """Replace ``self.driver`` with a new browser chosen by ``my_browser``.

        An existing driver is quit first; a failure there is only logged.
        Supported values of ``my_browser`` are ``firefox``, ``chrome``, ``ie``
        and ``phantomjs``; anything else falls back to a default Firefox.

        Returns True when the driver was created and its timeouts were set,
        False when any step of that raised (the error is logged).
        """
        # Quit the previous driver, if there is one, before creating a new one.
        try:
            if self.driver:
                self.driver.quit()
        except Exception as e:
            log_error("释放浏览器资源失败")
            log_error(e)

        # Create the browser object.
        try:
            browser_name = my_browser
            if browser_name == "firefox":
                on_linux = my_sys_platform == "Linux"

                ff_options = webdriver.FirefoxOptions()
                if on_linux:
                    # Alternative: ff_options.add_argument('-headless')
                    ff_options.set_headless()
                ff_options.add_argument('--disable-gpu')  # disable GPU acceleration

                ff_profile = webdriver.FirefoxProfile()
                random_agent = get_header()  # random user agent
                my_log.logger.info("get random user_agent:%s" % random_agent)
                ff_profile.set_preference("general.useragent.override",
                                          random_agent)
                ff_profile.update_preferences()

                # On Linux the geckodriver next to the script is used.
                driver_kwargs = {
                    "firefox_profile": ff_profile,
                    "firefox_options": ff_options,
                }
                if on_linux:
                    driver_kwargs["executable_path"] = "./geckodriver"
                self.driver = webdriver.Firefox(**driver_kwargs)
            elif browser_name == "chrome":
                self.driver = webdriver.Chrome()
            elif browser_name == "ie":
                self.driver = webdriver.Ie()
            elif browser_name == "phantomjs":
                from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
                capabilities = dict(DesiredCapabilities.PHANTOMJS)
                capabilities['phantomjs.page.settings.userAgent'] = (
                    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
                )
                self.driver = webdriver.PhantomJS(
                    executable_path=r"./phantomjs",
                    desired_capabilities=capabilities,
                    service_args=['--ignore-ssl-errors=true'])
            else:
                self.driver = webdriver.Firefox()

            # Request timeouts, in seconds.
            self.driver.set_page_load_timeout(5)
            self.driver.set_script_timeout(5)
            log_info("打开浏览器成功Open browser successfully")
            return True
        except Exception as e:
            log_error("打开浏览器异常Failed to open browser")
            log_error(e)
            return False
def get_result_type1(url, result_db, path):
    """Submit every credential row to the result page so its PDF lands in ``path``.

    A headless Firefox is configured to save ``application/pdf`` responses
    into ``path``. For each row of ``result_db`` the login form is filled in,
    submitted and cleared again for the next row. Two page layouts are
    handled: one with a ``regnum`` field and a ``sub`` button, and one with a
    ``regno`` field (looked up by name, then by id) and a ``but`` button.
    Progress is reported through the module-level ``textBox`` widget.

    Args:
        url: Address of the page that hosts the login form.
        result_db: Sequence of rows; ``row[0]`` is typed into the registration
            field and ``row[1]`` into the ``dob`` field.
        path: Directory Firefox should download into.
    """
    options = Options()
    options.headless = True
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.download.dir", path)
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                           "application/pdf")
    profile.set_preference("pdfjs.disabled", True)
    driver = webdriver.Firefox(firefox_profile=profile,
                               options=options,
                               executable_path='../driver/geckodriver.exe')
    try:
        driver.get(url)
        for no, row in enumerate(result_db):
            root.update()
            id_num = row[0]
            pwd = row[1]
            test = True
            try:
                # find_elements_* returns an empty list when nothing matches,
                # whereas find_element_* raises; only the former is usable as
                # a presence check.
                regnum_fields = driver.find_elements_by_id("regnum")
                if regnum_fields:
                    reg_field = regnum_fields[0]
                    reg_field.send_keys(id_num)
                    dob_fields = driver.find_elements_by_id("dob")
                    if dob_fields:
                        dob_fields[0].send_keys(pwd)
                    driver.find_element_by_name('sub').click()
                    # Reset the form so the next row starts from empty fields.
                    reg_field.clear()
                    if dob_fields:
                        dob_fields[0].clear()
                else:
                    regno_fields = (driver.find_elements_by_name("regno")
                                    or driver.find_elements_by_id("regno"))
                    if regno_fields:
                        reg_field = regno_fields[0]
                        reg_field.send_keys(id_num)
                        dob_fields = driver.find_elements_by_id("dob")
                        if dob_fields:
                            dob_fields[0].send_keys(pwd)
                        buttons = driver.find_elements_by_name('but')
                        if buttons:
                            buttons[0].click()
                        else:
                            driver.find_element_by_xpath(
                                '/html/body/form/table/tbody/tr[5]/td/input'
                            ).click()
                        if dob_fields:
                            dob_fields[0].clear()
                        reg_field.clear()
                    else:
                        # Neither known layout is present: nothing was
                        # submitted, so do not report a download.
                        test = False
                        textBox.insert(tk.END,
                                       "Something missing Contact dev\n")
                        textBox.see(tk.END)
            except TimeoutException:
                textBox.insert(tk.END, "timeout retrying...\n")
                textBox.see(tk.END)
                individual(url, id_num, pwd, path)
            except Exception as exception:
                test = False
                log(exception, reg=id_num, value=pwd)
                textBox.insert(tk.END, "Issue generated,check log file \n")
                textBox.see(tk.END)

            val = no + 1
            if test:
                textBox.insert(tk.END, '{} file downloaded \n'.format(val))
            else:
                textBox.insert(tk.END,
                               ' {} file not downloaded \n'.format(val))
            textBox.see(tk.END)
    finally:
        # Always end the browser session, even if the loop blew up.
        driver.quit()
    if platform.system() == "Windows":
        # Force-kill leftover Firefox processes that quit() did not reap.
        os.system("Taskkill /IM firefox.exe /F")
Example #13
0
from urllib.request import urlopen
from bs4 import BeautifulSoup
from time import sleep
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.utils import keys_to_typing
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from unicodedata import normalize
import re
import random
from unicodedata import normalize

# Start from an existing on-disk Firefox profile and set its download
# preferences: the download directory and the MIME type registered under
# "neverAsk.saveToDisk" (application/zip).
profile = webdriver.FirefoxProfile('/home/dgc7/.mozilla/firefox/7aebrp31.dd7')
profile.set_preference("browser.download.panel.shown", False)
profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                       "application/zip")
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.dir',
                       '/home/dgc7/zlibros/libros1920-1921')

# NOTE(review): this path has no leading "/", so it is relative to the current
# working directory, unlike the absolute paths used elsewhere in this file.
# Confirm that this is intended.
dirNombre = 'home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos'


class crearCorros:
    """Store the three site URLs and the path of the names file."""

    def __init__(self):
        # Order matters: callers index this tuple (index 2 is printed below).
        self.urlProtocoe = (
            'http://3g2upl4pq6kufc4m.onion',
            'https://mail.protonmail.com/create/new',
            'https://singlelogin.org/registration.php',
        )
        registration_url = self.urlProtocoe[2]
        print(registration_url)
        self.dirNombre = (
            '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/nombre.txt'
        )
Example #14
0
    def findBookUrl(self):
        """Log in at ``self.baseUrl``, list the account's e-books and claim the free one.

        Steps, in order:
          1. Start Firefox from a fixed binary path with a profile that saves
             downloads into the current directory.
          2. Open ``self.baseUrl``, open the login popup and submit the form.
          3. Load ``account/my-ebooks`` and build ``bookList`` from every
             ``div.product-line`` element via ``self.createBookDetail``.
          4. Load the free-learning offer page and click the claim input
             unless the offered title is already contained in a listed
             book's ``bookName``.

        Returns nothing. Every failure is caught and printed. The driver is
        never closed inside this method.
        """
        directory_name = '.'
        binary = FirefoxBinary('/docs/python_projects/firefox/firefox')

        fp = webdriver.FirefoxProfile()

        # Download-related preferences: target directory plus the MIME types
        # registered under "neverAsk.saveToDisk".
        fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', False)
        fp.set_preference('browser.download.manager.focusWhenStarting', False)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", False)
        fp.set_preference("browser.download.manager.useWindow", False)
        #             fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
        fp.set_preference(
            "browser.helperApps.neverAsk.saveToDisk",
            "application/octet-stream,application/xml,application/pdf,text/plain,text/xml,image/jpeg,text/csv,application/zip,application/x-rar-compressed"
        )
        fp.set_preference("browser.helperApps.alwaysAsk.force", False)
        fp.set_preference("browser.popups.showPopupBlocker", False)
        fp.update_preferences()
        driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary)
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(self.baseUrl)
        # Open the login popup. This lookup is outside the try below, so a
        # missing element propagates to the caller.
        efd_link = driver.find_element_by_css_selector(
            ".login-popup > div:nth-child(1)")
        efd_link.click()
        try:
            emailEl = driver.find_element_by_css_selector(
                '#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > input:nth-child(1)'
            )
            #             emailEl = driver.find_element_by_name("email")
            '''
            Login with user credential
            '''
            emailEl.send_keys('*****@*****.**')
            passwordEl = driver.find_element_by_css_selector(
                "#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > input:nth-child(1)"
            )
            passwordEl.send_keys('default')
            loginEl = driver.find_element_by_css_selector(
                "#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(3) > input:nth-child(1)"
            )
            loginEl.click()

            # Always-true guard: the block below runs unconditionally and is
            # what defines bookList for the free-learning check further down.
            if True:
                '''
                clicking on My Account
                '''
                myAccountEl = driver.find_element_by_css_selector(
                    '#account-bar-logged-in > a:nth-child(1) > div:nth-child(1) > strong:nth-child(1)'
                )
                myAccountEl.click()
                '''
                clicking My ebooks
                '''
                # The result of driver.get() is stored in myEbook but never
                # used afterwards.
                myEbook = driver.get(self.baseUrl + 'account/my-ebooks')
                productListEls = driver.find_elements_by_css_selector(
                    'div.product-line')
                print len(productListEls)
                bookList = list()
                for productEl in productListEls:
                    print productEl

                    # A failure on one product is printed and skipped so the
                    # remaining products are still processed.
                    try:
                        bookName = productEl.find_element_by_css_selector(
                            '.title').text
                        book = self.createBookDetail(bookName)
                        productEl.click()
                        # readMeEl is looked up but not used below (its click
                        # is commented out).
                        readMeEl = productEl.find_element_by_css_selector(
                            '.fake-button-text')
                        print 'new page',
                        isbnEl = productEl.find_elements_by_css_selector(
                            'div > div:nth-child(2) > div:nth-child(1)> a:nth-child(1) > div:nth-child(1)'
                        )
                        # An empty isbnEl raises IndexError here, which the
                        # except below prints; the book is then not appended.
                        book.isbn_13 = isbnEl[0].get_attribute('isbn')
                        #                     readMeEl.click()
                        print 'div.product-line:nth-child(1) > div:nth-child(2) > div:nth-child(1) > a:nth-child(1) > div:nth-child(1)',
                        #                     readMeEl.find_element_by_css_selector('h2.ng-binding')
                        #
                        #                     readingEl = driver.get('https://www.packtpub.com/mapt/book/All%20Books/' + book.isbn_13)
                        #                     bookName1=driver.find_elements_by_css_selector('h2.ng-binding')[0].text

                        bookList.append(book)
                    except Exception as e:
                        print e
#                 product_account_list_el=driver.find_elements_by_css_selector('#product-account-list')

            driver.get('https://www.packtpub.com/packt/offers/free-learning')
            try:
                '''
                clicking on Claim your free ebook
                '''
                bookNameEl_1 = driver.find_element_by_css_selector(
                    '.dotd-title > h2:nth-child(1)')
                isBookAlreadyAvailable = False
                bookName_1 = bookNameEl_1.text
                # Substring match: the offered title counts as owned when it
                # appears inside any listed book's name.
                for book in bookList:
                    if bookName_1 in book.bookName:
                        isBookAlreadyAvailable = True
                        break

                if not isBookAlreadyAvailable:
                    claimFreeEbookEl = driver.find_element_by_css_selector(
                        '.book-claim-token-inner > input:nth-child(3)')
                    claimFreeEbookEl.click()
            except Exception as e:
                print e


#             myEbook.click()

        except Exception as e:
            print e
        finally:
            print 'completed'
        print 'hi'
Example #15
0
    def spider(self,url, time=time):
        """Yield the page source of ``url`` five times, clicking "next" after each yield.

        The whole crawl is retried with a fresh browser until one attempt
        finishes without an exception. The browser is shut down in a
        ``finally`` block, so it is also released when the consumer abandons
        the generator part-way through.

        Args:
            url: Page to open in Firefox.
            time: Object providing ``sleep``; defaults to the ``time`` module.

        Yields:
            ``driver.page_source`` of the currently displayed page.
        """
        # Loop flag: set to 1 only after an attempt completes without error.
        num = 0
        while True:
            # Fallback proxy, used when self.get_ip() gives nothing parseable.
            ip = '61.161.46.179'
            port = 8118
            proxies = self.get_ip()
            if proxies:
                items = re.findall('(.*?):(.*)', proxies)
                # Guard: a proxy string without ":" yields no match.
                if items:
                    ip, port = items[0]

            # NOTE(review): the options and proxy profile are built but not
            # handed to the driver, because the launch that used them was
            # disabled in favour of a plain webdriver.Firefox(). They are kept
            # so re-enabling the proxy stays a one-line change; confirm intent.
            firefox_options = webdriver.FirefoxOptions()
            ff_profile = webdriver.FirefoxProfile()
            ff_profile.set_preference("network.proxy.type", 1)
            ff_profile.set_preference("network.proxy.http", ip)
            ff_profile.set_preference("network.proxy.http_port", int(port))
            ff_profile.set_preference("network.proxy.ssl", ip)
            ff_profile.set_preference("network.proxy.ssl_port", int(port))
            ff_profile.set_preference("network.proxy.ftp", ip)
            ff_profile.set_preference("network.proxy.ftp_port", int(port))
            ff_profile.set_preference("general.useragent.override",
                                      "Mozilla/5.0 (Windows NT 6.1; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
            ff_profile.update_preferences()
            # driver = webdriver.Firefox(firefox_options=firefox_options, firefox_profile=ff_profile)
            driver = webdriver.Firefox()
            wait = WebDriverWait(driver, 15)
            try:
                driver.maximize_window()
                driver.get(url)
                time.sleep(5)
                # Block until the pager's "next" element is clickable; a
                # timeout raises and triggers a retry with a new browser.
                wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#searchHotelPanel > div.b_tool.clr_after > div.pager.js_pager > div > ul > li.item.next > a > span:nth-of-type(1)')))
                for _ in range(5):
                    yield driver.page_source
                    # Dismiss the banner popup when self.exit reports it.
                    # The "== True" comparison is kept verbatim because the
                    # return type of self.exit is not visible here.
                    popup_present = self.exit(driver,'//*[@id="tyc_banner_close"]')
                    if popup_present == True:
                        print('有小弹窗')
                        driver.find_element_by_xpath('//*[@id="tyc_banner_close"]').click()
                    else:
                        print('无弹窗')
                    time.sleep(2)
                    # Go to the next result page.
                    driver.find_element_by_css_selector('html body.font-bb49248c div#web-content.mt74 div.container.pt25 div.container-left div.search-block div.result-footer div ul.pagination li a.num.-next').click()
                    time.sleep(5)
                num += 1
            except Exception as error:
                print('出错了:',error)
            finally:
                # quit() ends the whole session. It also runs on
                # GeneratorExit, which "except Exception" does not catch.
                driver.quit()
            # num is still 0 after a failed attempt, so the loop retries;
            # a clean run sets it to 1 and ends the loop.
            if num == 1:
                print('正常退出浏览器')
                break
            print('非正常退出浏览器,继续访问')
def main(ip_of_wp, port_of_wp, admin_pwd):
    """Provision a WordPress instance with plugins and fake data, then export its URLs.

    Two headless Firefox sessions are logged in as admin. The first installs
    the FakerPress, "Export All URLs" and "Application Passwords" plugins,
    generates fake users/terms/posts/comments and triggers the URL export
    (downloaded into ``./wp_csv_loc``). The second creates a new application
    password. The exported CSV is copied to ``../wordpress_users.csv`` and
    ``../wordpress_setup/wordpress_users.csv``, the password is written to
    ``../wordpress_setup/wordpress_api_pwd.txt`` and
    ``../wordpress_setup/failures_list.txt`` is truncated.

    Args:
        ip_of_wp: Host/IP of the WordPress site (string).
        port_of_wp: Port of the WordPress site (string; it is concatenated).
        admin_pwd: Admin password passed to ``admin_login``.

    Returns:
        The newly created application password.

    Raises:
        OSError: If the download directory cannot be created or cleaned.
        IndexError: If the export produced no file in ``./wp_csv_loc``.
    """
    #os.setuid(pwd.getpwnam(username).pw_uid) # ncessary b/c cannot run selenium as root

    # The helper functions in this module read these module-level names.
    global driver
    global driver_two
    global min_num

    csv_dir = './wp_csv_loc'
    try:
        os.makedirs(csv_dir)
    except OSError as e:
        print(e)
        # Only "already exists" is expected; anything else is a real failure.
        if e.errno != errno.EEXIST:
            raise

    # Empty the directory so the exported file is the only one found later
    # (taken from: https://stackoverflow.com/questions/185936/how-to-delete-the-contents-of-a-folder-in-python)
    for file_name in os.listdir(csv_dir):
        file_path = os.path.join(csv_dir, file_name)
        if not os.path.isfile(file_path):
            continue
        try:
            os.unlink(file_path)
        except OSError as e:
            print(e)
            # A file that vanished in the meantime is fine.
            if e.errno != errno.ENOENT:
                raise

    options = Options()
    options.headless = True
    #options.add_argument('--no-sandbox')

    # from: https://selenium-python.readthedocs.io/faq.html
    fp = webdriver.FirefoxProfile()
    fp.set_preference("browser.download.folderList", 2)
    fp.set_preference("browser.download.manager.showWhenStarting", False)
    fp.set_preference("browser.download.dir", os.getcwd() + '/wp_csv_loc/')
    # One comma-separated value: a second set_preference call on the same
    # key would replace the first instead of adding to it.
    fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                      "application/octet-stream,text/csv")
    fp.accept_untrusted_certs = True

    driver = webdriver.Firefox(fp, options=options)
    driver_two = webdriver.Firefox(fp, options=options)
    admin_login(admin_pwd, driver)
    admin_login(admin_pwd, driver_two)
    time.sleep(5)

    base_url = 'https://' + ip_of_wp + ':' + port_of_wp

    # okay, first install fakerpress
    page_about_fakerpress = base_url + '/wp-admin/plugin-install.php?tab=plugin-information&plugin=fakerpress&TB_iframe=true&height=-34%22&width=772'
    driver.get(page_about_fakerpress)
    time.sleep(5)
    install_pluggin()
    time.sleep(10)

    # then install "Export All URLs"
    page_about_export_all_urls = base_url + '/wp-admin/plugin-install.php?tab=plugin-information&plugin=export-all-urls&TB_iframe=true&width=772&height=627'
    driver.get(page_about_export_all_urls)
    install_pluggin()
    time.sleep(10)

    # then install "app_pass" (same https scheme as every other admin URL)
    page_about_app_pass = base_url + '/wp-admin/plugin-install.php?tab=plugin-information&plugin=application-passwords&TB_iframe=true&width=772&height=627'
    driver.get(page_about_app_pass)
    install_pluggin()

    # now generate the fake data using fakerpress
    min_num = 'fakerpress-field-qty-min'

    user_page = base_url + '/wp-admin/admin.php?page=fakerpress&view=users'
    driver.get(user_page)
    user_page_code()
    time.sleep(170)

    terms_page = base_url + '/wp-admin/admin.php?page=fakerpress&view=terms'
    driver.get(terms_page)
    terms_page_code()
    time.sleep(60)

    post_page = base_url + '/wp-admin/admin.php?page=fakerpress&view=posts'
    driver.get(post_page)
    posts_page_code()
    time.sleep(300)

    comments_page = base_url + '/wp-admin/admin.php?page=fakerpress&view=comments'
    driver.get(comments_page)
    comments_page_code()
    time.sleep(300)

    export_all_urls_page = base_url + '/wp-admin/options-general.php?page=extract-all-urls-settings'
    driver.get(export_all_urls_page)
    # The export runs in a background thread on the first driver while the
    # second driver creates the application password. The thread is never
    # joined; the sleeps below are the only synchronisation before close().
    thread = threading.Thread(target=export_urls_code, args=(driver, ))
    thread.start()

    print("will now sleep for 15")
    time.sleep(15)
    print("done sleeping")
    new_pdw = make_new_application_passwd(driver_two)
    time.sleep(35)

    # todo: first finish loading wp (via fakerpress) -- okay, I think this might be done (just gotta test it...)
    # then modify so that passwd is a cmd line arg -- okay, I think this might be done (just gotta test it...)
    # then make it run on cloudlab <----- start from here
    # then modify so called before run_experiment (should be easy...)
    # and needs to modify wordpress_background to take the passwd from this function as a cmdline argument...
    # might need to write to a file or something... (b/c gets wierd with python scripting)
    # might need to get tricky but shouldn't be too bad either...

    driver.close()
    driver_two.close()

    # Locate the exported CSV: the directory was emptied above, so the
    # first entry is the export.
    folders_in_csv_path = os.listdir(csv_dir)
    print("folders_in_csv_path %s" % (folders_in_csv_path,))
    if not folders_in_csv_path:
        raise IndexError("no exported CSV file found in ./wp_csv_loc")
    path_to_csv_file = './wp_csv_loc/' + folders_in_csv_path[0]

    for destination in ("../" + "wordpress_users.csv",
                        "../wordpress_setup/" + "wordpress_users.csv"):
        # Remove any stale copy first; a missing file is not an error.
        try:
            os.remove(destination)
        except OSError:
            pass
        shutil.copy(path_to_csv_file, destination)

    with open('../wordpress_setup/wordpress_api_pwd.txt', 'w') as f:
        f.write(new_pdw)

    # Truncate the failures list for the next run.
    with open('../wordpress_setup/failures_list.txt', 'w') as f:
        f.write('')

    return new_pdw  # , path_to_csv_file
Example #17
0
    def start_browser(self):
        """Create a proxy, build a download-friendly Firefox profile and launch the driver.

        If the first ``create_proxy`` call fails, the method waits on
        ``self.barrier`` (up to ``3 * self.timeout``) and tries once more; a
        broken barrier terminates the process with exit status 1. The
        resulting proxy, profile and driver are stored on ``self``.
        """
        browser_tag = "Browser " + str(self.id)

        try:
            self.proxy = self.server.create_proxy()
        except Exception as e:
            print(browser_tag + ": Proxy server is offline: ", e)

            try:
                self.barrier.wait(3 * self.timeout)
            except BrokenBarrierError:
                print(browser_tag + ": Timed out waiting for a browser", e)
                exit(1)

            # Second and final attempt after the barrier released us.
            self.proxy = self.server.create_proxy()

        # Same five-unit limit for every proxy timeout category.
        self.proxy.timeouts = dict.fromkeys(
            ('request', 'read', 'connection', 'dns'), 5)

        profile = webdriver.FirefoxProfile()
        self.profile = profile

        # Download files into the temporary directory.
        profile.set_preference("browser.download.folderList", 2)
        profile.set_preference("browser.download.dir", self.temp_dir)

        # MIME types to save to disk without asking which application to use.
        silent_mime_types = ",".join([
            "application/x-msexcel",
            "application/excel",
            "application/x-excel",
            "application/vnd.ms-excel",
            "application/pdf",
            "application/msword",
            "application/xml",
            "application/octet-stream",
            "image/png",
            "image/jpeg",
            "text/html",
            "text/plain",
            "text/csv",
        ])
        profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                               silent_mime_types)

        # Download Manager switches, then the "always ask" override; all are
        # set to False.
        for pref_name in (
                "browser.download.manager.showWhenStarting",
                "browser.download.manager.focusWhenStarting",
                "browser.download.manager.useWindow",
                "browser.download.manager.showAlertOnComplete",
                "browser.download.manager.closeWhenDone",
                "browser.helperApps.alwaysAsk.force",
        ):
            profile.set_preference(pref_name, False)

        profile.set_proxy(self.proxy.selenium_proxy())

        self.driver = webdriver.Firefox(firefox_profile=profile)

        self.driver.set_page_load_timeout(self.timeout)
# -*- coding: utf-8 -*-
from selenium import webdriver
import urllib2
import os
import time
# Launch Firefox from a local profile with the Adblock Plus extension added.
adblockfile = 'c:/Users/julio/Downloads/adblock_plus-2.6.11-sm+tb+fx+an.xpi'
# NOTE(review): this path ends at the "Profiles" folder rather than at one
# named profile inside it. Confirm this is the directory FirefoxProfile expects.
ffprofile = webdriver.FirefoxProfile("C:/Users/julio/AppData/Local/Mozilla/Firefox/Profiles")
ffprofile.add_extension(adblockfile)
driver = webdriver.Firefox(ffprofile)

# Series index page, and the site root.
mainpage = "http://en.dm5.com/manhua-yiquanchaoren/"
base = "http://en.dm5.com"
driver.get(mainpage)

#%%
# Collect the href of every chapter anchor in the "cbc_3" list.
chapters = driver.find_elements_by_xpath("//ul[@id='cbc_3']/li/a")
chapter_links = []
for chapter in chapters:
    chapter_links.append(chapter.get_attribute('href'))

#len(chapter_links)
#chapter_links = chapter_links[0:2]
#%%
for chapter_link in chapter_links:
    driver.get(chapter_link)
    driver.execute_script( "window.onbeforeunload = function(e){};" ) # turn off all js
    title = driver.find_element_by_xpath("//div[@class='view_bt']/h1").text
    chapter_id = title.replace(u"一拳超人","").replace(u"原作版","").replace(u"话","")
    pages = driver.find_elements_by_xpath("//select[@id='pagelist']/option")
    option_value = pages[0].get_attribute('value').replace("-p1/","")
    for x in range(1, len(pages) + 1):
Example #19
0
def get_profile():
    """Return a new Firefox profile with ``browser.privatebrowsing.autostart`` enabled."""
    private_profile = webdriver.FirefoxProfile()
    private_profile.set_preference(
        "browser.privatebrowsing.autostart",
        True,
    )
    # Write the preference out before handing the profile to the caller.
    private_profile.update_preferences()
    return private_profile
Example #20
0
    #    '/usr/bin/google-chrome')
    driver_arguments['executable_path'] = chromedriver

    # Travis-CI uses OpenVZ containers which are incompatible with the sandbox
    # technology.
    # See https://code.google.com/p/chromium/issues/detail?id=31077 for more
    # information.
    if 'TRAVIS' in os.environ:
        driver_arguments['chrome_options'].add_argument('--no-sandbox')
        driver_arguments['chrome_options'].add_argument(
            '--disable-setuid-sandbox')
        driver_arguments['chrome_options'].add_argument(
            '--allow-sandbox-debugging')

elif args.browser == "Firefox":
    driver_arguments['firefox_profile'] = webdriver.FirefoxProfile()
    # Firefox will often pop-up a dialog saying "script is taking too long" or
    # similar. So we can notice this problem we use "accept" rather then the
    # default "dismiss".
    webdriver.DesiredCapabilities.FIREFOX[
        "unexpectedAlertBehaviour"] = "accept"

elif args.browser == "PhantomJS":
    driver_arguments['executable_path'] = phantomjs
    driver_arguments['service_args'] = ['--remote-debugger-port=9000']

elif args.browser == "Remote":
    driver_arguments['command_executor'] = args.remote_executor

    for arg in args.remote_caps:
        if not arg.strip():
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
import psycopg2
from sqlalchemy import create_engine
from collections import defaultdict
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pickle
import matplotlib.pyplot as plt

# Grab the list of US states from the web.
# A headless Firefox with private browsing enabled loads the page, and
# BeautifulSoup then parses the rendered HTML.
options = Options()
options.set_headless(True)
firefox_profile = webdriver.FirefoxProfile()
firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
browser = webdriver.Firefox(options=options,
                            firefox_profile=firefox_profile,
                            executable_path='/usr/local/bin/geckodriver')
url = 'https://alphabetizer.flap.tv/lists/list-of-states-in-alphabetical-order.php'
browser.get(url)
page_content = BeautifulSoup(browser.page_source, 'html.parser')
# Every <li> on the page is treated as a state name.
# NOTE(review): this also picks up any non-state list items; verify against
# the page markup.
scrape_results = page_content.findAll('li')
states = []
for res in scrape_results:
    # Replace spaces with hyphens, e.g. "New York" -> "New-York".
    states.append(res.text.replace(' ', '-'))
# Washington DC is added by hand.
states.append('washington-dc')
# NOTE(review): `browser` is not closed anywhere in this part of the script.

# This is for data import and SQL export; it does not need to be run again.
exporter = DatabaseExport('az_trail_recommender')
Example #22
0
    def crawler(self):
        url = "https://www.instagram.com/explore/tags/무신사/"
        # 포스트 내 컨텐츠 담을 리스트 선언
        tagList = [] 
        # 페이지 스크롤 변수
        pagedowns = 0
        # dict(hashtag,cnt)
        hashtag = {}
        # 엑셀 저장 데이터
        feedList = []
        # 리턴 데이터
        returnList = {}
        # 크롤링 결과 데이터
        crawlingList = {}
        # 크롬 옵션 설정
        # options = webdriver.ChromeOptions()
        # print(options)
        # #headless 모드 
        # options.add_argument('headless')
        # options.add_argument('window-size=1920x1080')
        # options.add_argument('disable-gpu')
        # #headless 모드 탐지 방지 언어 및 headless로 보이지 않도록 플러그인 수정
        # options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
        # options.add_argument("lang=ko_KR") # 한국어!
        # print(options)
        # driver = webdriver.Chrome('chromedriver',chrome_options=options)
        #네비게이터에 올바른 브라우저 환경처럼 보이도록 세팅해준다
        #driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
        #언어
        #driver.execute_script("Object.defineProperty(navigator, 'languages', {get: function() {return ['ko-KR', 'ko']}})")
        #위에서 차단한 렌더링 가속 가짜로 넣어서 위장
        #driver.execute_script("const getParameter = WebGLRenderingContext.getParameter;WebGLRenderingContext.prototype.getParameter = function(parameter) {if (parameter === 37445) {return 'NVIDIA Corporation'} if (parameter === 37446) {return 'NVIDIA GeForce GTX 980 Ti OpenGL Engine';}return getParameter(parameter);};")
        # 브라우저가 실행되며 해당 url로 이동

        # 파이어폭스 옵션 설정
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.socks", "127.0.0.1")
        profile.set_preference("network.proxy.type", 9150)
        profile.set_preference('general.useragent.override', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0')
        profile.update_preferences()

        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")

        try:
            driver = webdriver.Firefox(executable_path='/crawler/repo/blog/geckodriver.exe',firefox_profile=profile,firefox_options=options)
        except WebDriverException:
            webdriver.close()

        #코드 시작시간
        start = datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")
        print(start)

        driver.get(url)
        # 웹자원 대기
        driver.implicitly_wait(1) 
        # 총 게시물 수 태그 클래스이름으로 찾기
        ttlFeed = WebDriverWait(driver,5).until(EC.presence_of_element_located((By.CLASS_NAME,"g47SY"))) 
        print("총 게시물:", ttlFeed.text)
        # body 태그를 태그 이름으로 찾기
        time.sleep(1)
        # 페이지 내 첫번째 게시물 클릭
        driver.find_elements_by_class_name("eLAPa")[0].click()
        # failCnt
        failCnt = 0
        count = self.count
        # 데이터 스크래핑 시작
        while pagedowns < count:
                # 페이지 호출 후 대기
                #driver.implicitly_wait(5) 

            #게시물 본문
            try:
                post = WebDriverWait(driver,20).until(EC.presence_of_element_located((By.CLASS_NAME,"C4VMK")))

                try:
                    driver.find_element_by_class_name('XQXOT').send_keys(Keys.HOME)     
                    driver.find_element_by_class_name('XQXOT').find_element_by_xpath("//ul/li/div/button").click()
                    driver.find_element_by_class_name('XQXOT').send_keys(Keys.HOME)  

                except (NoSuchElementException,ElementNotInteractableException):
                    pass
                
                # 게시물 글자수 160자
                # 댓글포함 최대 30개
                # 하나의 해시트그 내 글자수 100자

                #id = driver.execute_script("document.body.getElementsByClassName('C4VMK')[0].getElementsByTagName('a')[0].innerText")
                
                #content = driver.execute_script("document.body.getElementsByClassName('C4VMK')[0].getElementsByTagName('span')[0].innerText")
                
                req = driver.page_source
                soup = BeautifulSoup(req,'html.parser')
                replyCount = soup.find_all("div",class_="C4VMK")
                tagCount = replyCount[0].select('span>a')
                id = replyCount[0].find_all(class_="_6lAjh")[0].select("a")[0].text
                content = replyCount[0].select('span')[0].text
                like = '0'
                tags=[]  
                feedRow = {}        

                try:
                    #like = driver.find_element_by_class_name("Nm9Fw").find_element_by_tag_name("span").text
                    like = soup.find_all("div",class_="Nm9Fw")[0].select("span")[0].text

                except (NoSuchElementException,IndexError):
                    try:
                        like = soup.find_all("span",class_="vcOH2")[0].select("span")[0].text
                    except IndexError:
                        pass
            
                #데이터 가공
                emoji_pattern = re.compile("[\U00010000-\U0010ffff]", flags=re.UNICODE)

                content = emoji_pattern.sub('',content)
                #태그 로직 끝난 후에 긍정,부정 체크 메서드 만들것        
                
                #본문의 해시태그
                if len(tagCount) > 0:

                    for i in range(0,len(tagCount)):
                        tag = tagCount[i].text

                        if "#" in tag:
                            tag = tag.replace("#","").replace(" ","")
                            tags.append(tag)

                #댓글의 해시태그
                if len(replyCount) > 0:

                    for i in range(1,len(replyCount)):
                        #replyid = "document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('a')[0].innerText"
                        replyid = replyCount[i].find_all("a")[0].text

                        if id == replyid:
                            #replyTagCount = driver.execute_script("document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('span')[0].getElementsByTagName('a').length")
                            replyTagCount = replyCount[i].find_all("a")

                            if len(replyCount) > 1:

                                for j in range(0,len(replyTagCount)):
                                    #reply =  driver.execute_script("document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('span')[0].getElementsByTagName('a')["+j+"].innerText")
                                    reply = replyTagCount[j].text

                                    if "#" in reply:
                                        reply = reply.replace("#","").replace(" ","")
                                        tags.append(reply)
                
                #중복제거
                tags = list(set(tags))
                tagList.append(tags)
                print("=======================================================================================")
                print("====================================pagedowns : ",pagedowns,"====================================")
                print("=======================================================================================")
                print("id===============================",id)
                print("content==========================",content)
                print("like=============================",like)
                print("finaltag=========================",tags)
                feedRow["id"] = id
                feedRow["content"] = content
                feedRow["tag"] = tags
                feedRow["like"] = like
                feedList.append(feedRow)

                time.sleep(1)           
                
                #다음 게시물 클릭
                try:
                    driver.find_element_by_class_name("HBoOv").click()

                except NoSuchElementException:
                    # 웹자원 대기
                    driver.get(url)
                    driver.implicitly_wait(1) 

                    for i in range(0,pagedowns):
                        driver.find_elements_by_class_name("eLAPa")[0].click()
                        #html = driver.find_element_by_tag_name("html")
                        #html.send_keys(Keys.DOWN)
                
                pagedowns += 1
                print("=======================================================================================")
                print("=======================================================================================")
            except (NoSuchElementException,StaleElementReferenceException,TimeoutException):
                failCnt += 1
                print("=======================================================================================")
                print("====================================failcount : ",failCnt,"=====================================")
                print("=======================================================================================")
                if failCnt > 3:
                    driver.find_element_by_class_name("HBoOv").click()
                    
                time.sleep(120)
                pass
                
        print("끝!!")
                
        # 해시태그 중복 검사 후 리스트로 재할당
        tagList = list([tuple(set(tag)) for tag in tagList])

        # 해시태그 갯수 구하기
        for htags in tagList:
            for htag in htags:
                # 해시태그 카운트 업
                if not (htag in hashtag):
                    hashtag[htag] = 1
                else:
                    hashtag[htag] += 1

        # 정렬
        keys = sorted(hashtag.items(), key = lambda x:x[1], reverse = True)

        # n순위 까지 출력
        for k, v in keys[:15]:
            print("{}({})".format(k, v))

        end = datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")

        print("start======",start)
        print("end======",end)

        print("enddivision=========",datetime.datetime.strptime(end,"%Y_%m_%d %H:%M:%S")-datetime.datetime.strptime(start,"%Y_%m_%d %H:%M:%S"))
        # result = pd.DataFrame(feedList)
        # result.columns = ['id','content','tag','like']
        # result.head()

        #웹자원 종료
        driver.close
        
        crawlingList["ttlfeed"] = ttlFeed.text
        crawlingList["crwfeed"] = len(tagList)
        crawlingList["succnt"] = pagedowns
        crawlingList["failcnt"] = failCnt
        crawlingList["created_at"] = start
        crawlingList["updated_at"] = end
        crawlingList["working_while"] = str(datetime.datetime.strptime(end,"%Y_%m_%d %H:%M:%S")-datetime.datetime.strptime(start,"%Y_%m_%d %H:%M:%S"))

        returnList["crawlingList"] = crawlingList
        returnList["tagList"] = keys
        returnList["excelList"] = feedList

        return returnList 
Example #23
0
def run(x):
    """Download the joint-project lists for one search term and merge them.

    Opens the foerderportal.bund.de search mask in Firefox, fills it from the
    module-level ``options`` plus ``x``, walks every page of the result list,
    opens each hit's detail view, follows the joint-project list link and
    clicks its download entry. Afterwards all ``*.csv`` files in the download
    folder are concatenated into ``merged.csv`` with the Windows ``copy``
    command.

    The module-level semaphore ``sema`` is held for the whole call. It is
    released, and the browser is shut down, even when the scrape fails, so a
    crashing worker cannot starve the remaining ones.

    Args:
        x: Search term typed into the ``suche_gemeindeSuche`` field
            (presumably a municipality name -- confirm with the caller).
            After the ``umlautdict`` replacements it is also the name of the
            download sub-folder below ``Output``.

    Raises:
        ValueError: If the result heading contains no digits.
        selenium.common.exceptions.WebDriverException: For browser failures
            that are not handled per hit (timeouts and missing elements
            inside a hit are skipped).
    """
    save_location = x
    currentdir = os.getcwd()

    page_select_locator = (By.ID, 'listselect_suche_listrowfrom')
    detail_link_selector = "[title^='Detailansicht von Förderkennzeichen']"

    sema.acquire()
    driver = None
    try:
        for umlaut, replacement in umlautdict.items():
            save_location = save_location.replace(umlaut, replacement)

        path = currentdir + '\\Output\\' + str(save_location) + "\\"

        # Set Firefox preferences so that the file automatically saves to
        # disk when downloaded.
        if not os.path.exists('Output'):
            os.makedirs('Output')
        fp = webdriver.FirefoxProfile()
        fp.set_preference("browser.preferences.instantApply", True)
        fp.set_preference(
            "browser.helperApps.neverAsk.saveToDisk",
            "text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml"
        )
        fp.set_preference("browser.helperApps.alwaysAsk.force", False)
        fp.set_preference("browser.download.manager.showWhenStarting", False)
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference("browser.download.dir", path)
        fp.set_preference("browser.download.downloadDir", path)
        fp.set_preference("browser.download.defaultFolder", path)

        driver = webdriver.Firefox(firefox_profile=fp)

        driver.get(
            "https://foerderportal.bund.de/foekat/jsp/SucheAction.do?actionMode=searchmask"
        )

        # One extra input row per additional federal state, then fill them.
        for _ in range(len(options.bundesland) - 1):
            driver.find_element_by_css_selector(
                '#gemeindeZeile > td:nth-child(7) > input:nth-child(1)'
            ).click()
        for i, bundesland in enumerate(options.bundesland):
            driver.find_element_by_css_selector(
                f'#suche_bundeslandSuche_{i}_').send_keys(bundesland)

        # Explicit comparison kept on purpose: only a literal False (or 0)
        # triggers the click, a missing/None option does not.
        if options.lfdvorhaben == False:  # noqa: E712
            driver.find_element_by_css_selector('#suche_lfdVhbN').click()
        driver.find_element_by_css_selector('#suche_nurVerbundJ').click()
        submit_button = driver.find_element_by_css_selector(
            "#suche_general_search")

        driver.find_element_by_css_selector(
            "#suche_gemeindeSuche_0_").send_keys(x)
        driver.find_element_by_css_selector(
            '#suche_laufzeitVonSuche_0_').send_keys(options.laufzeit)
        submit_button.click()

        # The heading carries the hit count; it only sizes the progress bar.
        heading = driver.find_element_by_css_selector(
            ".content_background_outer > h1:nth-child(3)").text
        items = int("".join(filter(str.isdigit, str(heading))))

        with tqdm(total=items) as progress_bar:
            select = Select(
                WebDriverWait(driver, 100).until(
                    EC.element_to_be_clickable(page_select_locator)))
            for page_index in range(len(select.options)):
                # Re-locate the drop-down on every page: the previous
                # reference belongs to an already replaced document.
                select = Select(
                    WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable(page_select_locator)))
                select.select_by_index(page_index)

                verbundprojekte = len(
                    driver.find_elements_by_css_selector(
                        detail_link_selector))

                for hit_index in range(verbundprojekte):
                    progress_bar.update(1)
                    try:
                        WebDriverWait(driver, 5).until(
                            EC.visibility_of_all_elements_located(
                                (By.CSS_SELECTOR, detail_link_selector)
                            ))[hit_index].click()  # detail view
                        WebDriverWait(driver, 5).until(
                            EC.element_to_be_clickable(
                                (By.CSS_SELECTOR,
                                 '#sucheVerbund > a:nth-child(3)')
                            )).click()  # joint-project list
                        WebDriverWait(driver, 5).until(
                            EC.element_to_be_clickable(
                                (By.CSS_SELECTOR,
                                 'li.nobreak_hz:nth-child(4)')
                            )).click()  # download
                        driver.execute_script("window.history.go(-3)")

                        # Wait for the result list to come back. Bounded to
                        # five tries and left on the first success, so a
                        # permanently stale page cannot loop forever.
                        for _attempt in range(5):
                            try:
                                Select(
                                    WebDriverWait(driver, 100).until(
                                        EC.element_to_be_clickable(
                                            page_select_locator)))
                                break
                            except (StaleElementReferenceException,
                                    NoSuchElementException):
                                continue

                    except (TimeoutException, NoSuchElementException):
                        # The hit has no joint-project list or download entry
                        # in time: step back to the result list and go on.
                        driver.execute_script("window.history.go(-2)")
                        continue

            # Final wait for the result list before the files are merged.
            Select(
                WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable(page_select_locator)))

        # `copy` is a cmd.exe builtin, hence shell=True with a fixed command.
        subprocess.call('copy *.csv merged.csv', shell=True, cwd=path)
    finally:
        try:
            if driver is not None:
                # quit() ends the whole driver session, not just the window.
                driver.quit()
        finally:
            sema.release()
Example #24
0
 def get_default_firefox_options():
     """Return Firefox options that carry a freshly created Firefox profile."""
     firefox_options = webdriver.FirefoxOptions()
     # The Python bindings expose the profile as a property; ``setProfile``
     # is the Java method name and raises AttributeError on this object.
     firefox_options.profile = webdriver.FirefoxProfile()
     return firefox_options
except Exception as e:
    print(str(e))

# Pull the video ids and titles out of the raw response text.
video_id = re.findall('"videoId": "(.*?)"', str(respData))
video_title = re.findall('"title": "(.*?)"', str(respData))

# Target file name for every title.
video_files = [title + ".mp3" for title in video_title]

# File name -> video id. Duplicate titles collapse into one entry here.
dictionary = dict(zip(video_files, video_id))

# One download URL per remaining entry, in the dict's own order.
video_dl = [
    'https://www.youtubeinmp3.com/download/?video=https://www.youtube.com/watch?v=' + str(each_id)
    for each_id in dictionary.values()
]

# File name -> download URL. The keys come from the de-duplicated dict so they
# stay aligned with video_dl; zipping against video_files would shift or drop
# entries whenever two videos share a title.
dictionary = dict(zip(dictionary, video_dl))
failed_dl = []
fp = webdriver.FirefoxProfile(r"C:\Users\Shlok Khandelwal\AppData\Roaming\Mozilla\Firefox\Profiles\4hlau0sw.Selenium")
driver = webdriver.Firefox(executable_path=r"C:\Users\Shlok Khandelwal\Desktop\geckodriver.exe", firefox_profile=fp)
driver.set_page_load_timeout(60)
videoCount = 0


while(len(dictionary)> 0):
    songsToDelete = []
    for eachLink in dictionary.values():
        try:
            alert = driver.switch_to_alert()
            alert.dismiss()
        except Exception as e:
            print("No alert")
        try:
            driver.get(eachLink)
Example #26
0
'''
HTML 旨在显示信息,而 XML 旨在传输信息。
XML 没有预定义的标签。XML 允许创作者定义自己的标签和自己的文档结构。
在 HTML 中使用的标签(以及 HTML 的结构)是预定义的。HTML 文档只使用在 HTML 标准中定义过的标签(比如 <p> 、<h1> 等等)。
'''

from selenium import webdriver
import time

# Start Firefox with an existing user profile and open the Baidu home page.
profile_dictionary = R"C:\Users\R\AppData\Roaming\Mozilla\Firefox\Profiles\yjdic0n5.default"
profile = webdriver.FirefoxProfile(profile_dictionary)
driver = webdriver.Firefox(profile)

driver.get("Http://www.baidu.com")
# 1. XPath: locating by attribute
# 1) Locate through element attributes such as id, class or name; * stands for any tag.
# driver.find_element_by_xpath("//*[@id='kw']").send_keys("python")
# driver.find_element_by_xpath("//*[@class='s_ipt']").send_keys("python")
# driver.find_element_by_xpath("//*[@name='wd']").send_keys("python")

# 2) If an element has no id, name or class attribute, it can still be located through other attributes.
# 3) XPath by tag: * stands for any tag; if the concrete tag is known, write the tag directly.
# driver.find_element_by_xpath("//input[@autocomplete='off']").send_keys("python")

# 4) XPath by hierarchy: (1) if an element's own attributes are not distinctive enough to locate it
# directly, find its parent element first; (2) then step down one level to reach it.
# If the parent's attributes are not distinctive either, start from the parent's parent.
# driver.find_element_by_xpath("//form[@id='form']/span/input").send_keys("python")

# 5) XPath by index: if an element has the same tag as its siblings, locating by hierarchy needs an
# index to pick one of them. Indices start at 1.
# driver.find_element_by_xpath("//select[@id='nr']/option[1]").click()
# driver.find_element_by_xpath("//select[@id='nr']/option[2]").click()
Example #27
0
    def setUpClass(cls):
        """Run the parent class setup, then start the shared Firefox browser.

        The browser is created with the Firefox profile found in the
        ``test`` entry below ``cls.profilesDir`` and stored on
        ``cls.browser``.
        """
        super().setUpClass()

        test_profile_dir = os.path.join(cls.profilesDir, 'test')
        firefox_profile = webdriver.FirefoxProfile(test_profile_dir)
        cls.browser = webdriver.Firefox(firefox_profile)
Example #28
0
                    help="List the groups you want to scrape for recent posts")

# -d/--depth: integer option stored as args.depth, defaulting to 5.
parser.add_argument("-d", "--depth", action="store",
                    dest="depth", default=5, type=int,
                    help="How many recent posts you want to gather -- in multiples of (roughly) 8.")

args = parser.parse_args()

# Hard-coded locations of the Firefox executable and of geckodriver.
BROWSER_EXE = '/usr/bin/firefox'
GECKODRIVER = '/usr/local/bin/geckodriver'

FIREFOX_BINARY = FirefoxBinary(BROWSER_EXE)

# Firefox profile with web notifications and application updates switched off.

PROFILE = webdriver.FirefoxProfile()
# PROFILE.DEFAULT_PREFERENCES['frozen']['javascript.enabled'] = False
PROFILE.set_preference("dom.webnotifications.enabled", False)
PROFILE.set_preference("app.update.enabled", False)
PROFILE.update_preferences()


class CollectPosts(object):
    """Collector of recent Facebook posts.

    Note: we bypass the Facebook Graph API by using a Selenium Firefox
    instance. This is against the Facebook guidelines and thus not allowed.

    USE THIS FOR EDUCATIONAL PURPOSES ONLY. DO NOT ACTUALLY RUN IT.
    """
    info = thread.xpath('.//span[@class="GIEUOX-DOQ"]')
    parsed['seen'] = int(info[1].text.split()[0])
    parsed['posts'] = int(info[0].text.split()[0])
    return parsed

# URL templates for the Google Groups forum pages; GROUP_URL is the 'nsndev' group.
GOOGLE_GROUP_BASE = 'https://groups.google.com/forum/'
# GOOGLE_GROUP_URL = GOOGLE_GROUP_BASE + '#!forum/{}'
GOOGLE_GROUP_URL = GOOGLE_GROUP_BASE + '#!forum/{}/?hl=en'
GROUP_URL = GOOGLE_GROUP_URL.format('nsndev')

# Route Firefox through the proxy named by $http_proxy when one is set.
proxy = os.environ.get('http_proxy')
if proxy:
    # Reduce the value to "host:port": drop an optional "scheme://" prefix,
    # any path (such as a trailing "/") and optional "user:password@"
    # credentials, so that all of the usual http_proxy spellings are accepted.
    proxy_endpoint = proxy.split('//')[-1].split('/')[0].rpartition('@')[2]
    # A value without a port, or with a non-numeric one, raises ValueError.
    PROXY_HOST, PROXY_PORT = proxy_endpoint.rsplit(':', 1)
    PROXY_PORT = int(PROXY_PORT)

    fp = webdriver.FirefoxProfile()
    # network.proxy.type 1 selects manual proxy configuration.
    fp.set_preference("network.proxy.type", 1)
    fp.set_preference("network.proxy.http", PROXY_HOST)
    fp.set_preference("network.proxy.http_port", PROXY_PORT)
    fp.set_preference("network.proxy.ftp", PROXY_HOST)
    fp.set_preference("network.proxy.ftp_port", PROXY_PORT)
    fp.set_preference("network.proxy.ssl", PROXY_HOST)
    fp.set_preference("network.proxy.ssl_port", PROXY_PORT)
    # fp.set_preference("general.useragent.override", "whater_useragent")
    fp.update_preferences()

    browser = webdriver.Firefox(firefox_profile=fp)
else:
    browser = webdriver.Firefox()

browser.implicitly_wait(30)
Example #30
0
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from pyvirtualdisplay import Display

from time import sleep
# Start a pyvirtualdisplay Display (visible=0, 800x600) before launching the
# browser; presumably an off-screen X server for machines without a screen --
# confirm against the deployment.
display = Display(visible=0, size=(800, 600))
display.start()
options = Options()
# Firefox's own headless mode stays off.
options.headless = False
_browser_profile = webdriver.FirefoxProfile()
_browser_profile.set_preference("dom.webnotifications.enabled", False)
driver = webdriver.Firefox(options=options,
                           firefox_profile=_browser_profile,
                           executable_path=r'/root/ytbot/geckodriver')
# Log in to ytmonster.net and open the view-exchange page. Any failure is
# only printed; execution continues after the except block.
try:
    print("loaded")
    driver.get("https://www.ytmonster.net/campaigns/views")
    # NOTE(review): the account name and password below are hard-coded in
    # plain text; consider loading them from the environment or a secrets
    # store and rotating the exposed password.
    user_name = driver.find_element_by_id('inputUsername')
    user_name.send_keys('vinay221097')
    password = driver.find_element_by_id('inputPassword')
    password.send_keys('Musha22@')
    # Absolute XPath to the login form's button.
    login = driver.find_element_by_xpath(
        '/html/body/div[2]/div/div/div/div[1]/div/form/button')
    login.click()
    # Printed right after the click; the login result itself is not checked.
    print("logged in successfully")
    sleep(3)
    driver.get("https://www.ytmonster.net/exchange/views")
    sleep(4)
except Exception as e:
    print("error occured", e)