Code Example #1
from selenium import webdriver
from selenium.webdriver.chrome.options import Options   # import the Options class from the chrome.options module
import time
from bs4 import BeautifulSoup

chrome_options = Options()  # instantiate an Options object
chrome_options.add_argument('--headless')  # run the Chrome browser in headless (silent) mode
driver = webdriver.Chrome(options=chrome_options)    # use Chrome as the engine, running quietly in the background

# driver = webdriver.Chrome()     # use Chrome as the engine, actually opening a visible Chrome window
driver.get("https://localprod.pandateacher.com/python-manuscript/hello-spiderman/")
time.sleep(2)

pageSource = driver.page_source     # get the fully rendered page source; returns a 'str'
print(type(pageSource))

# label = driver.find_elements_by_tag_name('label')   # parse the page and extract all 'label' tags; returns a 'list'
a = driver.find_element_by_css_selector("[class='xl-chrome-ext-bar']")  # use a CSS selector in Selenium when the class value contains spaces
labels = driver.find_element_by_tag_name('label')
print(type(labels))             # parse the page and extract the first 'label' tag; returns a 'WebElement'
print(labels.text)              # extract the element's text
print(labels.get_attribute('type'))     # pass an attribute name to get that attribute's value
# print(type(label))
# for i in label:
#     print(i.text)
# print(label.get_attribute('type'))        # get the attribute's value


# find_element_by_link_text: locate a hyperlink by its full link text
# find_element_by_partial_link_text: locate a hyperlink by part of its link text
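
The example above uses the legacy find_element_by_* helpers, which were removed in Selenium 4. A minimal sketch of the same lookups with the Selenium 4 By-based API, assuming selenium>=4 is installed (not part of the original example):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
driver = webdriver.Chrome(options=chrome_options)
driver.get("https://localprod.pandateacher.com/python-manuscript/hello-spiderman/")

# find_element/find_elements now take a By strategy plus a selector string
label = driver.find_element(By.TAG_NAME, 'label')                          # first <label> element
bar = driver.find_element(By.CSS_SELECTOR, "[class='xl-chrome-ext-bar']")  # same CSS lookup as above
print(label.text, label.get_attribute('type'))
driver.quit()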
Code Example #2
        if i != page_all:
            WebDriverWait(driver, 10).until(
                EC.text_to_be_present_in_element(
                    (By.XPATH, '//*[@id="datagrid-row-r1-1-9"]/td[1]/div'),
                    str(i * 10)))
        else:
            time.sleep(1)

        try:
            get_content(driver)
        except:
            print('An exception occurred - please debug the code')


if __name__ == '__main__':
    db = To_db()
    db.create_db(DB_NAME)
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    # driver=webdriver.Chrome()
    driver.implicitly_wait(10)
    url = 'http://www.whzbtb.cn/V2PRTS/OpeningRoomInfoListInit.do'
    driver.get(url)
    change_page(driver)

    db.close_db()
    driver.close()
    driver.quit()
#finish_all
Code Example #3
 def __init__(self):
     self._options = Options()
     self._options.add_argument('--headless')
     self._web_driver: WebDriver or None = None
     self._headed_web_driver: WebDriver or None = None
     atexit.register(self.cleanup)
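
This excerpt registers atexit.register(self.cleanup) but does not show the cleanup method. A hypothetical sketch of such a handler, quitting whichever drivers were actually created (attribute names match the excerpt; the real implementation is not shown):

 def cleanup(self):
     # Hypothetical: quit both the headless and the headed driver if they exist.
     for drv in (self._web_driver, self._headed_web_driver):
         if drv is not None:
             try:
                 drv.quit()  # ends the browser session and the chromedriver process
             except Exception:
                 pass  # the driver may already be gone at interpreter exit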
Code Example #4
def scrape_recipe_sources(recipe_sources, batch_id):
    """Scrape the given list of recipe sources. The scraping is processed with a process assigned with the given batch ID.

	Args:
		recipe_sources: List of recipe sources. A source contains a 'url', the recipe URL, and 'categories', the categories associated with the recipe page.
		batch_id: Batch ID assigned to the process where the scaping is conducted.

    Returns:
		True if the scraping has been completed for all recipe sources.
	"""
    options = Options()
    options.headless = True
    driver = webdriver.Chrome('./chromedriver', options=options)

    cache_path = recipe_cache_path(batch_id)

    scraped_ids = set()
    if path.exists(cache_path):
        scraped_ids = set([
            row[0]
            for row in pandas.read_csv(RESULT_CSV_PATH, usecols=['id']).values
        ])
    else:
        with open(cache_path, 'w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=RECIPE_CSV_FIELDNAMES)
            writer.writeheader()

    try:
        with open(cache_path, 'a') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=RECIPE_CSV_FIELDNAMES)

            for i, recipe_source in enumerate(recipe_sources, start=1):
                print('Batch {} processing recipe #{}'.format(batch_id, i))

                recipe_id = recipe_id_from_recipe_url(recipe_source['url'])
                if recipe_id in scraped_ids:
                    continue

                recipe_content = None

                # Sometimes the driver experiences a connection failure. Keep trying to scrape one page until it succeeds.
                try:
                    recipe_content = scrape_single_recipe_url(
                        recipe_source['url'], recipe_source['categories'],
                        driver)
                except:
                    # Instantiate a new driver.
                    try:
                        driver.close()
                        driver.quit()
                    finally:
                        time.sleep(1)
                        driver = webdriver.Chrome('./chromedriver',
                                                  options=options)
                        recipe_content = None

                writer.writerow(recipe_content)

    finally:
        driver.close()
        driver.quit()

    return True
Code Example #5
File: wl.py  Project: pivosxbmc/ALL_project
def dr():
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    chrome_driver = "C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    driver = webdriver.Chrome(chrome_driver, chrome_options=chrome_options)
    return driver
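
The debuggerAddress option above attaches Selenium to a Chrome instance that is already running with remote debugging enabled. A sketch of starting such an instance from Python; the executable path and profile directory are illustrative, not taken from the original project:

import subprocess

# Start Chrome with remote debugging so the driver above can attach to 127.0.0.1:9222.
subprocess.Popen([
    r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",  # illustrative path
    "--remote-debugging-port=9222",                 # must match the debuggerAddress option
    r"--user-data-dir=C:\selenium\chrome-profile",  # separate profile for automation
])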
Code Example #6
async def carbon_api(e):
    if not e.text[0].isalpha() and e.text[0] not in ("/", "#", "@", "!"):
        """ A Wrapper for carbon.now.sh """
        await e.edit("`Processing..`")
        CARBON = 'https://carbon.now.sh/?l={lang}&code={code}'
        global CARBONLANG
        textx = await e.get_reply_message()
        pcode = e.text
        if pcode[8:]:
            pcode = str(pcode[8:])
        elif textx:
            pcode = str(textx.message)  # Importing message to module
        code = quote_plus(pcode)  # Converting to urlencoded
        await e.edit("`Making Carbon...\n25%`")
        url = CARBON.format(code=code, lang=CARBONLANG)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.binary_location = GOOGLE_CHROME_BIN
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        prefs = {'download.default_directory': './'}
        chrome_options.add_experimental_option('prefs', prefs)
        driver = webdriver.Chrome(executable_path=CHROME_DRIVER, options=chrome_options)
        driver.get(url)
        await e.edit("`Be Patient...\n50%`")
        download_path = './'
        driver.command_executor._commands["send_command"] = (
            "POST", '/session/$sessionId/chromium/send_command')
        params = {'cmd': 'Page.setDownloadBehavior',
                  'params': {'behavior': 'allow', 'downloadPath': download_path}}
        command_result = driver.execute("send_command", params)
        driver.find_element_by_xpath("//button[contains(text(),'Export')]").click()
        driver.find_element_by_xpath("//button[contains(text(),'4x')]").click()
        driver.find_element_by_xpath("//button[contains(text(),'PNG')]").click()
        await e.edit("`Processing..\n75%`")
        # Waiting for the download to finish
        sleep(2.5)
        await e.edit("`Done Dana Done...\n100%`")
        file = './carbon.png'
        await e.edit("`Uploading..`")
        await e.client.send_file(
            e.chat_id,
            file,
            caption="<< Here's your carbon, gey boi! \n Carbonised by @inferno_scorpion >> ",
            force_document=True,
            reply_to=e.message.reply_to_msg_id,
        )
        os.remove('./carbon.png')  # remove the downloaded carbon.png after uploading
        driver.quit()
        await e.delete()  # Deleting msg
Code Example #7
def launch_browser():

    if env.RUNNING_BROWSER.upper() == "FIREFOX":
        #os.popen("TASKKILL /F /IM firefox.exe")

        fp = FirefoxProfile()
        fp.native_events_enabled = False

        binary_path = common.get_value_from_conf("FIREFOX_BINARY_PATH")

        if binary_path == "":
            env.BROWSER = webdriver.Firefox(firefox_profile=fp)
        else:
            fb = FirefoxBinary(firefox_path=binary_path)
            env.BROWSER = webdriver.Firefox(firefox_profile=fp,
                                            firefox_binary=fb)

    elif env.RUNNING_BROWSER.upper() == "CHROME":
        #os.popen("TASKKILL /F /IM chrome.exe")
        os.popen("TASKKILL /F /IM chromedriver.exe")

        binary_path = common.get_value_from_conf("CHROME_BINARY_PATH")
        chromedriver = common.get_value_from_conf("DRIVER_CHROME")

        if binary_path == "":
            os.environ["webdriver.chrome.driver"] = chromedriver
            env.BROWSER = webdriver.Chrome(executable_path=chromedriver)
        else:
            opts = Options()
            opts.binary_location = binary_path

            os.environ["webdriver.chrome.driver"] = chromedriver
            env.BROWSER = webdriver.Chrome(executable_path=chromedriver,
                                           chrome_options=opts)

    elif env.RUNNING_BROWSER.upper() == "IE":
        #os.popen("TASKKILL /F /IM iexplore.exe")
        os.popen("TASKKILL /F /IM IEDriverServer.exe")

        dc = DesiredCapabilities.INTERNETEXPLORER.copy()

        dc['acceptSslCerts'] = True
        dc['nativeEvents'] = True

        iedriver = common.get_value_from_conf("DRIVER_IE")

        os.environ["webdriver.ie.driver"] = iedriver

        env.BROWSER = webdriver.Ie(executable_path=iedriver, capabilities=dc)

    else:
        return False

    env.TEST_URL = common.get_value_from_conf("TESTING_URL")

    env.BROWSER.get(env.TEST_URL)
    env.BROWSER.maximize_window()

    time.sleep(3)

    return True
Code Example #8
File: jdgoods.py  Project: ysong211/scrapy-
 def __init__(self):
     chrome_options = Options()
     chrome_options.add_argument('--headless')
     self.browser = webdriver.Chrome(chrome_options=chrome_options)
     super(JdgoodsSpider, self).__init__()
     dispatcher.connect(self.spider_closed, signals.spider_closed)
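
The dispatcher.connect call wires spider_closed to Scrapy's spider_closed signal, but the handler itself is not part of the excerpt. A hypothetical sketch of what it typically does, reusing the browser attribute created above:

 def spider_closed(self, spider):
     # Hypothetical handler: shut down the headless Chrome when the spider finishes.
     self.browser.quit()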
Code Example #9
async def carbon_api(e):
    """ carbon.now.sh için bir çeşit wrapper """
    await e.edit("`İşleniyor...`")
    CARBON = 'https://carbon.now.sh/?l={lang}&code={code}'
    global CARBONLANG
    textx = await e.get_reply_message()
    pcode = e.text
    if pcode[8:]:
        pcode = str(pcode[8:])
    elif textx:
        pcode = str(textx.message)  # the replied-to message text is passed to the module
    code = quote_plus(pcode)  # convert to a URL-encoded string
    await e.edit("`İşleniyor...\nTamamlanma Oranı: 25%`")
    if os.path.isfile("./carbon.png"):
        os.remove("./carbon.png")
    url = CARBON.format(code=code, lang=CARBONLANG)
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.binary_location = GOOGLE_CHROME_BIN
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-gpu")
    prefs = {'download.default_directory': './'}
    chrome_options.add_experimental_option('prefs', prefs)
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER,
                              options=chrome_options)
    driver.get(url)
    await e.edit("`İşleniyor...\nTamamlanma Oranı: 50%`")
    download_path = './'
    driver.command_executor._commands["send_command"] = (
        "POST", '/session/$sessionId/chromium/send_command')
    params = {
        'cmd': 'Page.setDownloadBehavior',
        'params': {
            'behavior': 'allow',
            'downloadPath': download_path
        }
    }
    command_result = driver.execute("send_command", params)
    driver.find_element_by_xpath("//button[contains(text(),'Export')]").click()
    # driver.find_element_by_xpath("//button[contains(text(),'4x')]").click()
    # driver.find_element_by_xpath("//button[contains(text(),'PNG')]").click()
    await e.edit("`İşleniyor...\nTamamlanma Oranı: 75%`")
    # Waiting for the download
    while not os.path.isfile("./carbon.png"):
        await sleep(0.5)
    await e.edit("`İşleniyor...\nTamamlanma Oranı: 100%`")
    file = './carbon.png'
    await e.edit("`Resim karşıya yükleniyor...`")
    await e.client.send_file(
        e.chat_id,
        file,
        caption="Bu resim [Carbon](https://carbon.now.sh/about/) kullanılarak yapıldı,\
        \nbir [Dawn Labs](https://dawnlabs.io/) projesi.",
        force_document=True,
        reply_to=e.message.reply_to_msg_id,
    )

    os.remove('./carbon.png')
    driver.quit()
    # carbon.png is removed after uploading
    await e.delete()  # delete the message
Code Example #10
File: carbon.py  Project: denomparkour/DenomUniborg
async def carbon_api(e):
    if not e.text[0].isalpha() and e.text[0] not in ("/", "#", "@", "!"):
        """ A Wrapper for carbon.now.sh """
        await e.edit("⬜⬜⬜⬜⬜")
        CARBON = 'https://carbon.now.sh/?l={lang}&code={code}'
        CARBONLANG = "en"
        textx = await e.get_reply_message()
        pcode = e.text
        if pcode[8:]:
            pcode = str(pcode[8:])
        elif textx:
            pcode = str(textx.message)  # Importing message to module
        code = quote_plus(pcode)  # Converting to urlencoded
        url = CARBON.format(code=code, lang=CARBONLANG)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.binary_location = Config.GOOGLE_CHROME_BIN
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument('--disable-gpu')
        prefs = {'download.default_directory': './'}
        chrome_options.add_experimental_option('prefs', prefs)
        await e.edit("⬛⬛⬜⬜⬜")

        driver = webdriver.Chrome(executable_path=Config.CHROME_DRIVER,
                                  options=chrome_options)
        driver.get(url)
        download_path = './'
        driver.command_executor._commands["send_command"] = (
            "POST", '/session/$sessionId/chromium/send_command')
        params = {
            'cmd': 'Page.setDownloadBehavior',
            'params': {
                'behavior': 'allow',
                'downloadPath': download_path
            }
        }
        command_result = driver.execute("send_command", params)

        driver.find_element_by_xpath(
            "//button[contains(text(),'Export')]").click()
        sleep(5)  # this might take a bit.
        driver.find_element_by_xpath("//button[contains(text(),'4x')]").click()
        sleep(5)
        await e.edit("⬛⬛⬛⬜⬜")
        driver.find_element_by_xpath(
            "//button[contains(text(),'PNG')]").click()
        sleep(5)  # waiting for the download

        await e.edit("⬛⬛⬛⬛⬛")
        file = './carbon.png'
        await e.edit("✅Carbon Completed, Uploading Carbon✅")
        await e.client.send_file(
            e.chat_id,
            file,
            caption=
            "Carbon by [@r4v4n4](https://www.github.com/ravana69/pornhub)",
            force_document=False,
            reply_to=e.message.reply_to_msg_id,
        )

        os.remove('./carbon.png')
        # Removing carbon.png after uploading
        await e.delete()  # Deleting msg
Code Example #11
File: taobao_selenium.py  Project: xx0746/Spiders
 def __init__(self,keyword):
     # search keyword
     self.keyword = keyword
     self.cookies = {}
     # cookie pool
     self.cookie = [
         'thw=cn; t=30912a0211d2f7c4b616585bc4825060; hng=CN%7Czh-CN%7CCNY%7C156; enc=pzdR76EQ9XgSRGR82Xq45tmJruRFWu0FouJ8kQAkE3nawWt6z1uotCujQi0PcMIZI%2FB7iYyg4rl8rsxLX1xJSA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _m_h5_tk=bd1525710e4afb9c577d8a990f3353b7_1564926869087; _m_h5_tk_enc=5a6d2edb8c06b13c3c5e8c0a0e6dd566; cookie2=166f3000b9672c536c59566b63e90b79; _tb_token_=eeb3a3bb33a33; _uab_collina=156492382396750161615565; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; swfstore=200907; mt=ci=0_0; cna=353FFR0e5FsCAXa3WpLq0giz; v=0; x5sec=7b227365617263686170703b32223a223464386535366432633737383636646133303535303466613238643738363734434f572b6d2b6f46454b2b367a35364f334b625954526f504d6a49774d7a41334d6a597a4e6a4d7a4d447378227d; JSESSIONID=8405DFCA73B5B3B1C30D44ED9D1A4A79; isg=BEVFs3FGtIJs1ZBax07XR-RvVIG_qvjxG33nnEeq5nyL3mdQDlZSZIH86EKNnhFM; l=cBSHVLunqYl65142BOfZCuI8LPbt5IRbzsPzw4OG4ICPOb5e5cvcWZFPC28wCnGVK6uJJ3oWYJ1uB0L5yyCqJxpsw3k_J_f..',
         # 'thw=cn; t=4a67cf0f54b38a06b12baa6d7011ac01; enc=z0TfWvQ9HWXGg%2FRoa2MY2HYj2UfgrfgniIYK%2FEv2r%2FGt32csHyi8iBOmGabkyql62Uuf9%2BYrgcukKLieAnE%2FjA%3D%3D; mt=ci=0_0; cna=353FFR0e5FsCAXa3WpLq0giz; hng=CN%7Czh-CN%7CCNY%7C156; v=0; cookie2=1188c00dd90500ec2caf188256b95566; _tb_token_=555f6ebe1f33b; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; JSESSIONID=8D4A142105251AA3BD42FD981CC4D589; uc1=cookie14=UoTaHP3Aq5rlzQ%3D%3D; isg=BC0t8WQ9DM36--hecDw_dwcBPMlnImA0s3X_RG8wZkQU5kmYNNvhLE_g0fql0nkU; l=cBTFlrMmqbmpLQB3BOfgCuI8Ls7OmQAfCfVzw4OGjICP9mCwkrwcWZFXDALeCnhVp6UM83oWYJ1uBeYBqtftHxoD2j-la',
         # 'thw=cn; t=4a67cf0f54b38a06b12baa6d7011ac01; enc=z0TfWvQ9HWXGg%2FRoa2MY2HYj2UfgrfgniIYK%2FEv2r%2FGt32csHyi8iBOmGabkyql62Uuf9%2BYrgcukKLieAnE%2FjA%3D%3D; mt=ci=0_0; cna=353FFR0e5FsCAXa3WpLq0giz; hng=CN%7Czh-CN%7CCNY%7C156; v=0; cookie2=1188c00dd90500ec2caf188256b95566; _tb_token_=555f6ebe1f33b; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; uc1=cookie14=UoTaHP3Aq5rlzQ%3D%3D; JSESSIONID=D1451FA603364663FBC5F784EEC1FCC0; isg=BLW1cbogBDUCUGD2CFQHX08pxDGvmmgsay137DfaIix7DtQA24PHFIzMWZKdeoH8; l=cBTFlrMmqbmpL_gtBOCwSuI8Ls79YIR2muPRwC0Xi_5Q49L6OfbOkStYshp6DjWd9SJ640tUd_29-etliOHx3mx-g3fP.',
     ]
     # user-agent pool
     self.user_agent = [
         "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
     ]
     self.headers = {
         'cookie': random.choice(self.cookie),
         'referer': 'https://s.taobao.com/search',
         'user-agent': random.choice(self.user_agent),
     }
     # proxy IP pool
     self.ip = [
         # '117.191.11.111:8080',
         '117.191.11.113:80',
         '117.191.11.109:8080',
         '117.191.11.80:80',
         '117.191.11.76:8080',
         '117.191.11.80:80',
         '117.191.11.108:80',
         '117.191.11.111:80',
         '117.191.11.109:8080',
         '39.135.24.11:80',
         '117.191.11.109:80',
         '117.191.11.108:8080',
         '117.191.11.110:8080',
         '35.183.111.234:80',
         '144.217.229.157:1080',
         '39.137.69.7:80',
         '39.137.69.7:8080',
         '39.137.69.10:8080'
     ]
     self.proxies = {
         'http': random.choice(self.ip),
     }
     # get the current date (YYYYMMDD)
     self.date = time.strftime('%Y%m%d',time.localtime(time.time()))
     # initial page number
     self.page = 0
     # Taobao search URL
     self.page_url = 'https://s.taobao.com/search?q={keyword}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_{date}&ie=utf8&s={page}'
     # open a requests session
     self.session = requests.session()
     # assign the CookieJar to the session
     self.session.cookies = self.read_cookies()
     # MySQL database connection
     self.connect = pymysql.connect(host='localhost',port=3306,user='******',passwd='0000',db='scrapytest')
     self.cursor = self.connect.cursor()
     chrome_options = Options()
     chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
     # chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
     chrome_drive = r'D:\soft\Chrome\chromedriver.exe'
     self.driver = webdriver.Chrome(executable_path=chrome_drive, options=chrome_options)
     self.count = 1  # at most 6 captcha recognition attempts
Code Example #12
File: app.py  Project: cypark9022/Jungo-project
def dgmk(keyword, uname):
    chrome_path = 'C:/Users/LGPC/Desktop/sparta/Jungo-project/driver/chromedriver_v81/chromedriver_win32/chromedriver'
    # chrome_path = '/usr/bin/chromedriver'
    
    # option settings for running the Chrome browser headless (non-GUI)
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=options, executable_path=chrome_path)
    driver.implicitly_wait(3)

    url = 'https://www.daangn.com/search/' + keyword
    driver.get(url)
    driver.implicitly_wait(1)

    # click the [More] button on the page in the Chrome browser -> repeat 12 times (150 items)
    for i in range(12):
        try:
            driver.find_element_by_xpath('//*[@id="result"]/div[1]/div[2]').click()
            driver.implicitly_wait(1)
            driver.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(0.5)
        # if there is no [More] button, crawl the current page
        except:
            print('Dangn-market more-button end')
            time.sleep(1)
            break

    # crawl the HTML data of the URL page
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # save the crawled data to the DB
    items_uname = 'items_' + uname
    items = soup.select('div > article.flea-market-article')

    for item in items:
        item_img = item.select_one('div.card-photo > img')['src']
        item_title = item.select_one('span.article-title').text
        item_position = item.select_one('p.article-region-name').text
        item_price = item.select_one('p.article-price').text
        item_link = item.select_one('article.flea-market-article > a')['href']

        doc = {
                'keyword': keyword,
                'img': item_img,
                'title': item_title,
                'position': item_position,
                'price': item_price,
                'link': 'daangn.com' + item_link
        }
        db[items_uname].insert_one(doc)

    # close the Chrome browser
    driver.close()

    result = db[items_uname].find({'keyword': keyword})
    if result is not None:
        print('Dangn-market crawling success!! (save to DB)')
    else:
        print('ERROR!! Dangn-market crawling Fail...')
Code Example #13
#!/usr/bin/env python3

import os
import time
from selenium.webdriver import Chrome
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

opts = Options()

browser = webdriver.Chrome(ChromeDriverManager().install())

browser.get("https://www.nhm.ac.uk/wpy/gallery?tags=")

elements = browser.find_elements_by_class_name("ImageGrid__container___YSm77")
hrefs = [element.get_attribute('href') for element in elements]

# write a file of urls
with open("urls.txt", "w") as url_file:
    for href in hrefs:
        url_file.write(href + "\n")
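
This script passes the ChromeDriverManager path straight to webdriver.Chrome and uses find_elements_by_class_name, both Selenium 3 idioms. A minimal sketch of the same scrape under Selenium 4, where the driver path goes through a Service object and lookups use By (assumes selenium>=4 and webdriver-manager installed):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
browser.get("https://www.nhm.ac.uk/wpy/gallery?tags=")

elements = browser.find_elements(By.CLASS_NAME, "ImageGrid__container___YSm77")
hrefs = [element.get_attribute("href") for element in elements]

# write a file of urls, as in the original script
with open("urls.txt", "w") as url_file:
    for href in hrefs:
        url_file.write(href + "\n")

browser.quit()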
Code Example #14
async def carbon_api(e):
    if not e.text[0].isalpha() and e.text[0] not in ("/", "#", "@", "!"):
        """ A Wrapper for carbon.now.sh """
        await e.edit("🌚🌚🌚🌚🌚")
        CARBON = 'https://carbon.now.sh/?bg=rgba(29%2C40%2C104%2C1)&t=one-light&wt=none&l=application%2Ftypescript&ds=true&dsyoff=20px&dsblur=68px&wc=true&wa=true&pv=56px&ph=56px&ln=false&fl=1&fm=Hack&fs=14px&lh=143%25&si=false&es=2x&wm=false&code={code}'
        CARBONLANG = "en"
        textx = await e.get_reply_message()
        pcode = e.text
        if pcode[8:]:
            pcode = str(pcode[8:])
        elif textx:
            pcode = str(textx.message)  # Importing message to module
        code = quote_plus(pcode)  # Converting to urlencoded
        url = CARBON.format(code=code, lang=CARBONLANG)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.binary_location = Config.GOOGLE_CHROME_BIN
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument('--disable-gpu')
        prefs = {'download.default_directory': './'}
        chrome_options.add_experimental_option('prefs', prefs)
        await e.edit("🌝🌝🌚🌚🌚")

        driver = webdriver.Chrome(executable_path=Config.CHROME_DRIVER,
                                  options=chrome_options)
        driver.get(url)
        download_path = './'
        driver.command_executor._commands["send_command"] = (
            "POST", '/session/$sessionId/chromium/send_command')
        params = {
            'cmd': 'Page.setDownloadBehavior',
            'params': {
                'behavior': 'allow',
                'downloadPath': download_path
            }
        }
        command_result = driver.execute("send_command", params)

        driver.find_element_by_xpath(
            "//button[contains(text(),'Export')]").click()
        sleep(5)  # this might take a bit.
        driver.find_element_by_xpath("//button[contains(text(),'4x')]").click()
        sleep(5)
        await e.edit("🌝🌝🌝🌚🌚")
        driver.find_element_by_xpath(
            "//button[contains(text(),'PNG')]").click()
        sleep(5)  # waiting for the download

        await e.edit("🌝🌝🌝🌝🌝")
        file = './carbon.png'
        await e.edit("✅Karbon4 Completed, Uploading Karbon✅")
        await e.client.send_file(
            e.chat_id,
            file,
            caption="Karbon4 by [@NoOne](https://t.me/kirito6969)",
            force_document=True,
            reply_to=e.message.reply_to_msg_id,
        )

        os.remove('./carbon.png')
        # Removing carbon.png after uploading
        await e.delete()  # Deleting msg
Code Example #15
def main():
    start_time = datetime.now()
    n = 0

    while True:
        sleep(5)
        print("Перезапуск браузера")

        chromeOptions = Options()
        chromeOptions.add_argument('--headless')
        chromeOptions.add_argument('--no-sandbox')
        prefs = {"profile.managed_default_content_settings.images": 2}
        chromeOptions.add_experimental_option('prefs', prefs)
        driver = webdriver.Chrome(chrome_options=chromeOptions)
        # driver = webdriver.Chrome(options=chromeOptions)

        driver.get('https://www.w3.org/People/mimasa/test/')
        while True:
            try:
                product_ = MySQL().get_product_reviews()

                if product_:

                    sku = product_['sku']
                    id_product = product_['id']
                    id_category = product_['id_category']

                    # Build the reviews URL from the product link
                    link_ = product_['link_product']
                    if link_.endswith('/'):
                        link = link_ + 'otzyvy/'
                    else:
                        link = link_ + '/otzyvy/'

                    # Fetch the page and wait for it to finish loading
                    driver.get(link)
                    tmp = ''

                    while True:
                        page_source = driver.page_source
                        if len(page_source) > len(tmp):
                            tmp = page_source
                        else:
                            break

                    reviews = get_reviews(sku=sku, page_source=page_source)
                    # print(reviews)

                    if reviews:
                        MySQL().write_reviews(reviews=reviews)
                        MySQL().set_product_reviews_ready(
                            id_product=id_product)
                    elif reviews is False:
                        MySQL().set_product_reviews_bad(id_product=id_product)
                    else:
                        MySQL().set_product_reviews_ready(
                            id_product=id_product)

                    n += 1
                    print('\rCount',
                          n,
                          'Time',
                          datetime.now() - start_time,
                          end='')
            except:
                break

        driver.close()
        driver.quit()
Code Example #16
File: views_site.py  Project: huanmp4/deprecated
def getData(request):
    url = "https://918hj.zjlbw.top/"
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    driver.get(url)
    iframe = driver.find_elements_by_tag_name('iframe')[0]  # find the first iframe ([0])
    driver.switch_to.frame(iframe)  # switch into the iframe just found
    iframe = driver.find_elements_by_tag_name('iframe')[0]  # repeat for the nested iframe
    driver.switch_to.frame(iframe)  # repeat
    soup = BeautifulSoup(driver.page_source, "html.parser")  # parse the page source
    soup_dl = soup.find_all("dl")
    legend_list = []
    howMany = 0
    for index, dl in enumerate(soup_dl):
        temp_list = []
        howMany += 1

        for a in dl.find_all("a"):
            temp_list.append(a.string)

        for b in dl.find_all("span"):
            if not b.string:  # b.string is None or empty
                for bb in b.find_all("font"):
                    if not bb.string:  # bb.string is None or empty
                        continue
                    else:
                        temp_list.append(bb.string)
            else:
                temp_list.append(b.string)
        try:
            spanIsNone = dl.span.string
        except:
            for index, c in enumerate(dl.find_all("dd")):
                if index >= 2 and index <= 5:
                    if not c.string:  # c.string is None or empty
                        for index, cc in enumerate(c.strings):
                            if index == 0:
                                temp_list.append(cc)
                            else:
                                continue
                        # c5 = c.parent.find(class_="c5")
                        #
                        # print("c.父节点", c.parent.find(class_="c5"))

                    else:
                        temp_list.append(c.string.replace("\xa0", ""))
        temp_list.append(dl.a["href"])
        del temp_list[2]  # remove the redundant entry
        legend_list.append(temp_list)

    for legend in legend_list:
        print("dl:", legend)
        # spli = legend[2].split("/")
        # month = spli[0][0]
        # day = spli[1][0:2]
        # hour = spli[2][0]
        # minute = spli[2][2:4]
        # year = time.strftime('%Y', time.localtime(time.time()))
        # dd = "%s-%s-%s %s:%s:0" % (year, month, day, hour, minute)
        # if the time field is empty, give it a placeholder time
        if len(legend) < 4 or len(legend) >= 8:
            print("警告,列表元素小于3或大于7")
            continue
        if not legend[2]:  # the time field is None or empty
            temp_month = time.strftime('%m', time.localtime(time.time()))
            temp_day = time.strftime('%d', time.localtime(time.time()))
            temp_legend = "%s月/%s日/★错误时间★" % (temp_month, temp_day)
        else:
            temp_legend = legend[2]
        spli = temp_legend.split("/")

        if spli[0] == "---精品全天固定---":
            onPage = "allDay"
            ttime = time.strftime('%Y-%m-%d 0:0:0',
                                  time.localtime(time.time()))
            dd = ttime
            legendSite.objects.create(serverName=legend[0],
                                      ip=legend[1],
                                      time=dd,
                                      type=legend[3],
                                      introduce=legend[4],
                                      QQ=legend[5],
                                      href=legend[6],
                                      onPage=onPage)
        elif spli[-1] == "★通宵推荐★":
            onPage = "allNight"
            ttime = time.strftime('%Y-%m-%d 0:0:0',
                                  time.localtime(time.time()))
            dd = ttime
            legendSite.objects.create(serverName=legend[0],
                                      ip=legend[1],
                                      time=dd,
                                      type=legend[3],
                                      introduce=legend[4],
                                      QQ=legend[5],
                                      href=legend[6],
                                      onPage=onPage)

        elif spli[-1] == "★错误时间★":
            onPage = "error"
            ttime = time.strftime('%Y-%m-%d 0:0:0',
                                  time.localtime(time.time()))
            dd = ttime
            legendSite.objects.create(serverName=legend[0],
                                      ip=legend[1],
                                      time=dd,
                                      type=legend[3],
                                      introduce=legend[4],
                                      QQ=legend[5],
                                      href=legend[6],
                                      onPage=onPage)
        else:
            try:
                onPage = "normal"
                year = time.strftime('%Y', time.localtime(time.time()))
                month = spli[0].replace("月", '')
                day = spli[1].replace("日", '')
                hour = spli[2].split("点")[0]
                minute = spli[2].split("点")[1].split("开放")[0].replace('分', '')
                minute = minute if minute != '' else 0
                dd = "%s-%s-%s %s:%s:0" % (year, month, day, hour, minute)
                legendSite.objects.create(serverName=legend[0],
                                          ip=legend[1],
                                          time=dd,
                                          type=legend[3],
                                          introduce=legend[4],
                                          QQ=legend[5],
                                          href=legend[6],
                                          onPage=onPage)

            except:
                onPage = "allNight"
                ttime = time.strftime('%Y-%m-%d 0:0:0',
                                      time.localtime(time.time()))
                dd = ttime
                legendSite.objects.create(serverName=legend[0],
                                          ip=legend[1],
                                          time=dd,
                                          type=legend[3],
                                          introduce=legend[4],
                                          QQ=legend[5],
                                          onPage=onPage)
Code Example #17
 def __init__(self):
     chrome_options = Options()
     chrome_options.add_argument('--lang=pt-BR')
     self.driver = webdriver.Chrome(
         executable_path=r'./chromedriver.exe', options=chrome_options)
Code Example #18
import time
from selenium.webdriver.chrome.options import Options
import os, sys

#deciding folder name
if len(sys.argv) > 1:
    default_folder_name = sys.argv[1]
else:
    default_folder_name = 'Song'
download_path = "C:/Users/gaurav.khatri/Downloads/"
final_path = os.path.join(download_path, default_folder_name)
if not os.path.exists(final_path):
    os.mkdir(final_path)

# Global Chrome options: disable infobars and extensions, suppress notification popups
option = Options()
option.add_argument("--disable-infobars")
option.add_argument("start-maximized")
option.add_argument("--disable-extensions")

option.add_experimental_option(
    "prefs", {
        "profile.default_content_setting_values.notifications": 2,
        "download": {
            'default_directory': final_path
        }
    })


def check_download_completion(wait=False):
    download_path = "C:/Users/gaurav.khatri/Downloads"
Code Example #19
File: linkedin.py  Project: toxicmender/Applicant
 def __init__(self, path=None, headless=True):
     self.browser_options = Options()
     self.browser_options.add_argument('--disable-extensions')
     if headless:
         self.browser_options.add_argument('--headless')
     self.browser = webdriver.Chrome(path, options=self.browser_options)
Code Example #20
 def __init__(self):
     options = Options()
     options.set_headless(True)
     self.driver = webdriver.Chrome(chrome_options=options)
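Options.set_headless and the chrome_options= keyword were deprecated and later removed. A minimal sketch of the current form of this constructor, assuming a recent Selenium release:

 def __init__(self):
     options = Options()
     options.add_argument('--headless')               # replaces options.set_headless(True)
     self.driver = webdriver.Chrome(options=options)  # 'options=' replaces 'chrome_options='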
Code Example #21
 def setup(self):
     options = Options()
     options.debugger_address = "127.0.0.1:9222"
     self.driver = webdriver.Chrome(options=options)
Code Example #22
File: getRES.py  Project: jainil27/DDU-Result-Checker
def check_res():
    data = openpyxl.load_workbook('./it_data1.xlsx', 'r')
    sheet = data.active
    print("\n\tDDU IT Student Result Checker")
    print("Enter the name of student : ", end=" ")
    name = input()
    name = name.upper()
    name = name.strip()

    flag = 0
    for rowNum in range(2, 130):
        DataName = sheet.cell(row=rowNum, column=2).value
        #print(produceName)
        if (DataName.find(name) != -1):
            flag = 1
            Id = sheet.cell(row=rowNum, column=1).value
            Id = Id.strip()
            Dob = sheet.cell(row=rowNum, column=3).value
            if (isinstance(Dob, datetime.datetime)):
                Dob = Dob.strftime('%m/%d/%Y')
            Dob = Dob.strip()
            d = Dob.split('/')
            if (len(d[0]) == 1):
                d[0] = '0' + d[0]
            if (len(d[1]) == 1):
                d[1] = '0' + d[1]
            if (len(d[2]) == 2):
                d[2] = '19' + d[2]
            Dob = '/'.join(d)
            print("Searching Result...")
            print()
            print("Name : " + DataName)
            print("Id   : " + Id)
            print("Dob  : " + Dob)
            break
    if (flag == 0):
        print("No such student exists...")
    if (flag):
        sleep(2)
        chrome_options = Options()
        chrome_options.add_argument("disable-infobars")
        driver = webdriver.Chrome(
            executable_path=
            "C:/Users/Admin/PycharmProjects/Results/chromedriver.exe",
            chrome_options=chrome_options)
        driver.get("https://egov.ddit.ac.in/index.php?r=site/login")
        uid = driver.find_element_by_id("LoginForm_username")
        pswd = driver.find_element_by_id("LoginForm_password")
        uid.send_keys(Id)
        pswd.send_keys(Dob)
        #print(driver.current_url)
        while (driver.current_url !=
               "https://egov.ddit.ac.in/index.php?r=studentInformation/studentInfo"
               ):
            sleep(2)
        driver.get(
            "https://egov.ddit.ac.in/index.php?r=tblstudentmst/academicHistory"
        )
        sleep(1)
        driver.find_element_by_id('yt10').click()
        print()
Code Example #23
def QRLogin(Dictionary):
    options = Options()

    # run the browser headless (currently commented out)
    # options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # create the webdriver with the ChromeDriver path and options
    # this path is for keigo's windows PC
    # driver = webdriver.Chrome(
    #    'C:\\Users\\keigo\\chromedriver', options=options)

    # this path is passed by installing selenium and chromedriver_binary
    # detail will be in this URl
    # https://qiita.com/memakura/items/20a02161fa7e18d8a693
    # take care of the version of chromedriver_binary, should be same or near to your chrome
    driver = webdriver.Chrome(options=options)

    driver.get(
        'https://portal.nap.gsic.titech.ac.jp/GetAccess/Login?Template=userpass_key&AUTHMETHOD=UserPassword'
    )

    # time.sleep(1)

    # your username / email address
    #username = '******'
    # your password
    #password = '******'

    # your username / email address
    gakuseki = Dictionary['gakuseki']
    # your password
    password = Dictionary['PW']

    # find the username input box; press F12 to inspect the HTML and put its XPath here
    username_box = driver.find_element_by_xpath(
        "/html/body/center[3]/form/table/tbody/tr/td/table/tbody/tr[2]/td/div/div/input"
    )
    # find the password input box
    password_box = driver.find_element_by_xpath(
        "/html/body/center[3]/form/table/tbody/tr/td/table/tbody/tr[3]/td/div/div/input"
    )

    # type in the username and password
    username_box.send_keys(gakuseki)
    password_box.send_keys(password)

    # find the login button
    login_button = driver.find_element_by_xpath(
        "/html/body/center[3]/form/table/tbody/tr/td/table/tbody/tr[5]/td/input[1]"
    )
    # click the login button
    login_button.click()

    # time.sleep(1)

    m1 = youso(driver, "//*[@id=\"authentication\"]/tbody/tr[4]/th[1]")
    m2 = youso(driver, "//*[@id=\"authentication\"]/tbody/tr[5]/th[1]")
    m3 = youso(driver, "//*[@id=\"authentication\"]/tbody/tr[6]/th[1]")
    matrix1_box = driver.find_element_by_xpath(
        "//*[@id=\"authentication\"]/tbody/tr[4]/td/div/div/input")
    matrix1_box.send_keys(Dictionary[m1])
    matrix2_box = driver.find_element_by_xpath(
        "//*[@id=\"authentication\"]/tbody/tr[5]/td/div/div/input")
    matrix2_box.send_keys(Dictionary[m2])
    matrix3_box = driver.find_element_by_xpath(
        "//*[@id=\"authentication\"]/tbody/tr[6]/td/div/div/input")
    matrix3_box.send_keys(Dictionary[m3])

    # find the OK (submit) button
    OK = driver.find_element_by_xpath(
        "//*[@id=\"authentication\"]/tbody/tr[8]/td/input[1]")
    # click the OK button
    OK.click()

    time.sleep(1)

    # for taking a screenshot
    # driver.save_screenshot('screenshot.png')

    # quit the browser
    # driver.quit()

    return "Worked it"
Code Example #24
def spr_scrape_postpaid_tablet_prices():
    # go to website
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_driver = os.getcwd() + "\\chromedriver.exe"
    driver = webdriver.Chrome(chrome_options=chrome_options,
                              executable_path=chrome_driver)
    driver.get(
        'https://www.sprint.com/en/shop/tablets.html?INTNAV=TopNav:Shop:Tablets&credit=A2&sort=FEATURED'
    )
    time.sleep(5)

    # get soup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    driver.close()

    # make scraper object
    scraped_postpaid_price = ScrapedPostpaidPrice()

    # set hardcoded variables
    scraped_postpaid_price.provider = 'sprint'
    scraped_postpaid_price.date = datetime.date.today()
    scraped_postpaid_price.time = datetime.datetime.now().time()

    # iterate through devices on landing page
    for device_tile in soup.findAll(
            'li',
            class_='col-xs-24 col-sm-12 col-lg-8 text-center device-tile'):

        # get device name text
        device_name = device_tile.find("h3", {
            "class":
            "font-size-18 line-height-24 font-normal my-0 align-left"
        }).text.strip().lower()

        # eliminate out of scope devices
        if device_name.find("linelink") != -1 or device_name.find("pre-owned") != -1 or device_name.find("flip") != -1 \
                or device_name.find("sim") != -1 or device_name.find("duraxtp") != -1 or device_name.find("duratr") != -1 \
                or device_name.find("xp strike") != -1 or device_name.find("certified") != -1:
            continue

        # device name
        scraped_postpaid_price.device = device_parser(device_name)

        # url
        scraped_postpaid_price.url = "https://www.sprint.com" + device_tile.find(
            "a")["href"]

        # promo text for device landing page & add to database
        try:
            promo_text = device_tile.find("span", {
                "class": "color--purple font-size-14"
            }).text.strip()
        except AttributeError:
            promo_text = ''
        add_scraped_promotions_to_database(scraped_postpaid_price.provider,
                                           scraped_postpaid_price.device, '0',
                                           'device landing page', promo_text,
                                           scraped_postpaid_price.url,
                                           scraped_postpaid_price.date,
                                           scraped_postpaid_price.time)

        # go to url
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_driver = os.getcwd() + "\\chromedriver.exe"
        driver = webdriver.Chrome(chrome_options=chrome_options,
                                  executable_path=chrome_driver)
        driver.implicitly_wait(5)
        driver.get(scraped_postpaid_price.url)
        time.sleep(5)
        html = driver.page_source
        device_soup = BeautifulSoup(html, "html.parser")

        # if 404 error, stop program
        site_title = device_soup.find_all("title")
        if '404' in str(site_title):
            print('404 Error: ' + scraped_postpaid_price.device)
            continue

        # click on drop down menu and record device sizes
        size_selector = driver.find_element_by_id('sprint_storage_selector')
        size_selector.click()
        time.sleep(2)
        sizes = size_selector.text.strip().replace(' GB', '')
        sizes = sizes.split('\n')

        # iterate through sizes
        for size in sizes:

            # click on size and reload page
            select = Select(
                driver.find_element_by_id('sprint_storage_selector'))
            select.select_by_value(size)
            time.sleep(2)
            html = driver.page_source
            device_soup = BeautifulSoup(html, "html.parser")

            # record device size
            scraped_postpaid_price.storage = size

            # initialize price variables
            scraped_postpaid_price.monthly_price = '0.00'
            scraped_postpaid_price.retail_price = '0.00'
            scraped_postpaid_price.onetime_price = '0.00'

            # get prices
            for label in device_soup.findAll('label',
                                             class_='soar-selection__label'):
                if label.find('strong'
                              ).text == ' Buy it with 24 monthly installments':
                    monthly = label.findAll('span', class_='display-block')
                    scraped_postpaid_price.monthly_price = price_parser(
                        monthly[0].text.strip())
                    scraped_postpaid_price.onetime_price = price_parser(
                        monthly[1].text.strip())
                if label.find('strong').text == ' Full price':
                    retail = label.findAll('span', class_='display-block')
                    scraped_postpaid_price.retail_price = price_parser(
                        retail[1].text.strip())

            # add to database
            remove_postpaid_duplicate(scraped_postpaid_price.provider,
                                      scraped_postpaid_price.device,
                                      scraped_postpaid_price.storage,
                                      scraped_postpaid_price.date)
            add_postpaid_to_database(scraped_postpaid_price.provider,
                                     scraped_postpaid_price.device,
                                     scraped_postpaid_price.storage,
                                     scraped_postpaid_price.monthly_price,
                                     scraped_postpaid_price.onetime_price,
                                     scraped_postpaid_price.retail_price,
                                     scraped_postpaid_price.contract_ufc,
                                     scraped_postpaid_price.url,
                                     scraped_postpaid_price.date,
                                     scraped_postpaid_price.time)
            spr_scrape_postpaid_promotions(device_soup,
                                           scraped_postpaid_price.url,
                                           scraped_postpaid_price.device,
                                           scraped_postpaid_price.storage)

    driver.quit()
Code Example #25
File: script2.py  Project: sum008/python-codes
def show_reviews(data, index):

    index -= 1
    d = data[index]
    #     print(d)
    link = d[4]

    link = "https://www.flipkart.com/" + link
    #     print(link)
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=chrome_options,
                              executable_path=r"D:\chromedriver.exe")
    driver.get(link)

    review_rating = int(input("Review rating "))
    #     swINJg _3nrCtb
    e = []
    count = 0
    time.sleep(1)
    while len(e) <= 0 and count <= 20:
        count += 1
        try:
            e = driver.find_elements_by_xpath("//div[@class='swINJg _3nrCtb']")
        except:
            continue

    if len(e) > 0:
        e[0].click()

        time.sleep(1)
        read_more = []
        count = 0
        while len(read_more) <= 0 and count <= 20:  #and count<=20:
            count += 1
            try:
                read_more = driver.find_elements_by_xpath(
                    "//span[@class='_1EPkIx']")
            except:
                continue

        if len(read_more) > 0:
            for r in read_more:
                r.click()
            try:
                read_more[len(read_more) - 1].click()
            except:
                pass
        print(len(read_more))
        time.sleep(1)
        #         link=driver.current_url
        #         source=requests.get(link).text
        #         soup=BeautifulSoup(source,"html5lib")
        #         //div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12'][2]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='hGSR34 E_uFuv']
        count = len(
            driver.find_elements_by_xpath(
                "//div[@class='col _390CkK _1gY8H-']"))
        for i in range(2, count + 2):
            rating = driver.find_element_by_xpath(
                "//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12']["
                + str(i) +
                "]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='hGSR34 E_uFuv']"
            ).text
            print(rating)
            if int(rating) >= review_rating:
                review = driver.find_element_by_xpath(
                    "//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12']["
                    + str(i) +
                    "]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='qwjRop']//div[@class]"
                ).text
                title = driver.find_element_by_xpath(
                    "//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12']["
                    + str(i) +
                    "]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/p[@class='_2xg6Ul']"
                ).text
                print("Rating : " + str(rating))
                print(title)
                print()
                print(review)
                print()
    else:
        count = 0
        while len(e) <= 0 and count <= 20:
            count += 1
            try:
                e = driver.find_elements_by_xpath(
                    "//div[@class='swINJg _3cycCZ']")
            except:
                continue

        e[0].click()

        time.sleep(1)
        read_more = []
        count = 0
        while len(read_more) <= 0 and count <= 20:  #and count<=20:
            count += 1
            try:
                read_more = driver.find_elements_by_xpath(
                    "//span[@class='_2jRR3v']")
            except:
                continue

        if len(read_more) > 0:
            for r in read_more:
                r.click()
            try:
                read_more[len(read_more) - 1].click()
            except:
                pass
        print(len(read_more))
        time.sleep(1)

        count = len(
            driver.find_elements_by_xpath(
                "//div[@class='col _390CkK _1gY8H- _2675cp']"))
        for i in range(2, count + 2):
            rating = driver.find_element_by_xpath(
                "//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12']["
                + str(i) +
                "]/div[@class='_1PBCrt _26FBOm']/div[@class='col']/div[@class='col _390CkK _1gY8H- _2675cp']/div[@class='row']/div[@class='qwjRop _2675cp']/div"
            )
            print(rating)
#             if int(rating)>=review_rating:
#                 review=driver.find_element_by_xpath("//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12']["+str(i)+"]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='qwjRop']//div[@class]").text
#                 title=driver.find_element_by_xpath("//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12']["+str(i)+"]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/p[@class='_2xg6Ul']").text
#                 print("Rating : "+str(rating))
#                 print(title)
#                 print()
#                 print(review)
#                 print()

#         path_rating_review="//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12'][10]/div[@class='_1PBCrt _26FBOm']/div[@class='col']/div[@class='col _390CkK _1gY8H- _2675cp']/div[@class='row']/div[@class='qwjRop _2675cp']/div"

#         lis=soup.select('div[class="col _390CkK _1gY8H-"]')
#         lis2=[len(lis)]
#         rating_data=[]
#         count=0
#         for i in lis:
#             if '"hGSR34 E_uFuv"' in str(i):
#                 rate=str(i.select('div[class="hGSR34 E_uFuv"]'))
#                 rate=rate[0:rate.index("img")-1]
#                 rate=int(format_data(rate))
#                 if rate>=review_rating:
#                     print("Rating "+ str(rate))
#                     l=[]
#                     l.append(rate)
#                     if '"_2xg6Ul"' in str(i):
#                         title=str(i.select('p[class="_2xg6Ul"]'))
#                         title=title[0:len(title)-5]
#                         title=format_data(title)
#                         print(title)
#                     else:
#                         print("No title")
#
#                     if '"qwjRop"' in str(i):
#                         review=driver.find_element_by_xpath("//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12']["+str(count+2)+"]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='qwjRop']//div[@class]").text
# #                         review=str(i.select('div[class=""]'))
#                         print(review)
#                         print()
#                     else:
#                         print("No Review")
#                 count+=1
#                     lis2.append(i)
    while True:
        pass
Code Example #26
def autoupdate_chromedriver():

    driverName = "/chromedriver.exe"

    # defining base file directory of chrome drivers
    driver_loc = "C:/Users/fitim/IdeaProjects/PythonProject/"
    #driver_loc = "C:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python37\\Scripts\\" #-- ENTER the file path of your exe
    # -- I created a separate folder to house the versions of chromedriver, previous versions will be deleted after downloading the newest version.
    # ie. version 75 will be deleted after 77 has been downloaded.

    # defining the file path of your exe file automatically updating based on your browsers current version of chrome.
    #currentPath = driver_loc + chrome_browser_version + driverName
    currentPath = driver_loc + driverName
    # check file directories to see if chrome drivers exist in nextVersion

    import os.path

    # check if new version of drive exists --> only continue if it doesn't
    Newpath = driver_loc + nextVersion
    match = False
    driver = webdriver.Chrome()
    str1 = driver.capabilities['browserVersion']
    str2 = driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0]
    print(str1)
    print(str2)
    print(str1[0:2])
    print(str2[0:2])
    if str1[0:2] != str2[0:2]:
        print("please download correct chromedriver version")
        match = True
    else:
        print("chrome and driver match")

    driver.quit()

    # check if we have already downloaded the newest version of the browser, ie if we have version 76, and have already downloaded a version of 77, we don't need to run any more of the script.
    newfileloc = Newpath + driverName
    exists = os.path.exists(newfileloc)

    if (exists == False and match == True):

        #open chrome driver and attempt to download new chrome driver exe file.
        chrome_options = Options()
        executable_path = currentPath
        driver = webdriver.Chrome(executable_path=executable_path,
                                  options=chrome_options)

        # opening up url of chromedriver to get new version of chromedriver.
        chromeDriverURL = 'https://chromedriver.storage.googleapis.com/index.html?path=' + nextVersion

        driver.get(chromeDriverURL)

        time.sleep(5)
        # find records of table rows
        table = driver.find_elements_by_css_selector('tr')

        # check the length of the table
        Table_len = len(table)

        # ensure that the table length is greater than 4, else fail -- a table length of 4 is the default when there are no available updates
        if (Table_len > 4):

            # define string value of link
            rowText = table[(len(table) - 2)].text[:6]
            time.sleep(1)
            # select the value of the row
            driver.find_element_by_xpath('//*[contains(text(),' + '"' +
                                         str(rowText) + '"' + ')]').click()
            time.sleep(1)
            #select chromedriver zip for windows
            driver.find_element_by_xpath('//*[contains(text(),' + '"' +
                                         "win32" + '"' + ')]').click()

            time.sleep(3)
            driver.quit()

            from zipfile import ZipFile
            import shutil

            fileName = r"C:\Users\fitim\Downloads\chromedriver_win32.zip"  #--> enter your download path here.

            # Create a ZipFile Object and load sample.zip in it
            with ZipFile(fileName, 'r') as zipObj:
                # Extract all the contents of zip file in different directory
                zipObj.extractall(Newpath)

            # delete downloaded file
            os.remove(fileName)

            # defining old chrome driver location
            oldPath = driver_loc + lastVersion
            oldpathexists = os.path.exists(oldPath)

            # this deletes the old folder with the older version of chromedriver in it (version 75, once 77 has been downloaded)
            if (oldpathexists == True):
                shutil.rmtree(oldPath, ignore_errors=True)

    if not needs_update:
        return "no update needed"
    else:
        return "chromedriver updated to version " + str(nextVersion)
コード例 #27
0
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 16 10:40:22 2020

@author: ADHIRAJ MAJUMDAR
"""
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import re
import csv
import pandas as pd
import time

options = Options()
chromedriver = "chromedriver.exe"
browser = webdriver.Chrome(chromedriver, options=options)  # in Selenium 3 the first positional argument is the chromedriver path
data = pd.read_excel('Input/input_company.xlsx')
with open('Output/IEC_Details_file.csv', 'w', newline='') as outcsv:
    writer = csv.writer(outcsv)
    writer.writerow([
        "IEC", "IEC_Allotment_Date", "File_Number", "Party_Name_and_Address",
        "Phone_No", "e_mail", "Exporter_Type", "Date_of_Establishment",
        "PAN_ISSUE_DATE", "BIN", "PAN_ISSUED_BY", "Nature_Of_Concern", "Bank",
        "Dirct1", "Dirct2"
    ])
for index, row in data.iterrows():
    browser.get("http://dgft.delhi.nic.in:8100/dgft/IecPrint")
    IEC = browser.find_element_by_xpath('/html/body/form/input[1]')
    if len(str(row["IEC"])) <= 9:
        IEC.send_keys('0' + str(row["IEC"]))
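
Note that the branch above only prepends a single zero. A hypothetical alternative (a sketch, not part of the original script) pads any shorter code to the full width with zfill before typing it, assuming, as the length check implies, that a complete IEC is 10 characters long.

# Hypothetical helper (not in the original): left-pad an IEC code to 10 characters.
def normalized_iec(raw) -> str:
    return str(raw).zfill(10)

# usage inside the loop above, with the same IEC input element:
# IEC.send_keys(normalized_iec(row["IEC"]))
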
コード例 #28
0
def trackMultipleObjects(video):
	rectangleColor = (0, 255, 0)
	frameCounter = 0
	currentCarID = 0
	fps = 0
	
	carTracker = {}
	carNumbers = {}
	carLocation1 = {}
	carLocation2 = {}
	speed = [None] * 1000
	u=[]
	z=[]
	# Write output to video file
	#out = cv2.VideoWriter('outpy.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 10, (WIDTH,HEIGHT))


	while True:
		start_time = time.time()
		rc, image = video.read()
		if image is None:
			break
		
		image = cv2.resize(image, (WIDTH, HEIGHT))
		resultImage = image.copy()
		Y_THRESH = 290
		cv2.line(resultImage,(0, Y_THRESH),(1280, Y_THRESH),(255,0,0),2)
		frameCounter = frameCounter + 1
		
		carIDtoDelete = []
		
		for carID in carTracker.keys():
			trackingQuality = carTracker[carID].update(image)
			
			if trackingQuality < 7:
				carIDtoDelete.append(carID)
				
		for carID in carIDtoDelete:
			print ('Removing carID ' + str(carID) + ' from list of trackers.')
			print ('Removing carID ' + str(carID) + ' previous location.')
			print ('Removing carID ' + str(carID) + ' current location.')
			carTracker.pop(carID, None)
			carLocation1.pop(carID, None)
			carLocation2.pop(carID, None)
		
		if not (frameCounter % 10):
			gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
			cars = carCascade.detectMultiScale(gray, 1.1, 13, 18, (24, 24))  # scaleFactor, minNeighbors, flags, minSize
			
			for (_x, _y, _w, _h) in cars:
				x = int(_x)
				y = int(_y)
				w = int(_w)
				h = int(_h)
			
				x_bar = x + 0.5 * w
				y_bar = y + 0.5 * h
				
				matchCarID = None
			
				for carID in carTracker.keys():
					trackedPosition = carTracker[carID].get_position()
					
					t_x = int(trackedPosition.left())
					t_y = int(trackedPosition.top())
					t_w = int(trackedPosition.width())
					t_h = int(trackedPosition.height())
					
					t_x_bar = t_x + 0.5 * t_w
					t_y_bar = t_y + 0.5 * t_h
				
					if ((t_x <= x_bar <= (t_x + t_w)) and (t_y <= y_bar <= (t_y + t_h)) and (x <= t_x_bar <= (x + w)) and (y <= t_y_bar <= (y + h))):
						matchCarID = carID
				
				if matchCarID is None:
					print ('Creating new tracker ' + str(currentCarID))
					
					tracker = dlib.correlation_tracker()
					tracker.start_track(image, dlib.rectangle(x, y, x + w, y + h))
					
					carTracker[currentCarID] = tracker
					carLocation1[currentCarID] = [x, y, w, h]

					currentCarID = currentCarID + 1
		
		#cv2.line(resultImage,(0,480),(1280,480),(255,0,0),5)


		for carID in carTracker.keys():
			trackedPosition = carTracker[carID].get_position()
					
			t_x = int(trackedPosition.left())
			t_y = int(trackedPosition.top())
			t_w = int(trackedPosition.width())
			t_h = int(trackedPosition.height())
			
			cv2.rectangle(resultImage, (t_x, t_y), (t_x + t_w, t_y + t_h), rectangleColor, 4)
			
			# speed estimation
			carLocation2[carID] = [t_x, t_y, t_w, t_h]
		
		end_time = time.time()
		
		if not (end_time == start_time):
			fps = 1.0/(end_time - start_time)
		
		#cv2.putText(resultImage, 'FPS: ' + str(int(fps)), (620, 30),cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)


		for i in carLocation1.keys():	
			if frameCounter % 1 == 0:  # % 1 is always 0; raise the modulus to sample speeds every N frames
				[x1, y1, w1, h1] = carLocation1[i]
				[x2, y2, w2, h2] = carLocation2[i]
		
				# print 'previous location: ' + str(carLocation1[i]) + ', current location: ' + str(carLocation2[i])
				carLocation1[i] = [x2, y2, w2, h2]
		
				# print 'new previous location: ' + str(carLocation1[i])
				if [x1, y1, w1, h1] != [x2, y2, w2, h2]:
					if (speed[i] is None or speed[i] == 0) and y1 >= 275 and y1 <= 285:
						speed[i] = estimateSpeed([x1, y1, w1, h1], [x2, y2, w2, h2])
						if int(speed[i])>65:
							winsound.PlaySound('speed_car_sound.wav', winsound.SND_FILENAME)
							u.append(speed[i])
							cv2.imwrite('speeding_%s.png' % i, resultImage)
							options = Options()
							options.add_argument("--use-fake-ui-for-media-stream")
							timeout = 20
							driver = webdriver.Chrome(executable_path = 'D:/project/computervision/garbage detection/pyPushBullet-master/chromedriver_win32/chromedriver.exe', chrome_options=options)
							driver.get("https://mycurrentlocation.net/")
							wait = WebDriverWait(driver, timeout)
							longitude = driver.find_elements_by_xpath('//*[@id="longitude"]')
							longitude = [x.text for x in longitude]
							longitude = str(longitude[0])
							latitude = driver.find_elements_by_xpath('//*[@id="latitude"]')
							latitude = [x.text for x in latitude]
							latitude = str(latitude[0])
							driver.quit()
							m=(latitude,longitude)
							num=speed[i]
							database(num)
							z.append(m)
					#if y1 > 275 and y1 < 285:
					if speed[i] is not None and y1 >= 180:
						cv2.putText(resultImage, str(int(speed[i])) + " km/hr", (int(x1 + w1/2), int(y1-5)),cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 255, 255), 2)
						#winsound.PlaySound('speed_car_sound.wav', winsound.SND_FILENAME) 
					#print ('CarID ' + str(i) + ': speed is ' + str("%.2f" % round(speed[i], 0)) + ' km/h.\n')

					#else:
					#	cv2.putText(resultImage, "Far Object", (int(x1 + w1/2), int(y1)),cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

						#print ('CarID ' + str(i) + ' Location1: ' + str(carLocation1[i]) + ' Location2: ' + str(carLocation2[i]) + ' speed is ' + str("%.2f" % round(speed[i], 0)) + ' km/h.\n')
		cv2.imshow('result', resultImage)
		# Write the frame into the file 'output.avi'
		#out.write(resultImage)


		key = cv2.waitKey(1) & 0xFF

	# if the `q` key was pressed, break from the loop
		if key == ord("q"):
			break
	
	cv2.destroyAllWindows()
	return u,z
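
estimateSpeed, carCascade, database, WIDTH and HEIGHT are referenced above but not defined in this snippet. A plausible sketch of the speed estimate, assuming a fixed pixels-per-metre calibration (the ppm and fps defaults below are hypothetical constants, not taken from the original), could look like this:

import math

# Sketch of the estimateSpeed helper referenced above; ppm (pixels per metre)
# and fps are assumed calibration values.
def estimateSpeed(location1, location2, ppm=8.8, fps=18):
    x1, y1, w1, h1 = location1
    x2, y2, w2, h2 = location2
    d_pixels = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)  # bounding-box displacement in pixels
    d_meters = d_pixels / ppm                               # pixels -> metres
    return d_meters * fps * 3.6                             # metres per frame -> km/h
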
コード例 #29
0
def scrape_yieldwatch(my_address: Optional[str] = None,
                      headless=True,
                      timeout: int = 30):
    config = read_config()
    if my_address is None:
        my_address = config["bsc"]["address"]
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    with webdriver.Chrome(options=chrome_options) as driver:
        WebDriverWait(driver, timeout)  # note: this bare wait object is unused; the explicit wait further down does the real waiting
        driver.get("https://www.yieldwatch.net/")
        for letter in my_address:
            address_bar = driver.find_element_by_id("addressInputField")
            address_bar.send_keys(letter)

        icon_bar = driver.find_element_by_class_name(
            "centered.bottom.aligned.row")
        buttons = icon_bar.find_elements_by_class_name("center.aligned.column")
        for button in buttons:
            grayscale = button.find_element_by_class_name(
                "ui.centered.image").value_of_css_property("filter")
            if grayscale == "grayscale(1)":
                button.click()

        button = driver.find_element_by_class_name("binoculars")
        button.click()

        # Wait until the next page is loaded
        element_present = presence_of_element_located(
            (By.CLASS_NAME, "content.active"))
        WebDriverWait(driver, timeout).until(element_present)

        infos = defaultdict(dict)
        segments = driver.find_elements_by_class_name("ui.segment")
        for segment in segments:
            # Many elements have the "ui segment" class, only pick the ones with
            # the "accordion ui" style.
            for defi in segment.find_elements_by_class_name("accordion.ui"):
                boxes = defi.find_elements_by_class_name("ui.equal.width.grid")
                if not boxes:
                    continue
                which = defi.text.split("\n")[0]
                for box in boxes:
                    header, content = box.find_elements_by_class_name("row")
                    header_text = header.text.split("\n")
                    box_name = header_text[0]
                    dollar_value = header_text[1]
                    assert "$" in dollar_value
                    dollar_value = float(
                        dollar_value.replace(",", "").replace("$", ""))
                    # Get the columns in the box, only the first two are relevant
                    columns = content.find_elements_by_class_name(
                        "collapsing.right.aligned")
                    names = columns[0].text.split("\n")
                    amounts = columns[1].text.split("\n")
                    d = defaultdict(list)
                    for i, amount in enumerate(amounts):
                        amount, coin = amount.split(" ", 1)
                        name = names[min(i, len(names) - 1)]
                        amount = (float(amount[:-1]) *
                                  1000 if "k" in amount else float(amount))
                        d[name].append((amount, coin))
                    d = dict(d)
                    d["dollar_value"] = dollar_value
                    infos[which][box_name] = dict(d)
    return dict(infos)
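
A short usage sketch (the address below is a placeholder, not a real wallet; read_config() at the top of the function is still called, so its config file must be readable, and JSON pretty-printing is just one way to inspect the nested result):

import json

if __name__ == "__main__":
    balances = scrape_yieldwatch(
        my_address="0x0000000000000000000000000000000000000000", headless=True)
    print(json.dumps(balances, indent=2, default=str))
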
コード例 #30
0
File: demo34.py  Project: zhangbo2008/pachong2
class DmozSpider4(scrapy.Spider):  # 继承Spider类

    print("进入%s了!!!!!!!!!" % num)
    import os
    if os.path.exists('output'):
        shutil.rmtree('output')
    yuming = '中国青年'  # source site name ("China Youth")
    lang = '英语'  # target language ("English")
    '''
    All hyperparameters are changed here; only the two attributes below matter.
    name: any unique string; it is the name used when launching the spider from main.
    html: the site to crawl.
    '''
    name = "dmoz%s" % num  # unique identifier of the spider; must not repeat; used when starting the crawl
    print("name", name)
    # html='http://www.171english.cn/news/'
    # html='http://www.171english.cn/news/2018'
    # html='http://www.171english.cn/news/2019'
    html = 'http://www.kantsuu.com/cnjp/List_1408.shtml'
    # html=' http://roll.edu.sina.com.cn/english/syxw/ss4/index_5.shtml'
    html = html.strip()

    from bs4 import BeautifulSoup
    # the start page goes here

    baseUrl = html

    import requests
    # a=requests.get(html).content

    # bs = BeautifulSoup(a, "html.parser")  # 缩进格式
    # print(bs)
    # 下面冲bs中找到所有爬取的页.
    # print(bs.find_all("a"))  # 获取所有的a标签,也就是超链接
    from selenium import webdriver
    import sys

    # browser = webdriver.Firefox()  # Get local session of firefox
    # aaa=browser.get("http://news.sina.com.cn/c/2013-07-11/175827642839.shtml ")  # Load page
    # print(aaa)
    saveall = [html]
    print(777777777777777777777777777777, baseUrl)
    if 0:  # for debugging only; normally not run this way. Only pages rendered by dynamic JS need this approach; it is too slow for crawling, but it avoids unnecessary JS bugs.
        while 1:
            tmpurl = saveall[-1]
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options

            chrome_options = Options()
            chrome_options.add_argument("--headless")
            from .utilsme import driver

            base_url = tmpurl
            driver.get(base_url)  # note: the rendered result is held inside the driver itself
            # print(driver.page_source)
            a = driver.page_source

            bs = BeautifulSoup(a, "html.parser")  # 缩进格式
            # print(bs)
            # 下面冲bs中找到所有爬取的页.
            # print(bs.find_all("a"))
            import re
            # tmp=bs.find_all(text=re.compile("Next[ ]*"))
            # print(tmp)
            now = None

            for s in bs('a'):
                # print(s.text,444444444444444444444444444444444444444444444444)
                if s.text == "»":
                    now = s.extract()
                    # 需要对now进行中文转码
                    # now=parse.quote(now.get('href'))
                    print("loook", now)
                    # 注意这种旧网站的编码方式.
                    now = parse.quote(now.get('href'),
                                      safe=";/?:@&=+$, ",
                                      encoding="gbk")
                    now = 'https://ru.hujiang.com/' + now
                    print(now, "now网页是!!!!!!!!!!")
            if now == None or now in saveall:  #防止循环
                break
            else:
                saveall.append(now)
        print(saveall, '最后获取的所有index页')

#------------- recommended way to get all index pages

# Below: get all index pages by plain string matching. For typical sites this is all you need.
    import urllib.request  # import urllib.request

    if 0:  # for debugging
        while 1:

            tmpurl = saveall[-1]

            import urllib
            from bs4 import BeautifulSoup

            url = tmpurl
            print(url, 8989898998)
            print(repr(url),
                  9999999999999999999999999999999999999999999999999999)

            a = urllib.request.urlopen(url)  # open the given URL
            page = a.read()  # read the raw page source
            try:
                page = page.decode('gbk')  # the page uses one of two encodings: gbk or utf-8
            except:
                page = page.decode('utf-8')  # fall back to utf-8

            print(type(page), 'yuio')
            # page = requests.get(url)         # kept returning 404 no matter what.
            # page = requests.get('http://www.i21st.cn/story/index_1.html')         # kept returning 404 no matter what.
            # page.encoding = 'utf-8'
            # soup = BeautifulSoup(page,"html.parser")
            print(page, 3434343434343)
            bs = BeautifulSoup(page, "html.parser")  # 缩进格式
            print(bs, 999999999999999999999999999999999999)
            # print(bs)
            # 下面冲bs中找到所有爬取的页.
            # print(bs.find_all("a"))
            import re
            # tmp=bs.find_all(text=re.compile("Next[ ]*"))
            # print(tmp)
            now = None

            print(url, bs('a'), 'uiop')
            for s in bs('a'):
                print(s.text, 'yyyyyyyyyy')
                if s.text == "下一页":
                    now = s.extract()
                    print(now, 12345)
                    # now needs Chinese URL-encoding
                    # now=str(now)
                    print(now, 888888888888888888888888)
                    # now=parse.quote(re.findall(r'href=".*"',now)[0])

                    print("loook", now)
                    # note the encoding scheme used by this old site.
                    now = parse.quote(
                        now.get('href'), safe=";/?:@&=+$%, ", encoding="gbk"
                    )  # for Chinese, simply adding % to the safe characters is enough!
                    print(89898934392423423, now)

                    if now[0] == '.':
                        now = now[2:]
                    now = now

                    # now=r'https://' + 'jp.hjenglish.com'+now
                    print(now, "now网页是!!!!!!!!!!")
            if now is None:
                break
            else:
                # print(now,556565656565)
                saveall.append(now)
        print("我们通过普通index算法得到所有的index页信息是", saveall)

    # Edit directly here: paste the saveall list collected above into the list below to get the full crawl.

    saveall = [

        #'http://www.171english.cn/news/2018/june/',
        html,
    ]
    start_urls = saveall  # links to start crawling from; the attribute must be named start_urls.

    def parse(self, response):  # first-level crawl
        print("entered the first-level crawler")
        # xpath tutorial: https://blog.csdn.net/qq_27283619/article/details/88704479
        # https://www.cnblogs.com/wt7018/p/11749778.html
        # @ selects an attribute
        # scrapy seems hard to step-debug inside the framework, so print statements are used instead
        # help(response.url)
        print(response.url, 77777777777777777777777777777777777777777777777777)
        print(response, '********************** URL currently being crawled')
        div_list = response.xpath('//td[@class="lbxx"]//a/@href')  # xpath selector
        # div_list = response.xpath('//div[@class="module cl xl"]/ul/li')  # xpath selector

        # print(85654645654, div_list)
        div_list = [i.extract() for i in div_list]

        # drop links that point back to the current page.
        div_list = [i for i in div_list if i != response.url]
        div_list = list(set(div_list))
        print(85654645654, div_list)
        # div_list = response.xpath('//div[@class="newslist solid"]')  # 加入正则
        # print(90909090,div_list)

        # print(div_list)
        # print(div_list[0])
        # print(div_list[-1])
        # print((div_list))

        # print(div_list,99999999999999999999999999999999999999)
        for i in div_list:
            # print(self.baseUrl+i.extract())  # got all the links; hand over to the second-level crawl.
            item = en_youth()
            item['link'] = i
            item['link'] = item['link']
            # print(item['link'],"lianjie !!!!!!!!!!!!!!!!!!!!!!")
            #每一次一级爬虫得到的页面,都触发一次二级爬虫.
            yield scrapy.Request(item['link'],
                                 callback=self.parse_detail,
                                 meta={'item': item},
                                 encoding='raw_unicode_escape')

    # https://blog.csdn.net/Light__1024/article/details/88763541  how to crawl second-level pages

    def parse_detail(self, response):  # second-level crawl
        infomation = response.meta['item']['link']
        # print(infomation,988776754456435345435345435)
        print(infomation, "address being crawled at the second level")
        item = response.body
        # print(item,9090909090909090909090909090)
        # print(item,444444444444444444444444444444444444)
        # print(item)
        # print(response.body,"???????????????")
        # print("******** print the second-level crawl result")  # [@class="TRS_Editor"]
        item = en_youth()
        print('entered the second-level crawler')

        # pre-filtering: tried modifying response.body, but it still does not take effect??
        #
        # # response.body="dfadsf"
        #
        # tmp=re.sub(r'<script.*</script>','',str(response.body))
        # print(tmp,6666666666666666666666666666666666666666)
        # response._set_body(tmp.encode(response.encoding))
        # print(response.body,777777777777777777777777777777777777777777777)
        # print(response.body,88888888888888888888888888888888888)
        # HtmlResponse.replace()
        # HtmlResponse.replace('body',remove_tags_with_content(response.body, 'script'))
        # HtmlResponse.replace('body',remove_tags_with_content(response.body, 'script'))

        # tmp2=response.xpath('//td[@class="e14"]//text()').extract()
        # multiple xpath checks need to be designed below, because the page formats are not consistent.
        # the one below is for pages that only have a div and no p tags.

        # to extract all the text inside a tag, however deep, no details are needed: extracting the div directly works!
        # item['neirong']= response.xpath('//div//p').extract()
        # print( item['neirong'],33333333333333333333333333333333333333333333333333333333333)
        item['neirong'] = response.xpath('//tr//p').extract()
        # item['neirong']+= response.xpath('//div[@class="content"]//p').extract()
        # item['neirong']+= response.xpath('//div[@id="article"]//p').extract()
        # item['neirong']+= response.xpath('//td[@class="e14"]').extract()
        # item['neirong']+= response.xpath('//td[@id="article_content"]').extract()
        # print(item['neirong'],22222222222222222222222)

        save = []

        item['neirong'] = [i for i in item['neirong'] if '<script' not in i]
        print('tttt', item['neirong'])
        item['neirong'] = [replace_tags(i, '') for i in item['neirong']]

        print('neirong2222222222222', item['neirong'])

        # item['neirong']+= response.xpath('//div[@id="article"]/div/p/text()').extract()
        # item['neirong']+= response.xpath('//div[@id="article"]/p/text()').extract()

        # filter out scripts below.

        # item['neirong'] = filter(lambda x: '<script>'not in x, item['neirong'])

        # print(item['neirong'], '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        # print(item['neirong'], 8888888888888888888)

        save2 = '\n'.join(item['neirong'])
        print(save2, 9999999999999999999999999999999999999)
        item['neirong'] = save2
        item['title'] = infomation
        yield item
        # next, use an item pipeline for the file I/O (a minimal pipeline sketch follows after this example).
        # enable the pipeline in settings so items get written to a file
        # https://www.cnblogs.com/python2687806834/p/9836935.html
        pass


#
# if __name__=="__main__":
#     DmozSpider()
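
The comments at the end of parse_detail mention writing items out through a pipeline enabled in settings. A minimal sketch of such a pipeline (the class name, module path and output layout are assumptions, not taken from this project):

# Sketch of an item pipeline that writes each crawled page to a text file.
# Enable it in settings.py with, e.g.:
# ITEM_PIPELINES = {"myproject.pipelines.SaveTextPipeline": 300}
import os

class SaveTextPipeline:
    def open_spider(self, spider):
        os.makedirs("output", exist_ok=True)
        self.counter = 0

    def process_item(self, item, spider):
        self.counter += 1
        path = os.path.join("output", "%05d.txt" % self.counter)
        with open(path, "w", encoding="utf-8") as f:
            f.write(str(item.get("title", "")) + "\n")
            f.write(str(item.get("neirong", "")))
        return item
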