コード例 #1
0
def get(url):
    """Fetch *url* with a randomized User-Agent and feed the HTML to parse().

    Raises a plain Exception when the server answers anything but 200.
    """
    response = requests.get(
        url,
        headers={'User-Agent': header.get_ua()},
    )
    if response.status_code != 200:
        raise Exception('请求失败!')
    parse(response.text)
コード例 #2
0
def get(url):
    """Fetch *url* and hand the HTML to parse(); raise on any non-200 status.

    NOTE(review): verify=False disables TLS certificate checking — only
    acceptable for this scraping target, not for anything security-sensitive.
    """
    response = requests.get(
        url,
        verify=False,
        headers={'User-Agent': header.get_ua()},
    )
    if response.status_code != 200:
        raise Exception('请求失败')
    parse(response.text)
コード例 #3
0
import json
import time
import re
from urllib.parse import quote

import requests
from selenium.webdriver import Chrome
from selenium.webdriver.support import ui, expected_conditions
from selenium.webdriver.common.by import By

from utils.header import get_ua

headers = {'User-Agent': get_ua()}


def start(cityName):
    """Drive a Selenium Chrome session through a Baidu Zhaopin job search.

    Opens the city-scoped search page, types 'Python' into the query box,
    clicks search, then attempts to fill in the login form if a login
    overlay appears.

    NOTE(review): this snippet is truncated — the ``try`` block below has
    no matching ``except``/``finally`` in the visible source.
    """
    # quote() percent-encodes the (likely Chinese) city name for the URL.
    url = f'https://zhaopin.baidu.com/?city={quote(cityName)}'
    chrome.get(url)

    query = chrome.find_element_by_css_selector('input[name="query"]')
    query.send_keys('Python')
    # Scroll sideways so the search button is in view before clicking.
    chrome.execute_script('var q=document.documentElement.scrollLeft=1000')
    chrome.find_element_by_css_selector('.search-btn').click()
    time.sleep(2)
    # Login verification may be required at this point.
    try:
        chrome.find_element_by_class_name('tang-pass-footerBarULogin').click()
        time.sleep(0.5)
        input_uesrname = chrome.find_element(
            By.XPATH, '//input[@id="TANGRAM__PSP_3__userName"]')
        input_uesrname.send_keys('18795681793')
コード例 #4
0
ファイル: meinv.py プロジェクト: wangxinglong74520/filename
"""
爬取美女网
-requests
-bs4
-csv储存
- 扩展 协程 asyncio
"""
import json

from bs4 import BeautifulSoup, Tag
from utils.header import get_ua
import requests
import time

headers = {
    'User-Agent': get_ua()
}


def get(url):
    """Download *url* and forward its HTML to parse() on a 200 response."""
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return
    # Force UTF-8 so the (Chinese) page text decodes correctly.
    response.encoding = 'utf-8'
    parse(response.text)


def parse(html):
    """Extract listing data from the page HTML via BeautifulSoup.

    NOTE(review): this snippet is truncated — the loop body that fills
    ``item`` is missing from the visible source.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Each '.content-box' element is one listing on the page.
    content_boxs = soup.select('.content-box')
    item = {}
    for content_box in content_boxs:
コード例 #5
0
def download_code():
    """Fetch the CAPTCHA image through the shared session and save it.

    The image is written as binary to ./code.png, overwriting any
    previous capture.
    """
    response = session.get(
        'https://so.gushiwen.org/RandCode.ashx',
        headers={'User-Agent': get_ua()},
    )
    # Response body is raw PNG bytes — write in binary mode.
    with open('code.png', 'wb') as f:
        f.write(response.content)
コード例 #6
0
def get(url):
    """GET *url* with a spoofed user-agent; parse on 200, otherwise raise."""
    response: Response = requests.get(url, headers={'user-agent': header.get_ua()})
    if response.status_code != 200:
        raise Exception('失败')
    parse(response.text)
コード例 #7
0
ファイル: requests_re.py プロジェクト: shenchen1991/spider
import os
import re

import requests

from utils.header import get_ua

base_url = 'http://sc.chinaz.com/tupian/'
url = 'http://sc.chinaz.com/tupian/'

# Reuse the cached page when present so repeated runs skip the network.
if os.path.exists('mn.html'):
    with open('mn.html', encoding='utf-8') as f:
        html = f.read()
else:
    # BUG FIX: the header key was 'User_Agent' (underscore), which is not
    # the User-Agent header at all, so the spoofed UA was never sent.
    resp = requests.get(url, headers={'User-Agent': get_ua()})
    resp.encoding = 'utf-8'
    assert resp.status_code == 200
    html = resp.text
    with open('mn.html', 'w', encoding=resp.encoding) as f:
        f.write(html)

# BUG FIX: the original pattern lacked the closing quote after the src2
# group (src2="(.*?) alt="...), so every captured URL carried a trailing
# '"' character.
# NOTE(review): 'compile' shadows the builtin; name kept so any code that
# references these module-level names keeps working.
compile = re.compile(r'<img src2="(.*?)" alt="(.*?)">')
images = compile.findall(html)
# Grab the href of the "next page" link that follows the page-2221 anchor.
next_url = re.findall(r'<b>2221</b></a><a href="(.*?)" class="nextpage', html,
                      re.S)
print(base_url + next_url[0])
コード例 #8
0
import re
import os

import requests
from requests import Response

from utils.header import get_ua

base_url = 'http://sc.chinaz.com/tupian/'

url = 'http://sc.chinaz.com/tupian/shuaigetupian.html'
# Reuse the cached copy when it exists; otherwise download and cache it.
if os.path.exists('mn.html'):
    with open('mn.html', 'r', encoding='utf-8') as f:
        html = f.read()
else:
    resp: Response = requests.get(url, headers={'User-Agent': get_ua()})
    print(resp.encoding)     # requests defaults to ISO-8859-1 when the
                             # Content-Type header names no charset
    resp.encoding = 'utf-8'  # override so resp.text decodes the body as UTF-8
    assert resp.status_code == 200
    html = resp.text
    # BUG FIX: write the cache with an explicit UTF-8 encoding — it is read
    # back with encoding='utf-8' above, so writing with the platform default
    # encoding corrupts the cache (or raises) on non-UTF-8 locales.
    with open('mn.html', 'w', encoding='utf-8') as f:
        f.write(html)

# [\u4e00-\u9fa5] matches CJK ideographs (kept from the original notes).
# NOTE(review): 'compile' shadows the builtin; names kept so any code that
# references these module-level names keeps working.
compile = re.compile(r'<img src2="(.*?)" alt="(.*?)">')
compile2 = re.compile(r'<img alt="(.*?)" src="(.*?)" >')
imgs = compile.findall(html)  # list of (src, alt) tuples
if not imgs:
    # Fall back to the alternate attribute ordering used on some pages.
    imgs = compile2.findall(html)

print(len(imgs), imgs, sep="\n")
コード例 #9
0
 def process_request(self, request, spider):
     """Scrapy downloader-middleware hook: spoof a random User-Agent.

     BUG FIX: the original assigned the whole ``headers`` dict
     ({'User-Agent': ...}) as the header's value; the header must be set
     to the UA string itself.
     Returning None lets the request continue through the middleware chain.
     """
     request.headers['User-Agent'] = get_ua()
     return None