from typing import Optional

import requests
import requests_html
from requests import Response


def get_page(url: str) -> Optional[Response]:
    # The headers value is a dict, not a str; the return type is Optional
    # because the function returns None on an HTTP error.
    headers: dict = {"User-Agent": requests_html.user_agent()}
    with requests_html.HTMLSession() as s:
        resp: Response = s.get(url, headers=headers)
        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(e)
            return None
        return resp
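# Illustrative usage of get_page (not from the original file; the URL is a
# placeholder). Because get_page returns None on an HTTPError, callers
# should check the result before using it.
if __name__ == '__main__':
    page = get_page('https://httpbin.org/html')
    if page is not None:
        print(page.status_code, len(page.content))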
from requests_html import HTMLSession, user_agent
import json
import csv
import concurrent.futures

chrome_header = {"User-Agent": user_agent()}
DEFAULT_PARAMS = {
    "lang": "en-US",
    "corsDomain": "finance.yahoo.com",
    ".tsrc": "finance",
}
url = "https://query1.finance.yahoo.com/v7/finance/quote"
session = HTMLSession()

# tickers or symbols
# symbols = ["NFLX"]
# params = {"symbols": symbols[0]}
# params.update(DEFAULT_PARAMS)
#
# session
# session = HTMLSession()
# response = session.get(url, headers=chrome_header, params=params)
# resp["quoteResponse"]["result"][0]["marketCap"] is the market capitalization
# resp["quoteResponse"]["result"][0]["symbol"] is the symbol or ticker

input_file = "all_tickers.csv"
output_file = "sym_marketcap_sim.csv"
sym_list = []
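# A hedged sketch of how the pieces above might fit together: read tickers
# from input_file, batch them through the quote endpoint, and write
# (symbol, marketCap) rows to output_file. The fetch_quotes helper, the
# batch size, and the one-ticker-per-row CSV layout are assumptions, not
# part of the original snippet.
def fetch_quotes(symbols):
    params = {"symbols": ",".join(symbols)}
    params.update(DEFAULT_PARAMS)
    data = session.get(url, headers=chrome_header, params=params).json()
    return data["quoteResponse"]["result"]


def dump_marketcaps():
    with open(input_file, newline="") as f:
        tickers = [row[0] for row in csv.reader(f) if row]
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f)
        # Query in batches of 100 symbols per request (assumed batch size)
        for i in range(0, len(tickers), 100):
            for quote in fetch_quotes(tickers[i:i + 100]):
                writer.writerow([quote["symbol"], quote.get("marketCap")])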
#! /usr/bin/env python3
from cache import Cache
import config
import datetime
import logging
import re
import time
from random import random

from requests_html import HTMLSession, user_agent

from db import mysql_db, TopicList, Topic

session = HTMLSession()
user_agent("google chrome")


def filter_emoji(author):
    try:
        # UCS-4
        highpoints = re.compile(u"[\U00010000-\U0010ffff]")
    except re.error:
        # UCS-2
        highpoints = re.compile(u"[\uD800-\uDBFF][\uDC00-\uDFFF]")
    return highpoints.sub(u"\u25FD", author)


class DoubanSpider(object):
    def __init__(self):
        self.__group_list = config.GROUP_LISTS
        self.__suffix = config.GROUP_SUFFIX
        self.__rules = config.RULES
        self.cache = Cache()
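# Quick sanity check for filter_emoji (illustrative, not from the original
# file): astral-plane characters such as emoji are replaced with U+25FD
# (WHITE MEDIUM SMALL SQUARE), presumably so author names store cleanly in
# a non-utf8mb4 MySQL column.
assert filter_emoji(u"abc\U0001F600def") == u"abc\u25FDdef"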
import re
import time

import requests_html
import xlsxwriter as xw
from requests.adapters import HTTPAdapter

headers = {
    'User-Agent': requests_html.user_agent(),
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-HK;q=0.8,en-GB;q=0.6,en-US;q=0.4',
}


def create_execl(play_id, workbook, session):
    for p_key, pid in enumerate(play_id, start=1):
        # Create a new worksheet for this match
        worksheet = workbook.add_worksheet()
        worksheet.set_column('A:A', 15)
        worksheet.set_column('H:H', 15)
        worksheet.set_column('I:I', 15)
        worksheet.set_column('J:J', 15)
        # Frame URL for the live tech-stats page
        url = 'http://nba.win0168.com/cn/Tech/TechTxtLive.aspx?matchid=%s' % pid
        r = session.get(url, timeout=6)
        # ------------ Total score ----------------#
        for tr_key, tr_val in enumerate(r.html.find('table.t_bf > tr'), start=15):
            for td_key, td_val in enumerate(tr_val.find('td'), start=0):
                # Write each cell's text into the sheet
                worksheet.write(tr_key, td_key, td_val.text)
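# Assumed driver for create_execl (not in the original snippet): the match
# id '123456' is a placeholder and the workbook filename is arbitrary; the
# retrying session mirrors the HTTPAdapter import above.
session = requests_html.HTMLSession()
session.mount('http://', HTTPAdapter(max_retries=3))
workbook = xw.Workbook('nba_live.xlsx')
create_execl(['123456'], workbook, session)
workbook.close()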
from requests_html import HTMLSession, user_agent
import requests
from pathlib import Path
import shutil

url = 'https://smtmm.win/article/52735/'
folder = Path(r'~/Desktop/smtmm/').expanduser()
if not folder.exists():
    folder.mkdir()

session = HTMLSession()
r = session.get(url, headers={"User-Agent": user_agent()})
image_urls = r.html.xpath(
    '//article[@class="article-content"]//img/@data-original')
for image_url in image_urls:
    # Stream each image and copy the raw response straight to disk
    image = requests.get('https://smtmm.win' + image_url, stream=True)
    filename = image_url[image_url.rfind('/') + 1:]
    with open(folder / filename, 'wb') as f:
        shutil.copyfileobj(image.raw, f)
    print(f'{filename} downloaded')
# -*- coding:utf-8 -*-
import requests_html
import time
import urllib3
from requests.adapters import HTTPAdapter
from requests_html import HTMLSession

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
session = HTMLSession()
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))
session.keep_alive = False
requests_html.user_agent()


def req_api():
    timestamp = time.time()
    url = 'http://screensavers-1252899349.file.myqcloud.com/cdn_bandwith_config.json?v={0}'.format(
        int(timestamp))
    # proxies = {'http': None, 'https': None}
    proxies = {
        'http': 'http://localhost:8888',
        'https': 'http://localhost:8888'
    }
    try:
        r = session.get(url=url, proxies=proxies, verify=False)
        print(r.text)
        cur = r.json()['pb']['cur']
        if cur > 1700:
            print('Current CDN pb is overloaded: %s' % cur)
    except Exception as e:
        # Assumed handler; the original snippet is truncated before its
        # except clause. Keep the poller alive on network errors.
        print(e)
def get_r(url):
    # Pick a random User-Agent for each request; the original assigned this
    # value but never used it, falling back to the library default instead.
    user_agent = random.choice(USER_AGENT_LIST)
    header = {"user-agent": user_agent}
    r = session.get(url=url, headers=header)
    return r
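# get_r relies on names defined elsewhere in the original module. A minimal
# set of assumed definitions that makes it runnable as written:
import random

import requests_html
from requests_html import HTMLSession

session = HTMLSession()
# Any list of UA strings works; seeding it with the library default is
# just one option.
USER_AGENT_LIST = [
    requests_html.user_agent(),
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
]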
import random
import time

import pytesseract
from PIL import Image
from requests_html import HTMLSession, user_agent

# A simple pytesseract demo for cracking the captcha
while True:
    captchaid = '03eabd0b7dad46d28d197f3ca{}b9c1'.format(
        str(random.randint(111, 999)))
    time.sleep(1)
    while True:
        time.sleep(1)
        url = 'http://zxgk.court.gov.cn/zhzxgk/captcha.do?captchaId={}&random=0.0356847153767{}'.format(
            captchaid, str(random.randint(11111, 99999)))
        # Start a session and download the captcha image
        session = HTMLSession()
        r = session.get(url, headers={
            'User-Agent': user_agent(),
            "Host": 'zxgk.court.gov.cn'
        })
        with open('./yzm.png', 'wb') as f:
            f.write(r.content)
        img = Image.open(r'./yzm.png')
        yzm = pytesseract.image_to_string(img).strip()
        yzm = "".join(yzm.split())
        print(yzm)
        # Retry until OCR yields exactly four characters
        if len(yzm) == 4:
            break
    checkurl = 'http://zxgk.court.gov.cn/zhzxgk/checkyzm?captchaId={}&pCode={}'.format(
        captchaid, str(yzm))
    r = session.get(checkurl, headers={
        'User-Agent': user_agent(),
        "Host": 'zxgk.court.gov.cn'
    })
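# Optional accuracy tweak (an assumption, not in the original): telling
# tesseract to treat the image as a single text line often helps with
# short fixed-length captchas.
# yzm = pytesseract.image_to_string(img, config='--psm 7').strip()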
from requests_html import HTMLSession, user_agent
import requests
import shutil
from pathlib import Path

ua = user_agent()
base_url = 'https://www.mzitu.com/'
folder = Path(r'~/Desktop/图片').expanduser()
if not folder.exists():
    folder.mkdir()


def save_image(url, filename, headers):
    img = requests.get(url, stream=True, headers=headers)
    with open(filename, 'wb') as f:
        shutil.copyfileobj(img.raw, f)
    print(f'{filename} downloaded')


def meizitu(url):
    session = HTMLSession()
    r = session.get(url, headers={'User-Agent': ua, 'Referer': base_url})
    referer = url
    # The second-to-last pagenavi entry is the total page count
    total_page = int(r.html.xpath(
        '//div[@class="pagenavi"]/a/span/text()')[-2])
    title = r.html.xpath('//h2[@class="main-title"][1]/text()', first=True)
    save_folder = folder / title
    if not save_folder.exists():
        save_folder.mkdir()
    for i in range(1, total_page + 1):
        r = session.get(url + f'/{i}',
                        headers={'User-Agent': ua, 'Referer': referer})