コード例 #1
0
def get_page(url: str) -> "Response | None":
    """Fetch ``url`` with the User-Agent supplied by ``requests_html``.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The response when ``raise_for_status()`` succeeds; ``None`` when it
        raises ``HTTPError`` (the error is printed first).
    """
    headers: dict = {"User-Agent": requests_html.user_agent()}

    with requests_html.HTMLSession() as s:
        resp: Response = s.get(url, headers=headers)

        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError as e:
            # Report the failure and signal it to the caller with None
            # instead of propagating the exception.
            print(e)
            return None

        return resp
コード例 #2
0
from requests_html import HTMLSession, user_agent
import json
import csv

import concurrent.futures

# Request header carrying the User-Agent string returned by requests_html.
chrome_header = {"User-Agent": user_agent()}
# Query parameters kept in one place; the commented-out example below merges
# them into the per-request params.
DEFAULT_PARAMS = {
    "lang": "en-US",
    "corsDomain": "finance.yahoo.com",
    ".tsrc": "finance",
}
# Endpoint that the commented-out example below requests.
url = "https://query1.finance.yahoo.com/v7/finance/quote"
# Module-level session, created at import time.
session = HTMLSession()

# Example of a single-symbol request, kept for reference:
# symbols = ["NFLX"]


# params = {"symbols": symbols[0]}
# params.update(DEFAULT_PARAMS)

# # session
# session = HTMLSession()
# response = session.get(url, headers=chrome_header, params=params)

# resp["quoteResponse"]["result"][0]["marketCap"] is the market capitalization
# resp["quoteResponse"]["result"][0]["symbol"] is the symbol (ticker)
# NOTE(review): presumably the CSV files read from / written to by code that
# is not visible in this excerpt — confirm.
input_file = "all_tickers.csv"
output_file = "sym_marketcap_sim.csv"
sym_list = []
コード例 #3
0
#! /usr/bin/env python3
from cache import Cache
import config
import datetime
import logging
import re
import time
from random import random
from requests_html import HTMLSession, user_agent
from db import mysql_db, TopicList, Topic

# Module-level HTTP session, created once at import time.
session = HTMLSession()
# NOTE(review): the return value of this call is discarded, so nothing visible
# here applies the "google chrome" User-Agent to `session` — confirm intent.
user_agent("google chrome")


# Matches any single code point above U+FFFF (outside the Basic Multilingual
# Plane). Compiled once at import instead of on every call.
_HIGHPOINTS = re.compile("[\U00010000-\U0010ffff]")


def filter_emoji(author: str) -> str:
    """Replace every character above U+FFFF in ``author`` with U+25FD.

    Python 3 strings always hold full code points, so the surrogate-pair
    fallback needed on narrow (UCS-2) Python 2 builds is not required.

    Args:
        author: Text to sanitize.

    Returns:
        ``author`` with each matching character replaced by ``"\\u25FD"``.
    """
    return _HIGHPOINTS.sub("\u25FD", author)


class DoubanSpider(object):
    """Spider state built from the ``config`` module plus a ``Cache``.

    NOTE(review): only ``__init__`` is visible in this excerpt.
    """

    def __init__(self):
        """Copy the group list, suffix and rules from ``config``; create a cache."""
        # Double-underscore attributes are name-mangled (_DoubanSpider__*),
        # which keeps them private to this class.
        self.__group_list = config.GROUP_LISTS
        self.__suffix = config.GROUP_SUFFIX
        self.__rules = config.RULES
        # Public attribute holding a freshly constructed Cache.
        self.cache = Cache()
コード例 #4
0
import re
import time

import requests_html
import xlsxwriter as xw
from requests.adapters import HTTPAdapter

# Request headers: the User-Agent returned by requests_html plus Accept-*
# values; Accept-Language lists zh-CN first, then zh-HK and English variants
# with decreasing q-weights.
headers = {
    'User-Agent': requests_html.user_agent(),
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-HK;q=0.8,en-GB;q=0.6,en-US;q=0.4',
}


def create_execl(play_id, workbook, session):
    for p_key, pid in enumerate(play_id, start=1):
        # 新建工作薄
        worksheet = workbook.add_worksheet()
        worksheet.set_column('A:A', 15)
        worksheet.set_column('H:H', 15)
        worksheet.set_column('I:I', 15)
        worksheet.set_column('J:J', 15)
        # 框架url
        url = 'http://nba.win0168.com/cn/Tech/TechTxtLive.aspx?matchid=%s' % pid
        r = session.get(url, timeout=6)
        # ------------总比分 ----------------#
        for tr_key, tr_val in enumerate(r.html.find('table.t_bf > tr'),
                                        start=15):
            for td_key, td_val in enumerate(tr_val.find('td'), start=0):
                # 写入数据
コード例 #5
0
from requests_html import HTMLSession, user_agent
import requests
from pathlib import Path
import shutil

# Article page whose images are downloaded.
url = 'https://smtmm.win/article/52735/'

# Destination directory. parents=True creates missing parent directories and
# exist_ok=True tolerates an existing folder, so no separate exists() check
# is needed.
folder = Path(r'~/Desktop/smtmm/').expanduser()
folder.mkdir(parents=True, exist_ok=True)

session = HTMLSession()
r = session.get(url, headers={"User-Agent": user_agent()})
# Image paths are read from the data-original attribute of each <img> inside
# the article body.
image_urls = r.html.xpath(
    '//article[@class="article-content"]//img/@data-original')
for image_url in image_urls:
    # The text after the last '/' becomes the local file name.
    filename = image_url[image_url.rfind('/') + 1:]
    # The context manager releases the streamed connection even on errors.
    with requests.get('https://smtmm.win' + image_url, stream=True) as image:
        if not image.ok:
            # Do not save an HTTP error body under an image file name.
            print(f'{filename}下载失败: HTTP {image.status_code}')
            continue
        # Have urllib3 undo gzip/deflate transfer encoding; otherwise the raw
        # stream would be written to disk still compressed.
        image.raw.decode_content = True
        with open(folder / filename, 'wb') as f:
            shutil.copyfileobj(image.raw, f)
    print(f'{filename}下载完成')
コード例 #6
0
ファイル: cdn.py プロジェクト: laolyu/anyone
# -*- coding:utf-8 -*-

import requests_html
import time
import urllib3
from requests.adapters import HTTPAdapter
from requests_html import HTMLSession

# Silence urllib3's InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Module-level session, created at import time.
session = HTMLSession()
# Mount adapters configured with max_retries=3 for both URL schemes.
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))
# NOTE(review): this is a plain attribute assignment; whether the session
# honours a `keep_alive` attribute is not visible here — confirm it has any
# effect.
session.keep_alive = False
# NOTE(review): the return value is discarded, so this call does not visibly
# change the session's headers — confirm intent.
requests_html.user_agent()


def req_api():
    timestamp = time.time()
    url = 'http://screensavers-1252899349.file.myqcloud.com/cdn_bandwith_config.json?v={0}'.format(
        int(timestamp))
    # proxies = {'http': None, 'https': None}
    proxies = {
        'http': 'http://localhost:8888',
        'https': 'http://localhost:8888'
    }
    try:
        r = session.get(url=url, proxies=proxies, verify=False)
        print(r.text)
        cur = r.json()['pb']['cur']
        if cur > 1700:
            print('当前CDN-pb超载:%s' % cur)
コード例 #7
0
def get_r(url):
    """GET ``url`` through the shared session with a randomly chosen User-Agent.

    Args:
        url: Address to request.

    Returns:
        The response object returned by ``session.get``.
    """
    # Rotate the User-Agent per request by picking one from USER_AGENT_LIST.
    header = {"user-agent": random.choice(USER_AGENT_LIST)}
    return session.get(url=url, headers=header)
コード例 #8
0
# Captcha-solving demo using plain pytesseract OCR

while True:
    captchaid = '03eabd0b7dad46d28d197f3ca{}b9c1'.format(
        str(random.randint(111, 999)))
    time.sleep(1)
    while True:
        time.sleep(1)
        url = 'http://zxgk.court.gov.cn/zhzxgk/captcha.do?captchaId={}&random=0.0356847153767{}'.format(
            captchaid, str(random.randint(11111, 99999)))
        # 启动
        session = HTMLSession()
        r = session.get(url,
                        headers={
                            'User-Agent': user_agent(),
                            "Host": 'zxgk.court.gov.cn'
                        })
        with open('./yzm.png', 'wb') as f:
            f.write(r.content)
        img = Image.open(r'./yzm.png')
        yzm = pytesseract.image_to_string(img).strip()
        yzm = "".join(yzm.split())
        print(yzm)
        if len(yzm) == 4:
            break
    checkurl = 'http://zxgk.court.gov.cn/zhzxgk/checkyzm?captchaId={}&pCode={}'.format(
        captchaid, str(yzm))
    r = session.get(checkurl,
                    headers={
                        'User-Agent': user_agent(),
コード例 #9
0
from requests_html import HTMLSession, user_agent
import requests
import shutil
from pathlib import Path

# User-Agent string computed once at import time.
ua = user_agent()
base_url = 'https://www.mzitu.com/'
# Root download directory. parents=True creates missing parent directories and
# exist_ok=True tolerates an existing folder, so no separate exists() check
# is needed.
folder = Path(r'~/Desktop/图片').expanduser()
folder.mkdir(parents=True, exist_ok=True)


def save_image(url, filename, headers):
    """Download ``url`` and write the response body to ``filename``.

    A non-success HTTP status is reported and nothing is written, so an error
    page is never saved under an image file name.

    Args:
        url: Address of the image.
        filename: Destination path, opened in binary write mode.
        headers: HTTP headers sent with the request.
    """
    # The context manager releases the streamed connection even on errors.
    with requests.get(url, stream=True, headers=headers) as img:
        if not img.ok:
            print(f'{filename}下载失败: HTTP {img.status_code}')
            return
        # Have urllib3 undo gzip/deflate transfer encoding; otherwise the raw
        # stream would be written to disk still compressed.
        img.raw.decode_content = True
        with open(filename, 'wb') as f:
            shutil.copyfileobj(img.raw, f)
    print(f'{filename}下载完成')


def meizitu(url):
    session = HTMLSession()
    r = session.get(url, headers={'User-Agent': ua, 'Referer': base_url})
    referer = url
    total_page = int(r.html.xpath(
        '//div[@class="pagenavi"]/a/span/text()')[-2])
    title = r.html.xpath('//h2[@class="main-title"][1]/text()', first=True)
    save_folder = folder/title
    if not save_folder.exists():
        save_folder.mkdir()
    for i in range(1, total_page+1):
        r = session.get(url+f'/{i}',