def process_request(self, request, spider):
    ua = UserAgent()
    request.headers.setdefault(b'User-Agent', ua.random)
    # Called for each request that goes through the downloader
    # middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    return None
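# A minimal sketch of how such a middleware is typically enabled in a Scrapy
# project's settings.py; the module path 'myproject.middlewares' and the
# priority 543 are assumptions, not taken from the original project.
DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's built-in User-Agent middleware so it does not
    # compete with the randomised header.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # Register the custom middleware under the assumed module path.
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
}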
def __init__(self, crawler):
    super(RandomUserAgentMiddleware, self).__init__()
    self.ua = UserAgent()
    self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
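# A sketch of the companion methods this __init__ implies, assuming the usual
# Scrapy middleware pattern; the original project's exact bodies are unknown.
@classmethod
def from_crawler(cls, crawler):
    # Scrapy builds the middleware through this hook, handing over the
    # crawler so __init__ can read the RANDOM_UA_TYPE setting.
    return cls(crawler)

def process_request(self, request, spider):
    # Look up the attribute named by RANDOM_UA_TYPE (e.g. ua.random,
    # ua.chrome, ua.firefox) and use it as the User-Agent header.
    request.headers.setdefault('User-Agent', getattr(self.ua, self.ua_type))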
def get_google_search_response(person_name, exact_match, proxies, country_code):
    # set params
    params = {'as_epq' if exact_match else 'q': person_name.encode('utf8')}

    # Make sure to set the Google search country code: when using proxies the
    # results depend on the random country the proxy is located in, so they
    # differ with every random proxy call. Keep in mind that including this
    # parameter (as well as others, likely) increases the probability of
    # triggering bot detection, so using it without a proxy will quickly get
    # the IP address banned by Google and challenged with a reCAPTCHA.
    # Set the lr and cr params; maybe both of them together are needed to
    # actually simulate a search from the specified country.
    # source: https://github.com/MarioVilas/googlesearch/blob/master/googlesearch/__init__.py
    #
    # As of 2021 the request triggers some kind of Google protection that
    # stops the full page from being rendered in HTML. Depending on the exact
    # params set, the returned HTML either says "Jei per kelias sekundes
    # nebūsite nukreipti, <...>" or has a different structure that does not
    # include div slim_appbar. div slim_appbar contains the total number of
    # results, which is what we want to parse.
    #
    # For future reference, the following does not help:
    # - following the URL provided together with "if you are not redirected
    #   within a few";
    # - fixing the proxy location (in private.py), e.g. US only;
    # - not setting the cr/lr params.
    if proxies:
        pass
        # params['cr'] = 'us'
        # params['lr'] = 'lang_' + 'us'
    if country_code:
        pass
        # params['cr'] = country_code
        # params['lr'] = 'lang_' + country_code

    # Set headers - this is important!!!
    # If headers are not set, Google does not return the number of search
    # results and none of the divs responsible for storing it are present.
    # Basically, the structure of the HTML is totally different.
    # IMPORTANT: the above holds both for requests sent directly
    # IMPORTANT: and for requests sent through a proxy.
    #
    # UserAgent() relies on a Heroku app that sometimes fails. We could save
    # a list of browsers for the headers locally, so that we don't have to
    # call the Heroku app again and again every time. However, randomising
    # from locally stored browsers does increase the rate of captchas and
    # invalid responses, so for now let's fall back to UserAgent and
    # investigate the reasons later on.
    headers = {'User-Agent': UserAgent().random}
    # headers = {'User-Agent': random.choice(browsers)}

    # make the request
    url = 'https://www.google.com/search'
    response = requests_retry_session().get(url, params=params,
                                            headers=headers, proxies=proxies)

    # If a reCAPTCHA is in the response, the client that sent the request has
    # been blacklisted, so return False.
    if 'https://www.google.com/recaptcha/api.js' in response.text:
        logger.info('Received Captcha request (google search)')
        return False
    logger.info('Received a valid response (google search)')
    return response
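# A sketch of the requests_retry_session() helper referenced above, following
# the widely used urllib3 Retry recipe; the exact retry policy of the
# original project is an assumption.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def requests_retry_session(retries=3, backoff_factor=0.5,
                           status_forcelist=(500, 502, 503, 504),
                           session=None):
    # Build (or reuse) a Session whose adapter retries transient failures
    # with exponential backoff before giving up.
    session = session or requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session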
import re
from bs4 import BeautifulSoup
import configparser
from requests.adapters import HTTPAdapter
from threading import Thread  # threading imports
from threading import Lock
import threading
import traceback
from fake_useragent import UserAgent
# import math
# from faker import Factory
# from urllib import parse
# import json
# import jieba.analyse
# from urllib.parse import urlencode

# Skip the unreliable cache server and load the browser list from a
# locally stored fake_useragent data file instead.
ua = UserAgent(use_cache_server=False, path="fake_useragent_0.1.11.json")
# count_mutex = 0  # mutex for the usage count

file = 'config.ini'
# create the config-file parser
config_parse = configparser.ConfigParser()
# read the file
config_parse.read(file, encoding='utf-8')
use_count = config_parse.getint("ip", "count")  # IP usage count
use_fail = config_parse.getint("ip", "fail")  # number of failed IPs
read_urid = config_parse.getint("wjx", "id")  # wenjuanxing (问卷星) survey id

'''
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'no-cache',
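# A sketch of how the counters read above might be written back to
# config.ini, guarded by the Lock imported earlier; the original update
# logic is not shown, so this helper and its name are assumptions.
count_lock = Lock()

def save_counts():
    with count_lock:
        # configparser stores values as strings, so convert before writing.
        config_parse.set("ip", "count", str(use_count))
        config_parse.set("ip", "fail", str(use_fail))
        config_parse.set("wjx", "id", str(read_urid))
        with open(file, 'w', encoding='utf-8') as f:
            config_parse.write(f)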
def process_request(self, request, spider):
    request.headers['Referer'] = 'https://www.qimai.cn/app/comment/appid/1084660392/country/cn'
    request.headers['User-Agent'] = UserAgent().random
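# Creating UserAgent() inside process_request re-loads the browser data on
# every request. A common variant (an assumption, not from the original
# spider) caches a single instance on the middleware:
class CachedUAMiddleware:
    def __init__(self):
        # built once when Scrapy instantiates the middleware
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['Referer'] = 'https://www.qimai.cn/app/comment/appid/1084660392/country/cn'
        request.headers['User-Agent'] = self.ua.random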
import threading
import time
import random
import smtplib
import configparser
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
import pandas as pd
from fake_useragent import UserAgent

# read the configuration
cfg = configparser.ConfigParser()
cfg.read("config.ini")
ua = UserAgent()


def current_time():
    '''
    Return the current time.
    :return:
    '''
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def toCSV(data, flags):
    '''
    Write the scraped data to a CSV file.
    :param data: the data
    :param flags: flag; 0: tuple, 1: dict
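# A sketch of a CSV writer consistent with the toCSV docstring above; the
# function name, column handling and output file name are assumptions.
import os

def toCSV_sketch(rows, flags):
    if flags == 0:
        # rows is a list of tuples -> columns are positional
        df = pd.DataFrame(rows)
    else:
        # rows is a list of dicts -> keys become the column names
        df = pd.DataFrame.from_records(rows)
    # Append rows without the index; write the header only for a new file.
    df.to_csv("result.csv", mode="a", index=False,
              header=not os.path.exists("result.csv"),
              encoding="utf-8-sig")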