Example #1
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        ua = UserAgent()
        request.headers.setdefault(b'User-Agent', ua.random)
        return None
Example #2
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
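
To activate a middleware like the ones above, Scrapy needs it registered in the project settings, and the __init__(self, crawler) signature implies the standard from_crawler hook. A minimal sketch (module path and priority value are illustrative):

# settings.py (module path and priority are illustrative)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
    # disable the built-in middleware so it does not overwrite
    # the randomized header
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
RANDOM_UA_TYPE = 'random'

# inside the middleware class: Scrapy calls this to build the instance
@classmethod
def from_crawler(cls, crawler):
    return cls(crawler)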
Example #3
def get_google_search_response(person_name, exact_match, proxies,
                               country_code):

    # set params
    params = {'as_epq' if exact_match else 'q': person_name.encode('utf8')}

    # make sure to set the google search country code: when using
    # proxies, the results depend on the random country the proxy
    # is located in, so they would differ with every proxy call

    # also keep in mind that including this parameter (and likely
    # others as well) increases the probability of triggering bot
    # detection, so using it without a proxy quickly gets the ip
    # address banned by google, which then asks for a recaptcha

    # set the lr and cr params; maybe both of them together actually
    # simulate a search from the specified country
    # source: https://github.com/MarioVilas/googlesearch/blob/master/googlesearch/__init__.py

    # as of 2021 the request triggers some kind of google protection
    # and the full page is not rendered in the html. Depending on the
    # exact params, the returned html either says "Jei per kelias
    # sekundes nebūsite nukreipti, <...>" ("If you are not redirected
    # within a few seconds, <...>") or has a different structure that
    # does not include div slim_appbar. div slim_appbar contains the
    # total number of results, which is what we want to parse.
    #
    # For future reference, the following does not help:
    # - following the url provided together with "if you are not
    #   redirected within a few"
    # - fixing the proxy location (in private.py), e.g. US only
    # - not setting the cr/lr params
    if proxies:
        pass
        # params['cr'] = 'us'
        # params['lr'] = 'lang_' + 'us'
    if country_code:
        pass
        # params['cr'] = country_code
        # params['lr'] = 'lang_' + country_code

    # set headers - this is important!
    # if headers are not set, google does not return the number of
    # search results and none of the divs responsible for storing it
    # are there; the structure of the html is totally different.
    # IMPORTANT: the above holds both for requests sent directly
    # IMPORTANT: and for requests sent through a proxy.

    # UserAgent() pulls its data from a heroku app that sometimes
    # fails. We could save a list of browsers for the headers locally,
    # so that we don't have to call the heroku app again and again
    # every time. However, randomising from locally stored browsers
    # does increase the rate of captchas and invalid responses, so
    # for now let's fall back to UserAgent and investigate the
    # reasons later on.
    headers = {'User-Agent': UserAgent().random}
    # headers = {'User-Agent': random.choice(browsers)}

    # make the request
    url = 'https://www.google.com/search'
    response = requests_retry_session().get(url,
                                            params=params,
                                            headers=headers,
                                            proxies=proxies)

    # if there is a recaptcha in the response, the client that sent
    # the request has been blacklisted, so return False
    if 'https://www.google.com/recaptcha/api.js' in response.text:
        logger.info('Received Captcha request (google search)')
        return False

    logger.info('Received a valid response (google search)')
    return response
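
requests_retry_session() is used above but not defined in this snippet; a sketch of that helper, assuming the usual requests/urllib3 retry recipe (retry counts and status codes are illustrative):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def requests_retry_session(retries=3, backoff_factor=0.3,
                           status_forcelist=(500, 502, 504)):
    # a Session that retries failed requests with exponential
    # backoff instead of giving up on the first error
    session = requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session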
Example #4
import re
from bs4 import BeautifulSoup
import configparser
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from threading import Thread  # import the Thread class
from threading import Lock
import threading
import traceback

# import math
# from faker import Factory
# from urllib import parse
# import json
# import jieba.analyse
# from urllib.parse import urlencode
ua = UserAgent(use_cache_server=False, path="fake_useragent_0.1.11.json")
# count_mutex = 0  # ip usage mutex

file = 'config.ini'
# create the config parser object
config_parse = configparser.ConfigParser()

# read the config file
config_parse.read(file, encoding='utf-8')
use_count = config_parse.getint("ip", "count")  # ip usage count
use_fail = config_parse.getint("ip", "fail")  # ip failure count
read_urid = config_parse.getint("wjx", "id")  # wjx (问卷星) questionnaire id
'''
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'no-cache',
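
For reference, a config.ini shape that satisfies the getint() reads above (all values illustrative):

[ip]
count = 0
fail = 0

[wjx]
id = 1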
Example #5
    def process_request(self, request, spider):
        request.headers[
            'Referer'] = 'https://www.qimai.cn/app/comment/appid/1084660392/country/cn'
        request.headers['User-Agent'] = UserAgent().random
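
Note that this version builds a fresh UserAgent() on every request, which re-loads the browser data each time; a sketch that constructs it once (class name is illustrative):

from fake_useragent import UserAgent

class QimaiHeadersMiddleware(object):
    def __init__(self):
        # construct once; UserAgent() loads its data on creation
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['Referer'] = 'https://www.qimai.cn/app/comment/appid/1084660392/country/cn'
        request.headers['User-Agent'] = self.ua.random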
Example #6
import threading
import time
import random
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
import pandas as pd
import configparser
from fake_useragent import UserAgent

# load the configuration
cfg = configparser.ConfigParser()
cfg.read("config.ini")
ua = UserAgent()

def current_time():
    '''
    Return the current time as a formatted string.
    :return:
    '''
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def toCSV(data, flags):
    '''
    Write the scraped data to a CSV file.
    :param data: the data
    :param flags: flag: 0 for tuples, 1 for dicts
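
The example is cut off at this point; under the docstring's convention (flags 0 for tuple rows, 1 for dict rows), a minimal sketch of such a helper with pandas (function name and output filename assumed):

def toCSV_sketch(data, flags):
    # both tuple rows and dict rows can be fed to a DataFrame;
    # dict rows additionally supply the column names
    df = pd.DataFrame(list(data))
    df.to_csv('result.csv', mode='a', index=False,
              header=(flags == 1), encoding='utf-8-sig')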