def parse(url):
    """Fetch *url* and return a BeautifulSoup object, or None on failure.

    A random User-Agent header is set on every request to reduce the
    chance of being blocked by the target site.
    """
    logging.debug("Trying to parse the following source: %s", url)
    request = urllib2.Request(url)
    # Rotate the UA string on each call instead of the old fixed MSIE string.
    request.add_header('User-Agent', ua().random)
    try:
        # Grab a soup object and name it http.
        http = bs(urllib2.urlopen(request), "lxml")
    except Exception:
        # Narrowed from a bare except, which would also swallow
        # KeyboardInterrupt/SystemExit. logging.exception records the
        # traceback; callers still receive None on failure as before.
        logging.exception("Error parsing %s", url)
        return None
    return http
def process_request(self, request, spider):
    """Downloader-middleware hook: stamp each outgoing request with a
    freshly chosen random User-Agent string.

    Returning None tells Scrapy to continue processing this request
    through the remaining downloader middlewares; the other legal
    outcomes (a Response, a Request, or raising IgnoreRequest) are not
    used here.
    """
    request.headers['User-Agent'] = ua().random
    return None
def geturl1(self):
    # Worker loop: for every listing-index URL on ``self.urlQueue``,
    # discover all of its paginated result pages and push every
    # detail-page URL found onto ``self.urlQueue1``.
    print('5')  # NOTE(review): looks like a leftover debug marker
    # One proxy per worker, taken once before the loop; every request
    # made by this worker goes through it.
    ip = self.ipQueue.get()
    while 1:
        try:
            # Non-blocking get: when the queue is empty this raises
            # queue.Empty, which is caught by the bare except below and
            # ends the worker.
            url = self.urlQueue.get(block=False, timeout=10)
            res0 = requests.get(url, headers={"User-Agent": str(ua().random)}, proxies=ip)
            res0.encoding = 'utf-8'
            html = res0.text
            # Detail-page links look like
            # https://gz.lianjia.com/chengjiao/<id>.html
            q = re.compile(r'https://gz.lianjia.com/chengjiao/.*?[.]html')
            # XPath extracting the total result count shown on the page.
            r = '//div[@class="total fl"]/span/text()'
            parseHtml0 = etree.HTML(html)
            datab = parseHtml0.xpath(r)
            pn = int(datab[0])
            if pn == 0:
                # No transactions under this index URL; try the next one.
                continue
            # 30 results per page; +2 because page numbers start at 1 and
            # range() excludes its end value.
            pageend = pn // 30 + 2
            for k in range(1, pageend):
                if k == 1:
                    # Page 1 was already fetched above; reuse that html.
                    pass
                else:
                    urlbase1 = url + 'pg%s/' % k
                    res0 = requests.get(
                        urlbase1, headers={"User-Agent": str(ua().random)}, proxies=ip)
                    res0.encoding = 'utf-8'
                    html = res0.text
                # Deduplicate the detail links before queueing them.
                dataa = q.findall(html)
                dataa = set(dataa)
                dataa = list(dataa)
                for km in dataa:
                    self.urlQueue1.put(km)
        except:
            # Bare except: ANY error (empty queue, network failure, parse
            # error) terminates this worker. NOTE(review): consider
            # narrowing to queue.Empty — a transient network error
            # currently drops all remaining URLs silently.
            break
def process_request(self, request, spider):
    """Attach a random User-Agent header to each outgoing request and
    echo the resulting headers to stdout for debugging."""
    # Pick a fresh UA string for this request.
    request.headers['User-Agent'] = ua().random
    print(request.headers)
    # random.choice(iterable) would be another way to pick one element
    # at random from an iterable.
    return None
# Scrapy settings for img_design_spider project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'img_design_spider' SPIDER_MODULES = ['img_design_spider.spiders'] NEWSPIDER_MODULE = 'img_design_spider.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent USER_AGENT = ua().random # Obey robots.txt rules ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs # DOWNLOAD_DELAY = 0.5 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16
from glob import glob
from hashlib import md5
from fake_useragent import UserAgent as ua
from splinter import Browser
from splinter.exceptions import *
from selenium.common.exceptions import *
from random import randint
from time import sleep
import requests
import time
import os
import logging

# Shared UserAgent factory for the whole module.
user_agent = ua()

# Directory names for cached downloads and final output data.
CACHE_FLDR = 'cache'
DATA_FLDR = 'data'

# Google Geocoding endpoint template; format args are (address, api_key).
GEOCODE_URL = ''.join([
    'https://maps.googleapis.com/maps/api/geocode/json',
    '?address={:s}&key={:s}'
])

# Timestamped INFO-level logging for the whole module.
logging.basicConfig(format='%(asctime)s %(message)s',
                    datefmt='%d/%m/%Y %I:%M:%S %p',
                    level=logging.INFO)


def enable_detailed_logging():
    """Enable debugging at the httplib level (requests->urllib3->http.client).

    You will see the REQUEST, including HEADERS and DATA, and RESPONSE
    with HEADERS but without DATA. The only thing missing will be the
    response.body, which is not logged.

    NOTE(review): the function body continues beyond this chunk of the
    file and is not visible here.
    """
# PEP 8: imports belong at the top of the module. Hoisted from its
# original position between the cookie/telnet settings and the default
# request headers; nothing before it in this section referenced `ua`.
from fake_useragent import UserAgent as ua

DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# NOTE: the User-Agent below is chosen ONCE, when settings are imported —
# not per request. Use a downloader middleware for per-request rotation.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': ua().random
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jc_goods.middlewares.JcGoodsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jc_goods.middlewares.JcGoodsDownloaderMiddleware': 543,
#}

# Enable or disable extensions