# Module-level imports required by this method:
# import random, time
# from scrapy import Request
# from scrapy.selector import Selector
def parseNum(self, response):
    # Declare the item
    infoItem = userInfoItem()
    # Record the uid being crawled and the crawl timestamp
    infoItem['uid'] = response.meta['uid']
    infoItem['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    # Extract the weibo count, follower count and following count
    sel = Selector(response)
    # Each field looks like u'微博[123]': strip the 3-character label and the closing bracket
    infoItem['weiboNum'] = int(sel.xpath('//div[@class="tip2"]/span/text()').extract_first()[3:-1])
    infoItem['fansNum'] = int(sel.xpath('//div[@class="tip2"]/a[1]/text()').extract_first()[3:-1])
    infoItem['conNum'] = int(sel.xpath('//div[@class="tip2"]/a[2]/text()').extract_first()[3:-1])
    # Follow the link to the detailed profile page
    url = response.meta['urlInfo']
    request = Request(url, meta={'infoItem': infoItem}, callback=self.parseInfo)
    request.cookies = random.choice(self.COOKIEPOOL)
    '''
    # A proxy could be set here as well, but there is no validation mechanism
    if len(self.IPPOOL) != 0:
        proxy = random.choice(self.IPPOOL)
        request.meta['proxy'] = proxy
    '''
    yield request
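The `[3:-1]` slices above depend on each label (微博 / 关注 / 粉丝) being exactly three characters wide before the bracketed count. A more defensive alternative, not part of the original spider, is to pull the number out with a regex; `extract_count` is a hypothetical helper name:

import re

def extract_count(text):
    # Pull the first bracketed integer out of strings like u'微博[123]'
    # or u'关注[45]', regardless of the label's length.
    match = re.search(r'\[(\d+)\]', text or '')
    return int(match.group(1)) if match else 0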
# Module-level imports assumed: from scrapy import Request,
# plus the project's own `ua` and `cookies` helper objects.
def process_request(self, request: Request, spider):
    # Called for each request that goes through the downloader middleware.
    # Set the request headers
    # request.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    request.headers['User-Agent'] = ua.get()
    request.headers['Sec-Fetch-Mode'] = 'cors'
    request.headers['Sec-Fetch-Site'] = 'none'
    request.headers['Sec-Fetch-User'] = '******'
    # Header values must be strings, not ints
    request.headers['Upgrade-Insecure-Requests'] = '1'
    # Note: cookies must be a dict
    request.cookies = cookies.get()
    # Set the request proxy ('http://ip:port' is a placeholder)
    request.meta['proxy'] = 'http://ip:port'
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    return None
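For Scrapy to actually call this middleware, it has to be registered in the project's settings. A minimal sketch, assuming the middleware above lives in `weibo/middlewares.py` under the class name `RandomHeaderMiddleware` (both names are assumptions, not from the source):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # The number controls ordering relative to Scrapy's built-in
    # downloader middlewares; 543 is the value used in the Scrapy docs.
    'weibo.middlewares.RandomHeaderMiddleware': 543,
}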
def process_request(self, request: Request, spider):
    # Called for each request that goes through the downloader middleware.
    # Set a random User-Agent header
    request.headers['User-Agent'] = user_agents.get_ua()
    # Set a random cookie (must be a dict)
    request.cookies = cookies.get_cookie()
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    return None
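The `user_agents.get_ua()` helper is not shown in the source. A minimal sketch of what such a module might look like, assuming a simple random pick from a hard-coded pool (the pool contents are illustrative; only the first entry comes from the snippet above):

# user_agents.py — hypothetical implementation of the helper assumed above
import random

_UA_POOL = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    # ... add more User-Agent strings here
]

def get_ua():
    # Return a random User-Agent string from the pool
    return random.choice(_UA_POOL)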
# Module-level imports required by this method:
# import random, time
# from scrapy import Request
# from .settings import REQUEST_INTREVAL, INPUTFILE_LOCATION
# from .cookies import cookieList
def start_requests(self):
    # Initialise the IP pool (deprecated)
    '''
    print('Initialising IPPOOL...')
    ipUrl = 'http://www.xicidaili.com/nn/'
    request = Request(ipUrl, callback=self.parseIP, dont_filter=True)
    yield request
    '''
    # Request interval, read from settings
    request_interval = REQUEST_INTREVAL
    # Input file location, read from settings
    inputfile_location = INPUTFILE_LOCATION
    # Counts how many user records have been read
    d = 0
    # Counter controlling IPPOOL refreshes (deprecated)
    # ipflushCount = 0

    print('Initialising the cookie pool...')
    print('Expected cookie pool size:', len(cookieList))
    cookiepool = []
    # cookies.cookieList holds raw, unprocessed cookie strings; parse each
    # one into a dict before storing it in the local COOKIEPOOL
    for string in cookieList:
        single_cookie = {}
        # Split the raw string on ';' to get 'key=value' fragments
        for str_spl in str(string).split(';'):
            # Split each fragment once on '=' into key and value
            # (maxsplit=1 so values that contain '=' survive)
            cookie_spl = str_spl.split('=', 1)
            # strip() removes whitespace around both parts
            key = cookie_spl[0].strip()
            value = cookie_spl[1].strip()
            # ALF and SSOLoginState must be ints, otherwise requests fail
            if key == 'ALF' or key == 'SSOLoginState':
                value = int(value)
            single_cookie[key] = value
        cookiepool.append(single_cookie)
        print('New cookie added to the pool...')
    # Copy the parsed cookies into COOKIEPOOL
    self.COOKIEPOOL = cookiepool
    print('Cookie pool initialised, size:', len(self.COOKIEPOOL))
    time.sleep(1.5)

    # Print some startup information
    print('\n------------------------------------\n')
    print('Current request rate: one request every', request_interval, 'seconds\n')
    print('Current input file location:', inputfile_location, '\n')
    print('------------------------------------\n')
    print('Starting the crawl...')
    time.sleep(1.5)

    # Read the uids from the input file
    with open(inputfile_location) as f:
        for id in f.readlines():
            # d only counts how many uids have been read, starting from 1
            d = d + 1
            # Sleep between requests (caps the rate, e.g. at most 4 per second)
            time.sleep(request_interval)
            # The IPPOOL counter increments once per record and is reset
            # whenever an IPPOOL refresh request is issued (deprecated)
            # ipflushCount = ipflushCount + 1
            # Refresh the proxy pool every 20 records (deprecated)
            '''
            if ipflushCount == 20:
                # Check whether a randomly chosen proxy from IPPOOL still works
                proxy = random.choice(self.IPPOOL)
                try:
                    protocol = str(proxy).split(':')[0]
                    proxies = {protocol: proxy}
                    # If the proxy responds normally, keep it
                    if requests.get('http://www.xicidaili.com/nn/', proxies=proxies, timeout=2).status_code == 200:
                        request.meta['proxy'] = proxy
                    else:
                        self.IPPOOL.remove(proxy)
                        print('Proxy', proxy, 'is dead and was removed; IPPOOL size:', len(self.IPPOOL))
                except:
                    # Remove the dead proxy and pick another one next time
                    self.IPPOOL.remove(proxy)
                    print('Proxy', proxy, 'is dead and was removed; IPPOOL size:', len(self.IPPOOL))
                # Re-crawl xicidaili to refill the IP pool
                request = Request(ipUrl, callback=self.parseIP, dont_filter=True)
                request.meta['proxy'] = proxy
                ipflushCount = 0
                yield request
            '''

            # The user's WAP homepage exposes the weibo/follower/following counts
            urlNum = 'https://weibo.cn/u/%d' % int(id)
            # The user's WAP profile page exposes the remaining fields
            urlInfo = 'https://weibo.cn/%d/info' % int(id)

            # If IPPOOL is not empty, take an anonymous proxy and validate it
            # before use (deprecated; same validation loop as the block above,
            # but probing http://www.baidu.com/ instead)
            '''
            while len(self.IPPOOL) > 0:
                proxy = random.choice(self.IPPOOL)
                try:
                    protocol = 'https' if 'https' in proxy else 'http'
                    proxies = {protocol: proxy}
                    if requests.get('http://www.baidu.com/', proxies=proxies, timeout=2).status_code == 200:
                        request.meta['proxy'] = proxy
                        print(request.meta['proxy'])
                        break
                    else:
                        self.IPPOOL.remove(proxy)
                        print('Proxy', proxy, 'is dead and was removed; IPPOOL size:', len(self.IPPOOL))
                except:
                    self.IPPOOL.remove(proxy)
                    print('Proxy', proxy, 'is dead and was removed; IPPOOL size:', len(self.IPPOOL))
                    continue
            '''

            # Build the request, carrying the uid and the profile-page URL
            request = Request(urlNum, meta={'uid': id, 'urlInfo': urlInfo}, callback=self.parseNum)
            # Attach a random cookie from the pool
            request.cookies = random.choice(self.COOKIEPOOL)
            # print(request.cookies)
            # Attach a random proxy (deprecated)
            '''
            if len(self.IPPOOL) > 0:
                request.meta['proxy'] = random.choice(self.IPPOOL)
            '''
            # Log progress and yield the request
            print('Reading uid #%d: %s' % (d, id.strip()))
            yield request
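The deprecated blocks above repeat essentially the same proxy-validation loop twice. If the IP pool were revived, both could be replaced by one standalone helper; the following is a minimal sketch under that assumption (`pick_live_proxy` is a hypothetical name, and `requests` is assumed to be installed):

import random
import requests

def pick_live_proxy(ippool, probe_url='http://www.baidu.com/', timeout=2):
    # Randomly probe proxies from the pool, removing dead ones, until a
    # live proxy is found or the pool is exhausted. Mirrors the logic of
    # the deprecated validation loops above.
    while ippool:
        proxy = random.choice(ippool)
        protocol = 'https' if 'https' in proxy else 'http'
        try:
            resp = requests.get(probe_url, proxies={protocol: proxy}, timeout=timeout)
            if resp.status_code == 200:
                return proxy
        except requests.RequestException:
            pass
        ippool.remove(proxy)
        print('Proxy %s is dead and was removed; pool size: %d' % (proxy, len(ippool)))
    return None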