class Spider:
    """Crawl Sogou Weixin search results for ``keyword`` and store articles.

    Requests are scheduled through a Redis-backed queue; pages are fetched
    with a shared requests Session (optionally through a proxy taken from
    the proxy pool), parsed with PyQuery, and article dicts go into MySQL.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Static headers (including a captured login Cookie) applied to every
    # request made on the shared session.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'ABTEST=8|1537520547|v1; SNUID=D071F262080D7E55EE03CCD708FECF45; IPLOC=CN3100; SUID=D879F5652930990A000000005BA4B3A3; SUID=D879F5652113940A000000005BA4B3A3; JSESSIONID=aaa45tYJMfZ24hiEC3Bvw; SUV=002D1DEC65F579D85BA4B3A4AC47E291',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Return a random proxy ("host:port" string) from the proxy pool, or None."""
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and enqueue the first index-page request."""
        # Apply headers globally so every request carries the Cookie.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + parse.urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse an index page.

        Yields a detail-page WeixinRequest for every article link, then a
        request for the next index page when a "next" link exists.
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            yield WeixinRequest(url=url, callback=self.parse_detail)
        # Named next_href so the builtin next() is not shadowed.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            yield WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)

    def parse_detail(self, response):
        """Parse an article page into a dict (title/content/date/author/account)."""
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        print(data)
        yield data

    def request(self, weixin_request):
        """Send ``weixin_request``, through a pool proxy when it requires one.

        :return: the Response, or False on connection/timeout errors so the
            scheduler treats the attempt as a failure.
        """
        try:
            kwargs = {
                'timeout': weixin_request.timeout,
                # Anti-spider redirects are treated as failures, not followed.
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
            return self.session.send(weixin_request.prepare(), **kwargs)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record one failure and re-queue until MAX_FAILED_TIME is reached."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: fetch each request, run its callback, and route
        results (new WeixinRequests back to the queue, dicts into MySQL)."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler loop."""
        self.start()
        self.schedule()
class Spider:
    """Crawl Sogou Weixin search results for ``keyword`` and store articles.

    Variant that also accepts 301 responses as valid and follows redirects
    on non-proxy requests.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Static headers (including a captured login Cookie) for every request.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'SUV=00670A2227B2ABC254DA3D7FD4C0C627; ssuid=8733545508; SUID=07C4CF8C771C900A54BC59A80005FA70; CXID=A448FA79C6A209243C989A4E8B73B7D8; ad=CWjGyZllll2bUpBolllllVsBwh9lllllhX0$pZllll9lllll4h7ll5@@@@@@@@@@; sw_uuid=2614289524; sg_uuid=828389127; pex=C864C03270DED3DD8A06887A372DA219231FFAC25A9D64AE09E82AED12E416AC; IPLOC=CN3100; ABTEST=1|1542847064|v1; SNUID=5EB02193595C226F4A6FCFB45A39C0AD; weixinIndexVisited=1; JSESSIONID=aaaH6l1zlTIDSfIvo71Bw; ppinf=5|1543052130|1544261730|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo2OmZhbmZhbnxjcnQ6MTA6MTU0MzA1MjEzMHxyZWZuaWNrOjY6ZmFuZmFufHVzZXJpZDo0NDpvOXQybHVFbWMtN0pFVEVBcG9fVExPZFRWZ25JQHdlaXhpbi5zb2h1LmNvbXw; pprdig=KsCn3kz7vFkBhBco6qspL2OfbKIVQayZoSqYAcBkWHZ2q28BR-A6z5u_JvcFoZCcdUsBpYHyVjPKLmk78v97jTSiiC4eT8Kgwy5F8p1Cv-KjkzWwYJ2iCIPX1jTrizrKOE5k6me2ap-WYBYIGSx5Xb0pkU7TlfgtWCrvk3mSNak; sgid=12-38089007-AVv5G2Kr8FBOCPPiccG54jibQ; ppmdig=1543052130000000812df3126ffd3a9ba45c73998d7660ec',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def __init__(self):
        # Running article id counter (currently only referenced in the
        # commented-out 'id' field of parse_detail).
        self.id = 1

    def get_proxy(self):
        """Return a random proxy ("host:port" string) from the proxy pool, or None."""
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and enqueue the first index-page request."""
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: fetch each request, run its callback, and route
        results (new WeixinRequests back to the queue, dicts into MySQL)."""
        # Hoisted out of the loop: 301 is additionally accepted as a valid
        # status in this variant.
        acceptable_statuses = VALID_STATUSES + [301]
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            print(response)
            if response and response.status_code in acceptable_statuses:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            print(result)
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def request(self, weixin_request):
        """Send ``weixin_request``, through a pool proxy when it requires one.

        :return: the Response, or False on connection/timeout errors.

        NOTE(review): the proxy path uses allow_redirects=False while the
        direct path uses allow_redirects=True — preserved as-is, but this
        asymmetry looks unintentional; confirm before unifying.
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=True)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record one failure and re-queue until MAX_FAILED_TIME is reached."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse an index page.

        Yields a detail-page WeixinRequest for every article link, then a
        request for the next index page when a "next" link exists.
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            yield WeixinRequest(url=url, callback=self.parse_detail)
        # Named next_href so the builtin next() is not shadowed.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            yield WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)

    def parse_detail(self, response):
        """Parse an article page into a dict (title/content/date/author/account)."""
        doc = pq(response.text)
        data = {
            # 'id': self.id,
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def run(self):
        """Entry point: seed the queue, then run the scheduler loop."""
        self.start()
        self.schedule()
class Spider:
    """Crawl Sogou Weixin search results for ``keyword`` and store articles.

    Variant with a cookie-refresh hook (``update_cookie``) and a separate
    200-response handler (``handleResponse200``).
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Static headers (including a captured login Cookie) for every request.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'ABTEST=0|1553070123|v1; SNUID=344B814B7773F2F4278902CD77E51845; IPLOC=CN1100; SUID=423DF73C4631990A000000005C91F82B; SUID=266252D25F20940A000000005C91F82B; JSESSIONID=aaaIO6MIIk4aC_XhZM-Lw; SUV=002F49023CF73D425C91F82BA9EB8957',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def update_cookie(self):
        """Refresh the Cookie header from the external cookie source."""
        self.headers['Cookie'] = get_cookie()

    def get_proxy(self):
        """Return a random proxy ("host:port" string) from the proxy pool, or None."""
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and enqueue the first index-page request."""
        # Apply headers globally; call update_cookie() first if the captured
        # Cookie has expired.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({
            'query': self.keyword, 'type': 2, 'sut': 7956,
            'lkt': '1%2C1553052272863%2C1553052272863', 's_from': 'input',
            '_sug_': 'y', 'sst0': '1553052272967', 'ie': 'utf8',
            'w': '01019900', 'dr': '1'})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse an index page.

        Yields a detail-page WeixinRequest for every article link, then a
        request for the next index page when a "next" link exists.
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            yield WeixinRequest(url=url, callback=self.parse_detail)
        # Named next_href so the builtin next() is not shadowed.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            yield WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)

    def parse_detail(self, response):
        """Parse an article page into a dict (title/content/date/author/account)."""
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Send ``weixin_request``, through a pool proxy when it requires one.

        :return: the Response, or None on connection/timeout errors.
        """
        try:
            kwargs = {
                'timeout': weixin_request.timeout,
                # Anti-spider redirects are treated as failures, not followed.
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
            return self.session.send(weixin_request.prepare(), **kwargs)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return None

    def error(self, weixin_request):
        """Record one failure and re-queue until MAX_FAILED_TIME is reached."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: fetch each request and hand valid responses to
        handleResponse200; everything else counts as a failure."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                self.handleResponse200(response, weixin_request)
            else:
                self.error(weixin_request)

    def handleResponse200(self, response, weixin_request):
        """Run the request's callback and route its results: WeixinRequests
        back to the queue, article dicts into MySQL; no results = failure."""
        callback = weixin_request.callback
        results = list(callback(response))
        if results:
            for result in results:
                print('New Result', type(result))
                if isinstance(result, WeixinRequest):
                    self.queue.add(result)
                if isinstance(result, dict):
                    self.mysql.insert('articles', result)
        else:
            self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler loop."""
        self.start()
        self.schedule()
class Spider:
    """Crawl Sogou Weixin search results for ``keyword`` and store articles.

    Requests flow through a Redis-backed queue; pages are fetched with a
    shared Session (optionally proxied), parsed with PyQuery, and article
    dicts are inserted into MySQL.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Static headers (including a captured login Cookie) for every request.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'IPLOC=CN1100; SUID=6FEDCF3C541C940A000000005968CF55; SUV=1500041046435211; ABTEST=0|1500041048|v1; SNUID=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; weixinIndexVisited=1; JSESSIONID=aaar_m7LEIW-jg_gikPZv; ld=Wkllllllll2BzGMVlllllVOo8cUlllll5G@HbZllll9lllllRklll5@@@@@@@@@@; LSTMV=212%2C350; LCLKINT=4650; ppinf=5|1500042908|1501252508|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8Y3J0OjEwOjE1MDAwNDI5MDh8cmVmbmljazo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8dXNlcmlkOjQ0Om85dDJsdUJfZWVYOGRqSjRKN0xhNlBta0RJODRAd2VpeGluLnNvaHUuY29tfA; pprdig=ppyIobo4mP_ZElYXXmRTeo2q9iFgeoQ87PshihQfB2nvgsCz4FdOf-kirUuntLHKTQbgRuXdwQWT6qW-CY_ax5VDgDEdeZR7I2eIDprve43ou5ZvR0tDBlqrPNJvC0yGhQ2dZI3RqOQ3y1VialHsFnmTiHTv7TWxjliTSZJI_Bc; sgid=27-27790591-AVlo1pzPiad6EVQdGDbmwnvM; PHPSESSID=mkp3erf0uqe9ugjg8os7v1e957; SUIR=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; sct=11; ppmdig=1500046378000000b7527c423df68abb627d67a0666fdcee; successCount=1|Fri, 14 Jul 2017 15:38:07 GMT',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Return a random proxy ("host:port" string) from the proxy pool, or None."""
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and enqueue the first index-page request."""
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse an index page.

        Yields a detail-page WeixinRequest for every article link, then a
        request for the next index page when a "next" link exists.
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            yield WeixinRequest(url=url, callback=self.parse_detail)
        # Named next_href so the builtin next() is not shadowed.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            yield WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)

    def parse_detail(self, response):
        """Parse an article page into a dict (title/content/date/author/account)."""
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Send ``weixin_request``, through a pool proxy when it requires one.

        :return: the Response, or False on connection/timeout errors.
        """
        try:
            kwargs = {
                'timeout': weixin_request.timeout,
                # Anti-spider redirects are treated as failures, not followed.
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
            return self.session.send(weixin_request.prepare(), **kwargs)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record one failure and re-queue until MAX_FAILED_TIME is reached."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: fetch each request, run its callback, and route
        results (new WeixinRequests back to the queue, dicts into MySQL)."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler loop."""
        self.start()
        self.schedule()
class Spider:
    """Crawl Sogou Weixin search results for KEYWORD and store articles.

    Uses a shared Session for requests, a Redis-backed queue for scheduling,
    and MySQL for storage.
    """

    # Fixed: 'sougou' typo -> 'sogou' (Host/Referer headers below confirm the
    # real domain), and the trailing '?' is dropped — start() appends its own
    # '?', which previously produced a double '?' in every URL.
    base_url = 'https://weixin.sogou.com/weixin'
    keyword = KEYWORD
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': 'CXID=68999D20535A955E54EEB369EEBDAA87; SUID=7D0481DF3565860A5B922DAB00041476; '
                  'SUV=00724ADFDF81047D5B9390FE3CE03520; ad=Ukllllllll2b6ALrlllllVmUX@1lllllTc99Kyllll'
                  '9llllljylll5@@@@@@@@@@; IPLOC=CN5101; ABTEST=0|1536564030|v1; weixinIndexVisited=1; '
                  'SNUID=6AEE6B35EBEE9D9F5957A098EBEC0DF0; sct=1; JSESSIONID=aaaqsTn37HldSeg_akWyw; '
                  'ppinf=5|1538793682|1540003282|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo0NTol'
                  'RTYlODUlQTIlRTYlODUlQTIlRTYlODUlQTIlRTYlOTclQjYlRTUlODUlODl8Y3J0OjEwOjE1Mzg3OTM2ODJ8cmVm'
                  'bmljazo0NTolRTYlODUlQTIlRTYlODUlQTIlRTYlODUlQTIlRTYlOTclQjYlRTUlODUlODl8dXNlcmlkOjQ0Om85d'
                  'DJsdURabHBHRjJ1TF9vbGtrV01MbTlHWFFAd2VpeGluLnNvaHUuY29tfA; pprdig=YXVgbs0p9dU4aBgDw7V_id'
                  'ljKjCcGiXgeUpafLd_FO65GO0AMS3VWq_ogoKBR7XpAChV9r3DxwwMN_lwgpTwjbT4al7JXyKKOua-q3IoMvfo2KwI1'
                  'sXoNQKlyuxomXov9kuvMJkAHq4x6HCYOtsNhkW92H_acgTIeDo65hnDIbc; sgid=15-37413245-AVu4ININKITuO'
                  '1IBrovHceA; ppmdig=153880606700000019649cd69fcbff1cb91d0c6884906b6b; LSTMV=469%2C259; LCLKINT=5007',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        # Anti-hotlinking: the server checks Referer, so a plausible one is sent.
        'Referer': 'https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page=17&ie=utf8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Shared objects: request execution, proxy/queue scheduling, storage.
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Return a random proxy ("host:port" string) from the proxy pool, or None.

        Fixed: the original signature was ``get_proxy(url)`` — the first
        parameter received ``self`` only by accident of the bound call.
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and enqueue the first index-page request."""
        # Apply headers globally so all requests carry the Cookie.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse an index page.

        Yields a detail-page WeixinRequest for every article link, then a
        request for the next index page when a "next" link exists.
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            yield WeixinRequest(url=url, callback=self.parse_detail)
        # Named next_href so the builtin next() is not shadowed.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            yield WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)

    def parse_detail(self, response):
        """Parse an article page into a dict (title/content/date/author/account)."""
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#publish_time').text(),
            'nickname': doc('#meta_content > span.rich_media_meta.rich_media_meta_text').text(),
            'wechat': doc('#profileBt > #js_name').text()
        }
        # Dict results are routed to MySQL by schedule().
        yield data

    def request(self, weixin_request):
        """Send ``weixin_request`` as a PreparedRequest, proxied when required.

        :return: the Response, or False on connection/timeout errors.
        """
        try:
            kwargs = {
                'timeout': weixin_request.timeout,
                # Anti-spider redirects are treated as failures, not followed.
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
            return self.session.send(weixin_request.prepare(), **kwargs)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record one failure and re-queue until MAX_FAILED_TIME is reached."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: fetch each request, run its callback, and route
        results (new WeixinRequests back to the queue, dicts into MySQL).

        Fixed: the loop condition was ``not self.queue.empty`` — a bound
        method, always truthy, so the loop body never ran. It must be called.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUS:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', result)
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler loop."""
        self.start()
        self.schedule()
class Spider:
    """Crawl Sogou Weixin search results for ``keyword`` and store articles.

    BeautifulSoup-based variant; requests are modelled as WeixinRequset
    objects (class name spelled as imported) scheduled via a Redis queue.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Static headers (including a captured login Cookie) for every request.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'CXID=1B446AEB1D516FD5C765CE37868DCA1D; SUID=8C9E10743565860A5B4C06780003B8C5; SUV=00504AE374109E5F5B4D680A993BF652; ABTEST=0|1531819884|v1; IPLOC=CN4451; SNUID=D71998F3888DF6A614EEF7B3889DAD9A; JSESSIONID=aaahykvTDvD8R-ZaY7Gsw; ad=Iyllllllll2bFlBclllllVH2CFYlllllNxjW0lllll9lllll9ylll5@@@@@@@@@@; weixinIndexVisited=1; sct=1; ppinf=5|1531884187|1533093787|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTowOnxjcnQ6MTA6MTUzMTg4NDE4N3xyZWZuaWNrOjA6fHVzZXJpZDo0NDpvOXQybHVDdzFUdmVWN2tVaTEyRk5WcndlLWV3QHdlaXhpbi5zb2h1LmNvbXw; pprdig=JKKPygKv7H7Co6cneyKbsXQ8QUUG1LwoV5MgwIaMBBFBV2rEuyGbkhtgql1UoHqJz5SyY5mCnPPUWZqtD6pgw--LWoYRTXTqCeyNamhAEW4EVYs5XW_MLh0OcUkXgv7DjiwCqNj3F7bxIpOjyE_RKMiBAP_OHlBre9MtNgwQwOs; sgid=25-36102587-AVtOsptgowqxnSJe555xWxw; ppmdig=153190021100000074f921db30dd23e8562dd7af293cd8fb',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:54.0) Gecko/20100101 Firefox/54.0'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Return a random proxy ("host:port" string) from the proxy pool, or None."""
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def parse_index(self, response):
        """Parse an index page.

        Yields a detail-page WeixinRequset for every article link, then a
        request for the next index page when a "next" link exists.
        """
        doc = BeautifulSoup(response.text, 'lxml')
        for item in doc.select('h3 a'):
            url = item['href']
            yield WeixinRequset(url=url, callback=self.parse_detail)
        print(response.text)
        # select() returns a list; the original indexed [0] unconditionally,
        # raising IndexError on the last results page. Guard instead.
        next_nodes = doc.select('#sogou_next')
        next_href = next_nodes[0].get('href') if next_nodes else None
        print(next_href)
        if next_href:
            url = self.base_url + str(next_href)
            yield WeixinRequset(url=url, callback=self.parse_index, need_proxy=True)

    def parse_detail(self, response):
        """Parse an article page into a dict (title/content/date/author/account).

        Fixed: the original applied PyQuery-style calls (``doc('.x').text()``)
        to a BeautifulSoup object, which raises at runtime; CSS lookups now
        use ``select_one`` and ``get_text``.
        """
        doc = BeautifulSoup(response.text, 'lxml')

        def text_of(selector):
            # Return '' for a missing node instead of raising.
            node = doc.select_one(selector)
            return node.get_text() if node else ''

        data = {
            'title': text_of('.rich_media_title'),
            'content': text_of('.rich_media_content'),
            'date': text_of('#post-date'),
            'nickname': text_of('#js_profile_qrcode > div > strong'),
            'wechat': text_of('#js_profile_qrcode > div > p:nth-child(3) > span')
        }
        yield data

    def start(self):
        """Initialise the session headers and enqueue the first index-page request."""
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequset(url=start_url, callback=self.parse_index, need_proxy=True)
        self.queue.add(weixin_request)

    def request(self, weixin_request):
        """Send ``weixin_request``, through a pool proxy when it requires one.

        :return: the Response, or False on connection errors.
        """
        try:
            kwargs = {
                'timeout': weixin_request.timeout,
                # Anti-spider redirects are treated as failures, not followed.
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy,
                    }
            return self.session.send(weixin_request.prepare(), **kwargs)
        except ConnectionError as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record one failure and re-queue until MAX_FAILED_TIME is reached."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: fetch each request, run its callback, and route
        results (new WeixinRequsets back to the queue, dicts into MySQL)."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            print(response)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequset):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler loop."""
        self.start()
        self.schedule()
class Spider:
    """Crawl Sogou Weixin search results for ``keyword`` and store articles.

    Variant that extracts the publish date from the page's embedded
    JavaScript and currently fetches without proxies.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = '吃鸡'
    # Static headers (including a captured login Cookie) for every request.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'ABTEST=0|1531205382|v1; IPLOC=CN1100; SUID=4DFF7B7B2930990A000000005B445706; SUID=4DFF7B7B6119940A000000005B445706; weixinIndexVisited=1; SUV=000927B97B7BFF4D5B445710B98DF777; sct=1; SNUID=E250D5D4AEAADE3039217104AF02D5BA; JSESSIONID=aaaYi9nbXUS5thab6Mgrw',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Return a random proxy ("host:port" string) from the proxy pool, or None."""
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and enqueue the first index-page
        request (proxy use disabled, generous 15s timeout)."""
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index,
                                       need_proxy=False, timeout=15)
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse an index page.

        Yields a detail-page WeixinRequest for every article link, then a
        request for the next index page when a "next" link exists.
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            yield WeixinRequest(url=url, callback=self.parse_detail)
        # Named next_href so the builtin next() is not shadowed.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            yield WeixinRequest(url=url, callback=self.parse_index, need_proxy=False)

    def parse_detail(self, response):
        """Parse an article page into a dict.

        The publish date is taken from the ``publish_time = "..."`` snippet
        embedded in the page's JavaScript; the '#publish_time' element holds
        no text in the raw HTML.
        """
        doc = pq(response.text)
        date = re.findall('publish_time = "(.*?)"', response.text, re.S)
        if date:
            data = {
                'title': doc('.rich_media_title').text(),
                'content': doc('.rich_media_content').text(),
                'date': date[0],
                'nickname': doc('#js_profile_qrcode > div > strong').text(),
                'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
            }
            yield data
        else:
            # NOTE(review): when the regex finds no date the record is only
            # printed, never yielded/stored — preserved as-is; confirm whether
            # these pages should be saved too.
            data = {
                'title': doc('.rich_media_title').text(),
                'content': doc('.rich_media_content').text(),
                'date': doc('#publish_time').text(),
                'nickname': doc('#js_profile_qrcode > div > strong').text(),
                'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
            }
            print(data)

    def request(self, weixin_request):
        """Send ``weixin_request`` on the shared session.

        Proxy usage is currently disabled: a proxy is still fetched when
        need_proxy is set (keeping the pool warm), but the request is sent
        directly in both branches.

        :return: the Response, or False on connection/timeout errors.
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record one failure and re-queue until MAX_FAILED_TIME is reached."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: fetch each request, run its callback, and route
        results (new WeixinRequests back to the queue, dicts into MySQL).

        NOTE(review): fetching bypasses self.request() and uses a plain
        requests.get without the session headers — preserved as-is.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = requests.get(url=weixin_request.url, timeout=50)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler loop."""
        self.start()
        self.schedule()
class Articles():
    """
    Re-crawl WeChat article pages whose URLs wait in a Redis queue, parse
    them and store the results in MySQL, rotating proxies from a Redis pool.
    """
    headers = {
        'Host': 'mp.weixin.qq.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://mp.weixin.qq.com/s?src=11×tamp=1543224952&ver=1268&signature=n0EW*NEa73Cd39RmRKfaYPU5NUDuN5X6eypDap*--nQ913dIIe3i8EcRnyd7PptsjOAKzDVuI*ikSsioBg0*zMGPbB27CUrORDvEMav2hvZHp2tFF3V4cNyl09Cr73Rl&new=1',
        'Cookie': 'rewardsn=; wxtokenkey=777',
        'Connection': 'keep-alive',
    }
    redis = RedisClient()
    mysql = MySQL()
    # Proxy mapping currently in use; None means "connect directly".
    proxies = None

    def test_proxy(self):
        """
        Pick a random proxy from the pool and keep retrying (recursively)
        until one can reach mp.weixin.qq.com; store it in Articles.proxies.
        """
        url = 'https://mp.weixin.qq.com'
        proxy = self.redis.weixin_proxy_random()
        # BUG FIX: the original declared `global proxies` here while start()
        # read the module-level name before any assignment existed, raising
        # NameError; the class attribute declared above was never used.
        # Keep the proxy mapping on the class instead.
        Articles.proxies = {
            'http': 'http://' + proxy,
            'https': 'https://' + proxy
        }
        try:
            r = requests.get(url, headers=self.headers, allow_redirects=False,
                             proxies=Articles.proxies, timeout=30)
            if r.status_code != 200:
                # Proxy answered but is not usable: demote it and try another.
                self.redis.weixin_proxy_decrease(proxy)
                self.test_proxy()
        except requests.RequestException:
            # BUG FIX: the original bare `except:` swallowed everything,
            # including KeyboardInterrupt; only network-level failures
            # should trigger a proxy rotation.
            self.redis.weixin_proxy_decrease(proxy)
            self.test_proxy()

    def start(self):
        """
        Pop article URLs from the Redis request queue and crawl each one,
        re-queueing the URL and rotating the proxy on failure.
        """
        while not self.redis.request_empty():
            url = self.redis.request_pop()
            try:
                response = requests.get(url, headers=self.headers,
                                        proxies=Articles.proxies,
                                        allow_redirects=True, timeout=20)
                print('正在爬取:', url)
                print(response.status_code)
                # BUG FIX: the original tested `VALID_STATUSE` (typo) where
                # every other class in this file uses VALID_STATUSES.
                if response and response.status_code in VALID_STATUSES:
                    print('status_code:200')
                    self.parse_detail(response)
                else:
                    self.test_proxy()
                    self.redis.request_add(url)
            except requests.RequestException:
                self.test_proxy()
                self.redis.request_add(url)

    def parse_detail(self, response):
        """
        Parse a WeChat article detail page and insert it into MySQL.

        :param response: detail-page response
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        # NOTE(review): the dict above always has five keys, so the original
        # `if not len(data) == 0` guard was vacuous; kept as a truthiness
        # check for parity with the original control flow.
        if data:
            self.mysql.insert('articles', data)
class Spider(object):
    """
    Crawl Sogou's WeChat search for a keyword: schedule index pages, follow
    article links, parse article details and store them in MySQL.
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'SUV=0019178D7139E4CF5A98E24886E85289; SUID=9AF49C733120910A000000005AAA1750; ABTEST=8|1530090681|v'
                  '1;IPLOC=CN4210; weixinIndexVisited=1; SNUID=612CF6BECECAA052EA3A6F50CF791601; JSESSIONID=aaa0_Q-S5EK'
                  '7BcKF4S7qw; ppinf=5|1530098114|1531307714|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTQlQk'
                  'QlOTglRTUlODElQTUlRTUlQTQlQUJ8Y3J0OjEwOjE1MzAwOTgxMTR8cmVmbmljazoyNzolRTQlQkQlOTglRTUlODElQTUlRTUlQT'
                  'QlQUJ8dXNlcmlkOjQ0Om85dDJsdU9JbjdQV1F4UC1wQV90UDhDR2dLdE1Ad2VpeGluLnNvaHUuY29tfA; pprdig=RnSWD7qnomx'
                  'sf-V3yOXjt7Jk9zZwYYiXZKsByre9tciFYGNqAreHjU1paH2_7j9yUJDAxxdJZ4rfTI8EwIRhK_rDckoa0PcwrKB2UzA2ou--Ddl'
                  'KELNDgr-2EOPJ5BdDFBgJh84r7fsZC2SQBEGzB0kqxVboX6ZzSXdGS86hlRE; sgid=23-35736769-AVszccJNic84GSkZY3bek'
                  'BMY; ppmdig=15300981140000009a7459d5a937b96a96b9db16b616914a; sct=3',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """
        Fetch one proxy from the proxy pool.

        :return: the proxy string, or None on failure
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """
        Initialise: install the shared headers on the session, then enqueue
        the first index-page request.
        """
        # Apply the headers to every request made through this session.
        self.session.headers.update(self.headers)
        # BUG FIX: the original split this expression across two physical
        # lines without continuation, stranding `+ urlencode(...)` as a
        # unary-plus statement and leaving start_url without a query string.
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """
        Parse an index (search-result) page.

        :param response: index-page response
        :return: yields a WeixinRequest for every article detail page and
                 one for the next index page, if any
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # Follow the "next page" link when present.
        next = doc('#sogou_next').attr('href')
        if next:
            url = self.base_url + str(next)
            weixin_request = WeixinRequest(url=url, callback=self.parse_index,
                                           need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """
        Parse an article detail page.

        :param response: detail-page response
        :return: yields a dict describing one WeChat article
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            # BUG FIX: the original had `'data': doc('#post-data')` — both
            # sibling classes in this file (and the MySQL 'articles' insert)
            # use the 'date' key and the '#post-date' selector.
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """
        Execute a request, via a pool proxy when the request asks for one.

        :param weixin_request: request to send
        :return: the response, or False on connection error / read timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        Handle a failed request: count the failure and re-queue it while the
        request is still under MAX_FAILED_TIME attempts.

        :param weixin_request: the request that failed
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        Drain the queue: send each request, dispatch its callback, enqueue
        new requests it yields and store any article dicts in MySQL.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """
        Entry point: initialise, then run the scheduler loop.
        """
        self.start()
        self.schedule()