class Sender(threading.Thread):
    """Background thread that posts queued energy messages to the API.

    Messages are read from two ``RedisQueue`` instances named ``'retry'``
    and ``'normal'``; the retry queue is always consulted first. Batches
    that could not be stored are written back to the retry queue. Status
    changes are reported as dicts on ``status_queue``.
    """

    # Upper bound of messages taken from a queue per batch. The historical
    # read loop stopped once *more than* 30 messages had been collected,
    # i.e. after 31 messages; that limit is kept as-is.
    MAX_BATCH_SIZE = 31

    def __init__(self, status_queue, stop_event, config):
        """Set up queues and API settings.

        Args:
            status_queue: Queue-like object; status dicts are ``put`` on it.
            stop_event: Event-like object (``is_set``/``set``) used to
                request termination of the thread.
            config: Mapping providing ``"api_url"``, ``"key"`` and
                ``"console_mode"`` (the string ``"true"`` enables it).
        """
        super(Sender, self).__init__()
        self.normal_data_queue = RedisQueue('normal')
        self.retry_data_queue = RedisQueue('retry')
        self.status_queue = status_queue
        self.stop_event = stop_event
        self.base_url = config["api_url"]
        self.key = config["key"]
        self.store_energy_url = self.base_url + "/v2/energy"
        self.backup_file = "backup"
        self.console_mode = config["console_mode"] == "true"
        self.connected = False

    def run(self):
        """Connect to the API and forward queued batches until stopped."""
        self.send_message_to_listeners(
            Status.RUNNING, description="Sender has been started")
        while not self.stop_event.is_set():
            if not self.connected:
                self.connect_to_api()
            # The stop event is checked here as well: otherwise a connected
            # thread with empty queues would spin in this loop forever and
            # could never be terminated.
            while self.connected and not self.stop_event.is_set():
                retry_data = self.read_messages_from_retry_queue()
                if retry_data:
                    self.send_data_to_api(retry_data)
                    break
                normal_data = self.read_messages_from_normal_queue()
                if normal_data:
                    self.send_data_to_api(normal_data)
                    break
                time.sleep(1)
            time.sleep(5)
        self.send_message_to_listeners(
            Status.STOPPED, description="Sender has been terminated")

    def _read_batch(self, data_queue):
        """Pop and JSON-decode up to ``MAX_BATCH_SIZE`` messages.

        Args:
            data_queue: Queue-like object offering ``empty()`` and ``get()``
                whose items are UTF-8 encoded JSON byte strings.

        Returns:
            List of decoded messages (empty when the queue is empty).
        """
        batch = []
        while not data_queue.empty():
            raw_message = data_queue.get()
            batch.append(json.loads(raw_message.decode('utf-8')))
            if len(batch) >= self.MAX_BATCH_SIZE:
                break
        return batch

    def read_messages_from_retry_queue(self):
        """Return the next batch of decoded messages from the retry queue."""
        return self._read_batch(self.retry_data_queue)

    def read_messages_from_normal_queue(self):
        """Return the next batch of decoded messages from the normal queue."""
        return self._read_batch(self.normal_data_queue)

    def connect_to_api(self):
        """Probe ``base_url`` and update ``self.connected`` accordingly.

        The thread counts as connected only when the GET request answers
        with HTTP 200. A ``requests`` connection error is reported to the
        listeners and leaves the thread disconnected.
        """
        try:
            response = requests.get(self.base_url)
        except requests.exceptions.ConnectionError:
            self.connected = False
            self.send_message_to_listeners(
                Status.RUNNING, Error.SERVER_UNREACHABLE,
                "Could not connect to the server")
            return
        self.connected = response.status_code == requests.codes.ok
        if self.connected:
            self.send_message_to_listeners(
                Status.RUNNING,
                description="Connected to server running on {}".format(
                    self.base_url))

    def send_data_to_api(self, messages):
        """POST a batch of messages to the energy endpoint.

        On HTTP 201 the batch is considered stored. In every other case
        (connection error, 401, any other status) the messages are written
        back to the retry queue so they are not lost. A 401 additionally
        sets the stop event; a connection error marks the thread as
        disconnected.

        Args:
            messages: List of JSON-serialisable messages.
        """
        headers = {
            'Content-type': 'application/json',
            'Accept': 'application/json'
        }
        payload = json.dumps({'data': messages, "rpi_key": self.key})
        try:
            response = requests.post(
                self.store_energy_url, data=payload, headers=headers)
        except requests.exceptions.ConnectionError:
            self.send_message_to_listeners(
                Status.RUNNING, Error.SERVER_UNREACHABLE,
                "Could not reach the server")
            self.connected = False
        else:
            if response.status_code == requests.codes.created:
                if self.console_mode:
                    self.send_message_to_listeners(
                        Status.RUNNING,
                        description="Succesfully stored energy data")
                return
            if response.status_code == requests.codes.unauthorized:
                self.send_message_to_listeners(
                    Status.STOPPED, Error.UNAUTHORIZED,
                    "Could not authorize with given key")
                self.stop_event.set()
        # Reached only when the batch was not stored: keep it for a retry.
        for message in messages:
            self.retry_data_queue.put(json.dumps(message))

    def send_message_to_listeners(self, status, error=None, description=None):
        """Put a status message for this thread on ``status_queue``.

        Args:
            status: Status value stored under ``"status"``.
            error: Optional error value; stored under ``"error"`` when given.
            description: Optional text; stored under ``"description"`` when
                given.
        """
        message = dict()
        message["thread"] = Thread.SENDER
        message["status"] = status
        if error is not None:
            message["error"] = error
        # The original tested ``message`` here, which is never None.
        if description is not None:
            message["description"] = description
        self.status_queue.put(message)
class Spider(object):
    """Crawler for the Sogou WeChat article search.

    Requests are kept in a ``RedisQueue``; each request carries the callback
    that parses its response. Callbacks yield either further
    ``WeixinRequest`` objects (queued) or dicts (inserted into the
    ``articles`` table through ``Mysql``).
    """

    def __init__(self):
        """Create the HTTP session, the request queue and the DB handle."""
        self.base_url = "https://weixin.sogou.com/weixin"
        self.keyword = KEY
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;'
                      'q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': COOKIES,
            'Host': 'weixin.sogou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/73.0.3683.86 Safari/537.36',
        }
        self.session = Session()
        self.queue = RedisQueue()
        self.mysql = Mysql()

    def start(self):
        """Queue the first search-result page for the configured keyword."""
        # NOTE: the headers are deliberately NOT applied to the session
        # (``self.session.headers.update(self.headers)``). According to the
        # original author's note, doing so makes later requests come back
        # as 302 redirects, so the headers are attached to the individual
        # WeixinRequest instead.
        param = {'query': self.keyword, 'type': 2}
        start_url = self.base_url + '?' + urlencode(param)
        weixin_request = WeixinRequest(
            url=start_url, headers=self.headers,
            callback=self.parse_index, need_proxy=True)
        self.queue.add(weixin_request)

    def schedule(self):
        """Process queued requests until the queue is empty.

        A request counts as failed (see ``error``) when no response was
        obtained, when the status code is not in ``VALID_STATUS``, or when
        its callback yields nothing.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print("SChedule", weixin_request.url)
            response = self.request(weixin_request)
            if not (response and response.status_code in VALID_STATUS):
                self.error(weixin_request)
                continue
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue
            for result in results:
                print('New Request', result)
                if isinstance(result, WeixinRequest):
                    self.queue.add(result)
                if isinstance(result, dict):
                    self.mysql.insert('articles', result)

    def request(self, request):
        """Send one request through the session, optionally via a proxy.

        Sleeps one second before every request. When the request asks for a
        proxy and ``get_proxy`` returns one, the request is sent through it;
        otherwise it is sent directly.

        Args:
            request: Object exposing ``need_proxy``, ``timeout``,
                ``allow_redirects`` and ``prepare()``.

        Returns:
            The response from ``session.send``, or ``False`` when a
            ``ConnectionError`` or ``ReadTimeout`` was raised.
        """
        time.sleep(1)
        try:
            if request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + str(proxy),
                        'https': 'https://' + str(proxy)
                    }
                    return self.session.send(
                        request.prepare(),
                        timeout=request.timeout,
                        allow_redirects=request.allow_redirects,
                        proxies=proxies)
            return self.session.send(
                request.prepare(),
                timeout=request.timeout,
                allow_redirects=request.allow_redirects)
        except (ConnectionError, ReadTimeout) as e:
            print(e)
            return False

    def get_proxy(self):
        """Fetch a proxy address from ``PROXY_URL``.

        Returns:
            The body text of the first HTTP 200 response, or ``False`` when
            five consecutive attempts did not answer with 200.
        """
        for _ in range(5):
            response = requests.get(PROXY_URL)
            if response.status_code == 200:
                return response.text
        return False

    @staticmethod
    def _previous_page_referer(url):
        """Return ``url`` with its ``page=N`` parameter rewritten to N - 1.

        The page number is taken from the first ``page=`` occurrence. When
        ``url`` has no ``page=`` parameter it is returned unchanged instead
        of raising ``IndexError``.
        """
        page_numbers = re.findall(r'page=(\d+)', url)
        if not page_numbers:
            return url
        previous_page = int(page_numbers[0]) - 1
        return re.sub(r'page=\d+', 'page=' + str(previous_page), url)

    def parse_index(self, response):
        """Parse a search-result page.

        Yields:
            One ``WeixinRequest`` per article link (``h3 a`` elements, URL
            taken from the ``data-share`` attribute) and, when a
            ``#sogou_next`` link exists, one request for the next result
            page whose referer is the preceding page.
        """
        doc = pq(response.text)
        for item in doc('h3 a').items():
            url = item.attr('data-share')
            if not url:
                # No share URL on this link: a request without a URL could
                # not be prepared, so the entry is skipped.
                continue
            yield WeixinRequest(
                url=url, callback=self.parse_detail, allow_redirects=True)
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            referer = self._previous_page_referer(url)
            yield WeixinRequest(
                url=url, callback=self.parse_index, need_proxy=True,
                referer=referer, headers=self.headers)

    def parse_detail(self, response):
        """Parse an article page and yield a single dict with its fields."""
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text().strip(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#publish_time').text(),
            'nickname': doc('.profile_nickname').text(),
            'wechat': doc('div.profile_inner > p:nth-child(3) > span').text()
        }
        yield data

    def error(self, request):
        """Record a failure and re-queue the request while under the limit.

        The request is queued again only while its ``fail_time`` counter is
        below ``MAX_FAILED_TIME``.
        """
        request.fail_time += 1
        print('Request Failed', request.fail_time, 'times', request.url)
        if request.fail_time < MAX_FAILED_TIME:
            self.queue.add(request)

    def run(self):
        """Entry point: queue the start page, then process the queue."""
        self.start()
        self.schedule()