Exemple #1
0
class Sender(threading.Thread):
    """Worker thread that posts queued energy data to the HTTP API.

    Messages are read from two ``RedisQueue`` instances ('retry' first, then
    'normal'), sent in batches to ``<api_url>/v2/energy`` and, when delivery
    fails for a transient reason, pushed back onto the retry queue.  Progress
    and errors are reported as dicts on ``status_queue``.
    """

    # Upper bound on the number of messages sent in one POST request.
    BATCH_SIZE = 30
    # Seconds to wait for the API before treating the call as failed; without
    # a timeout a stalled connection would block the thread indefinitely.
    REQUEST_TIMEOUT = 10
    # Pause between polls while connected and both queues are empty.
    IDLE_POLL_SECONDS = 1
    # Pause after every connect attempt / delivered batch.
    CYCLE_PAUSE_SECONDS = 5

    def __init__(self, status_queue, stop_event, config):
        """Create the sender.

        Args:
            status_queue: queue-like object; status dicts are ``put`` on it.
            stop_event: event object (``is_set``/``set``/``wait``) used to
                request and signal termination of the thread.
            config: mapping with the keys ``"api_url"``, ``"key"`` and
                ``"console_mode"``.
        """
        super(Sender, self).__init__()

        self.normal_data_queue = RedisQueue('normal')
        self.retry_data_queue = RedisQueue('retry')
        self.status_queue = status_queue
        self.stop_event = stop_event

        self.base_url = config["api_url"]
        self.key = config["key"]
        self.store_energy_url = self.base_url + "/v2/energy"
        self.backup_file = "backup"
        # Only the exact string "true" enables console mode.
        self.console_mode = config["console_mode"] == "true"

        self.connected = False

    def run(self):
        """Main loop: connect, deliver one batch per cycle, until stopped."""
        self.send_message_to_listeners(Status.RUNNING,
                                       description="Sender has been started")

        while not self.stop_event.is_set():
            if not self.connected:
                self.connect_to_api()

            # The stop event is checked here as well; otherwise an idle,
            # connected sender would poll forever and could never be stopped.
            while self.connected and not self.stop_event.is_set():
                # Retry data takes precedence over fresh data.
                batch = (self.read_messages_from_retry_queue()
                         or self.read_messages_from_normal_queue())

                if batch:
                    self.send_data_to_api(batch)
                    break

                # wait() sleeps like time.sleep() but returns early on stop.
                self.stop_event.wait(self.IDLE_POLL_SECONDS)

            self.stop_event.wait(self.CYCLE_PAUSE_SECONDS)

        self.send_message_to_listeners(
            Status.STOPPED, description="Sender has been terminated")

    def _read_batch(self, source_queue):
        """Pop and JSON-decode at most ``BATCH_SIZE`` messages from a queue."""
        batch = []

        while len(batch) < self.BATCH_SIZE and not source_queue.empty():
            raw_message = source_queue.get()
            batch.append(json.loads(raw_message.decode('utf-8')))

        return batch

    def read_messages_from_retry_queue(self):
        """Return the next batch (possibly empty) of previously failed data."""
        return self._read_batch(self.retry_data_queue)

    def read_messages_from_normal_queue(self):
        """Return the next batch (possibly empty) of fresh data."""
        return self._read_batch(self.normal_data_queue)

    def connect_to_api(self):
        """Probe ``base_url`` and update ``self.connected`` accordingly.

        ``connected`` becomes True only for an OK response.  An unreachable
        or timed-out server is reported to the listeners.
        """
        try:
            response = requests.get(self.base_url,
                                    timeout=self.REQUEST_TIMEOUT)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            self.connected = False
            self.send_message_to_listeners(Status.RUNNING,
                                           Error.SERVER_UNREACHABLE,
                                           "Could not connect to the server")
            return

        self.connected = response.status_code == requests.codes.ok

        if self.connected:
            self.send_message_to_listeners(
                Status.RUNNING,
                description="Connected to server running on {}".format(
                    self.base_url))

    def send_data_to_api(self, messages):
        """POST one batch of messages to the energy endpoint.

        Outcomes:
            * 201 Created: done (reported only in console mode).
            * 401 Unauthorized: listeners are told, the stop event is set and
              the batch is put back on the retry queue.
            * Connection error / timeout: marked as disconnected and the
              batch is put back on the retry queue.
            * Any other status: reported; the batch is re-queued only for
              5xx answers, so a payload the server rejects is not retried
              forever.

        Args:
            messages: list of JSON-serialisable message objects.
        """
        headers = {
            'Content-type': 'application/json',
            'Accept': 'application/json'
        }
        payload = json.dumps({'data': messages, "rpi_key": self.key})

        try:
            response = requests.post(self.store_energy_url,
                                     data=payload,
                                     headers=headers,
                                     timeout=self.REQUEST_TIMEOUT)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            self.send_message_to_listeners(Status.RUNNING,
                                           Error.SERVER_UNREACHABLE,
                                           "Could not reach the server")
            self.connected = False
            self._requeue(messages)
            return

        status_code = response.status_code

        if status_code == requests.codes.created:
            if self.console_mode:
                self.send_message_to_listeners(
                    Status.RUNNING,
                    description="Succesfully stored energy data")
            return

        if status_code == requests.codes.unauthorized:
            self.send_message_to_listeners(
                Status.STOPPED, Error.UNAUTHORIZED,
                "Could not authorize with given key")
            self.stop_event.set()
            # Keep the data so it can be delivered once the key is fixed.
            self._requeue(messages)
            return

        # Previously such answers were dropped without any notification.
        self.send_message_to_listeners(
            Status.RUNNING,
            description="Server answered with unexpected status {}".format(
                status_code))
        if status_code >= 500:
            self._requeue(messages)

    def _requeue(self, messages):
        """Put every message back on the retry queue as a JSON string."""
        for message in messages:
            self.retry_data_queue.put(json.dumps(message))

    def send_message_to_listeners(self, status, error=None, description=None):
        """Publish a status dict for this thread on ``status_queue``.

        Args:
            status: value stored under ``"status"``.
            error: optional value stored under ``"error"``.
            description: optional text stored under ``"description"``.
        """
        message = {"thread": Thread.SENDER, "status": status}

        if error is not None:
            message["error"] = error

        # The original tested ``message`` here, which is never None.
        if description is not None:
            message["description"] = description

        self.status_queue.put(message)
Exemple #2
0
class Spider(object):
    """Crawler for article search results on weixin.sogou.com.

    Requests are kept in a ``RedisQueue``; each request carries a callback
    that parses its response and yields either follow-up ``WeixinRequest``
    objects (queued again) or article dicts (inserted into the ``articles``
    table through ``Mysql``).
    """

    # How many times get_proxy() asks the proxy service before giving up.
    PROXY_ATTEMPTS = 5
    # Seconds to wait for the proxy service to answer.
    PROXY_TIMEOUT = 10

    def __init__(self):
        """Set up the search URL, request headers, session, queue and DB."""
        self.base_url = "https://weixin.sogou.com/weixin"
        self.keyword = KEY
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;'
            'q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Cookie':
            COOKIES,
            'Host':
            'weixin.sogou.com',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/73.0.3683.86 Safari/537.36',
        }
        self.session = Session()
        self.queue = RedisQueue()
        self.mysql = Mysql()

    def start(self):
        """Queue the first search-result page for the configured keyword."""
        # Updating the session headers here (self.session.headers.update)
        # makes later requests answer with a 302, so the headers are attached
        # to the individual WeixinRequest instead.
        param = {'query': self.keyword, 'type': 2}
        start_url = self.base_url + '?' + urlencode(param)
        weixin_request = WeixinRequest(url=start_url,
                                       headers=self.headers,
                                       callback=self.parse_index,
                                       need_proxy=True)
        self.queue.add(weixin_request)

    def schedule(self):
        """Process queued requests until the queue is empty.

        A request counts as failed (see ``error``) when it gets no valid
        response or when its callback yields nothing.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print("SChedule", weixin_request.url)
            response = self.request(weixin_request)

            if not (response and response.status_code in VALID_STATUS):
                self.error(weixin_request)
                continue

            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue

            for result in results:
                print('New Request', result)
                if isinstance(result, WeixinRequest):
                    self.queue.add(result)
                elif isinstance(result, dict):
                    self.mysql.insert('articles', result)

    def request(self, request):
        """Send ``request`` through the session, via a proxy when available.

        Args:
            request: a ``WeixinRequest``; its ``need_proxy``, ``timeout`` and
                ``allow_redirects`` attributes are honoured.

        Returns:
            The response, or ``False`` on a connection error / read timeout.
        """
        time.sleep(1)  # fixed delay before every request
        try:
            send_options = {
                'timeout': request.timeout,
                'allow_redirects': request.allow_redirects,
            }
            if request.need_proxy:
                proxy = self.get_proxy()
                # Without a proxy the request is sent directly.
                if proxy:
                    send_options['proxies'] = {
                        'http': 'http://' + str(proxy),
                        'https': 'https://' + str(proxy)
                    }
            return self.session.send(request.prepare(), **send_options)
        except (ConnectionError, ReadTimeout) as e:
            print(e)
            return False

    def get_proxy(self):
        """Fetch a proxy address from ``PROXY_URL``.

        Returns:
            The response text of the first 200 answer, or ``False`` when all
            ``PROXY_ATTEMPTS`` attempts fail.
        """
        for _ in range(self.PROXY_ATTEMPTS):
            try:
                response = requests.get(PROXY_URL,
                                        timeout=self.PROXY_TIMEOUT)
            except requests.exceptions.RequestException as e:
                # A failing attempt must not abort the remaining retries.
                print(e)
                continue
            if response.status_code == 200:
                return response.text
        return False

    @staticmethod
    def _previous_page_referer(url):
        """Return ``url`` with its ``page=N`` rewritten to ``page=N-1``.

        Returns ``None`` when ``url`` carries no ``page=<digits>`` parameter.
        """
        match = re.search(r'page=(\d+)', url)
        if match is None:
            return None
        previous_page = int(match.group(1)) - 1
        return re.sub(r'page=\d+', 'page=' + str(previous_page), url)

    def parse_index(self, response):
        """Parse a result page.

        Yields one detail request per ``h3 a`` link that has a
        ``data-share`` URL, followed by a request for the next result page
        when a ``#sogou_next`` link is present.
        """
        doc = pq(response.text)
        for item in doc('h3 a').items():
            url = item.attr('data-share')
            if not url:
                # An anchor without a share URL cannot be requested.
                continue
            yield WeixinRequest(url=url,
                                callback=self.parse_detail,
                                allow_redirects=True)

        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            request_options = {
                'url': url,
                'callback': self.parse_index,
                'need_proxy': True,
                'headers': self.headers,
            }
            # The referer is the page before the one being requested.
            referer = self._previous_page_referer(url)
            if referer is not None:
                request_options['referer'] = referer
            yield WeixinRequest(**request_options)

    def parse_detail(self, response):
        """Parse an article page and yield its fields as a single dict."""
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text().strip(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#publish_time').text(),
            'nickname': doc('.profile_nickname').text(),
            'wechat': doc('div.profile_inner > p:nth-child(3) > span').text()
        }
        yield data

    def error(self, request):
        """Count a failure and re-queue ``request`` while under the limit."""
        request.fail_time += 1
        print('Request Failed', request.fail_time, 'times', request.url)
        if request.fail_time < MAX_FAILED_TIME:
            self.queue.add(request)

    def run(self):
        """Entry point: queue the start page, then process the queue."""
        self.start()
        self.schedule()