Esempio n. 1
0
class Spider():
    """Crawl Sogou's WeChat article search for ``keyword`` and store articles.

    Requests are popped from ``queue``, sent through ``session``, and handed
    to the callback stored on each request. Callbacks yield either follow-up
    ``WeixinRequest`` objects (queued again) or ``dict`` records (inserted
    into the ``articles`` table through ``mysql``).
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # NOTE(review): the Cookie value is a hard-coded capture of a browser
    # session; presumably it expires and should come from configuration.
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Cookie':
        'SUV=00670A2227B2ABC254DA3D7FD4C0C627; ssuid=8733545508; SUID=07C4CF8C771C900A54BC59A80005FA70; CXID=A448FA79C6A209243C989A4E8B73B7D8; ad=CWjGyZllll2bUpBolllllVsBwh9lllllhX0$pZllll9lllll4h7ll5@@@@@@@@@@; sw_uuid=2614289524; sg_uuid=828389127; pex=C864C03270DED3DD8A06887A372DA219231FFAC25A9D64AE09E82AED12E416AC; IPLOC=CN3100; ABTEST=1|1542847064|v1; SNUID=5EB02193595C226F4A6FCFB45A39C0AD; weixinIndexVisited=1; JSESSIONID=aaaH6l1zlTIDSfIvo71Bw; ppinf=5|1543052130|1544261730|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo2OmZhbmZhbnxjcnQ6MTA6MTU0MzA1MjEzMHxyZWZuaWNrOjY6ZmFuZmFufHVzZXJpZDo0NDpvOXQybHVFbWMtN0pFVEVBcG9fVExPZFRWZ25JQHdlaXhpbi5zb2h1LmNvbXw; pprdig=KsCn3kz7vFkBhBco6qspL2OfbKIVQayZoSqYAcBkWHZ2q28BR-A6z5u_JvcFoZCcdUsBpYHyVjPKLmk78v97jTSiiC4eT8Kgwy5F8p1Cv-KjkzWwYJ2iCIPX1jTrizrKOE5k6me2ap-WYBYIGSx5Xb0pkU7TlfgtWCrvk3mSNak; sgid=12-38089007-AVv5G2Kr8FBOCPPiccG54jibQ; ppmdig=1543052130000000812df3126ffd3a9ba45c73998d7660ec',
        'Host':
        'weixin.sogou.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()
    # Seconds to wait for the proxy pool before giving up on a proxy.
    proxy_timeout = 5

    def __init__(self):
        # Not read by any method of this class (its only use in
        # parse_detail was disabled).
        self.id = 1

    def get_proxy(self):
        """Ask the proxy pool service for one proxy address.

        :return: the whitespace-stripped response body on HTTP 200, or
            ``None`` when the pool cannot be reached, does not answer within
            ``proxy_timeout`` seconds, answers with another status, or
            returns an empty body.
        """
        try:
            # A timeout keeps a hung proxy pool from stalling the scheduler.
            response = requests.get(PROXY_POOL_URL,
                                    timeout=self.proxy_timeout)
        except (requests.ConnectionError, requests.Timeout):
            return None
        if response.status_code != 200:
            return None
        proxy = response.text.strip()
        print('Get Proxy', proxy)
        return proxy or None

    def start(self):
        """Install the default headers on the session and queue the first
        search-results request for ``keyword``."""
        self.session.headers.update(self.headers)
        query = urlencode({
            'query': self.keyword,
            'type': 2
        })
        start_url = self.base_url + '?' + query
        self.queue.add(WeixinRequest(url=start_url,
                                     callback=self.parse_index,
                                     need_proxy=True))

    def schedule(self):
        """Drain the queue: send each request, run its callback on the
        response, and route every yielded result.

        A request is passed to :meth:`error` when no usable response came
        back, when the status is neither in ``VALID_STATUSES`` nor 301, or
        when the callback yielded nothing.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            print(response)
            if not response or \
                    response.status_code not in VALID_STATUSES + [301]:
                self.error(weixin_request)
                continue
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue
            for result in results:
                print('New Result', type(result))
                if isinstance(result, WeixinRequest):
                    # Follow-up page: back into the queue.
                    self.queue.add(result)
                elif isinstance(result, dict):
                    # Parsed article: persist it.
                    print(result)
                    self.mysql.insert('articles', result)

    def request(self, weixin_request):
        """Send ``weixin_request`` through the shared session.

        When the request asks for a proxy and the pool supplies one, the
        request goes through that proxy without following redirects;
        otherwise it is sent directly and redirects are followed.

        NOTE(review): the request is prepared with its own ``prepare()``
        rather than ``Session.prepare_request()``; per the requests docs only
        the latter merges session-level headers, so the headers installed in
        ``start()`` may not actually be sent -- confirm before relying on it.

        :param weixin_request: the request to execute
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=True)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Count a failure on ``weixin_request`` and re-queue it while it has
        failed fewer than ``MAX_FAILED_TIME`` times."""
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a search-results page.

        :param response: response of a results page
        :return: generator of ``WeixinRequest`` objects -- one per article
            link, followed by one for the next results page when present
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            if not url:
                # An anchor without href cannot be turned into a request.
                continue
            yield WeixinRequest(url=url, callback=self.parse_detail)
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            # The href is appended to base_url as-is.
            yield WeixinRequest(url=self.base_url + str(next_href),
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """Parse an article page.

        :param response: response of an article page
        :return: generator yielding a single dict with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        yield {
            'title':
            doc('.rich_media_title').text(),
            'content':
            doc('.rich_media_content').text(),
            'date':
            doc('#post-date').text(),
            'nickname':
            doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }

    def run(self):
        """Entry point: seed the queue, then process it until empty."""
        self.start()
        self.schedule()
Esempio n. 2
0
class Spider():
    """Crawler for Sogou's WeChat article search.

    Work items live in ``queue``. Each one is sent through ``session`` and
    its response is given to the callback attached to the request; callbacks
    yield new ``WeixinRequest`` objects (queued) or ``dict`` records (stored
    in the ``articles`` table via ``mysql``).
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # NOTE(review): the Cookie value is a hard-coded capture of a browser
    # session; presumably it expires and should come from configuration.
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Cookie':
        'IPLOC=CN1100; SUID=6FEDCF3C541C940A000000005968CF55; SUV=1500041046435211; ABTEST=0|1500041048|v1; SNUID=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; weixinIndexVisited=1; JSESSIONID=aaar_m7LEIW-jg_gikPZv; ld=Wkllllllll2BzGMVlllllVOo8cUlllll5G@HbZllll9lllllRklll5@@@@@@@@@@; LSTMV=212%2C350; LCLKINT=4650; ppinf=5|1500042908|1501252508|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8Y3J0OjEwOjE1MDAwNDI5MDh8cmVmbmljazo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8dXNlcmlkOjQ0Om85dDJsdUJfZWVYOGRqSjRKN0xhNlBta0RJODRAd2VpeGluLnNvaHUuY29tfA; pprdig=ppyIobo4mP_ZElYXXmRTeo2q9iFgeoQ87PshihQfB2nvgsCz4FdOf-kirUuntLHKTQbgRuXdwQWT6qW-CY_ax5VDgDEdeZR7I2eIDprve43ou5ZvR0tDBlqrPNJvC0yGhQ2dZI3RqOQ3y1VialHsFnmTiHTv7TWxjliTSZJI_Bc; sgid=27-27790591-AVlo1pzPiad6EVQdGDbmwnvM; PHPSESSID=mkp3erf0uqe9ugjg8os7v1e957; SUIR=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; sct=11; ppmdig=1500046378000000b7527c423df68abb627d67a0666fdcee; successCount=1|Fri, 14 Jul 2017 15:38:07 GMT',
        'Host':
        'weixin.sogou.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()
    # Seconds to wait for the proxy pool before giving up on a proxy.
    proxy_timeout = 5

    def get_proxy(self):
        """Fetch a proxy address from the proxy pool.

        :return: the whitespace-stripped response body on HTTP 200, or
            ``None`` when the pool cannot be reached, does not answer within
            ``proxy_timeout`` seconds, answers with another status, or
            returns an empty body.
        """
        try:
            # A timeout keeps a hung proxy pool from stalling the scheduler.
            response = requests.get(PROXY_POOL_URL,
                                    timeout=self.proxy_timeout)
        except (requests.ConnectionError, requests.Timeout):
            return None
        if response.status_code != 200:
            return None
        proxy = response.text.strip()
        print('Get Proxy', proxy)
        return proxy or None

    def start(self):
        """Initialise the crawl: install the default headers on the session
        and schedule the first search-results request."""
        self.session.headers.update(self.headers)
        query = urlencode({
            'query': self.keyword,
            'type': 2
        })
        start_url = self.base_url + '?' + query
        self.queue.add(WeixinRequest(url=start_url,
                                     callback=self.parse_index,
                                     need_proxy=True))

    def parse_index(self, response):
        """Parse a search-results (index) page.

        :param response: response of an index page
        :return: generator of ``WeixinRequest`` objects -- one per article
            link, followed by one for the next index page when present
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            if not url:
                # An anchor without href cannot be turned into a request.
                continue
            yield WeixinRequest(url=url, callback=self.parse_detail)
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            # The href is appended to base_url as-is.
            yield WeixinRequest(url=self.base_url + str(next_href),
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """Parse an article (detail) page.

        :param response: response of an article page
        :return: generator yielding a single dict with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        yield {
            'title':
            doc('.rich_media_title').text(),
            'content':
            doc('.rich_media_content').text(),
            'date':
            doc('#post-date').text(),
            'nickname':
            doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }

    def request(self, weixin_request):
        """Execute a request through the shared session.

        Redirects are never followed. A proxy from the pool is used when the
        request asks for one and the pool supplies one; otherwise the request
        is sent directly.

        NOTE(review): the request is prepared with its own ``prepare()``
        rather than ``Session.prepare_request()``; per the requests docs only
        the latter merges session-level headers, so the headers installed in
        ``start()`` may not actually be sent -- confirm before relying on it.

        :param weixin_request: the request to execute
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Handle a failed request: bump its failure counter and re-queue it
        while the counter is below ``MAX_FAILED_TIME``.

        :param weixin_request: the request that failed
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Process queued requests until the queue is empty.

        A request is passed to :meth:`error` when no usable response came
        back, when the status is not in ``VALID_STATUSES``, or when the
        callback yielded nothing.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if not response or response.status_code not in VALID_STATUSES:
                self.error(weixin_request)
                continue
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue
            for result in results:
                print('New Result', type(result))
                if isinstance(result, WeixinRequest):
                    # Follow-up page: back into the queue.
                    self.queue.add(result)
                elif isinstance(result, dict):
                    # Parsed article: persist it.
                    self.mysql.insert('articles', result)

    def run(self):
        """Entry point: seed the queue, then process it until empty."""
        self.start()
        self.schedule()
Esempio n. 3
0
class Spider():
    """Queue-driven crawler for Sogou's WeChat article search.

    ``start`` seeds ``queue`` with the first results page; ``schedule`` then
    pops requests, sends them through ``session`` and dispatches each
    response to the request's callback. Yielded ``WeixinRequest`` objects are
    queued; yielded ``dict`` records go to the ``articles`` table via
    ``mysql``.
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # NOTE(review): the Cookie value is a hard-coded capture of a browser
    # session; presumably it expires and should come from configuration.
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie':
        'ABTEST=8|1537520547|v1; SNUID=D071F262080D7E55EE03CCD708FECF45; IPLOC=CN3100; SUID=D879F5652930990A000000005BA4B3A3; SUID=D879F5652113940A000000005BA4B3A3; JSESSIONID=aaa45tYJMfZ24hiEC3Bvw; SUV=002D1DEC65F579D85BA4B3A4AC47E291',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()
    # Seconds to wait for the proxy pool before giving up on a proxy.
    proxy_timeout = 5

    def get_proxy(self):
        """Fetch a proxy address from the proxy pool.

        :return: the whitespace-stripped response body on HTTP 200, or
            ``None`` when the pool cannot be reached, does not answer within
            ``proxy_timeout`` seconds, answers with another status, or
            returns an empty body.
        """
        try:
            # A timeout keeps a hung proxy pool from stalling the scheduler.
            response = requests.get(PROXY_POOL_URL,
                                    timeout=self.proxy_timeout)
        except (requests.ConnectionError, requests.Timeout):
            return None
        if response.status_code != 200:
            return None
        proxy = response.text.strip()
        print('Get Proxy', proxy)
        return proxy or None

    def start(self):
        """Initialise the crawl and queue the first request."""
        # Install the default headers on the shared session.
        self.session.headers.update(self.headers)
        # Build the query string for the first results page.
        query = parse.urlencode({
            'query': self.keyword,
            'type': 2
        })
        start_url = self.base_url + '?' + query
        # Wrap it in a request object and hand it to the queue.
        self.queue.add(WeixinRequest(url=start_url,
                                     callback=self.parse_index,
                                     need_proxy=True))

    def parse_index(self, response):
        """Parse a search-results (index) page.

        :param response: response of an index page
        :return: generator of ``WeixinRequest`` objects -- one per article
            link, followed by one for the next index page when present
        """
        doc = pq(response.text)
        # One request per article link found on the page.
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            if not url:
                # An anchor without href cannot be turned into a request.
                continue
            yield WeixinRequest(url=url, callback=self.parse_detail)
        # Link to the next index page, if the page has one.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            # The href is appended to base_url as-is.
            yield WeixinRequest(url=self.base_url + str(next_href),
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """Parse an article (detail) page with PyQuery.

        :param response: response of an article page
        :return: generator yielding a single dict with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        data = {
            'title':
            doc('.rich_media_title').text(),
            'content':
            doc('.rich_media_content').text(),
            'date':
            doc('#post-date').text(),
            'nickname':
            doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        print(data)
        yield data

    def request(self, weixin_request):
        """Execute a request through the shared session.

        Redirects are never followed. A proxy from the pool is used when the
        request asks for one and the pool supplies one; otherwise the request
        is sent directly.

        NOTE(review): the request is prepared with its own ``prepare()``
        rather than ``Session.prepare_request()``; per the requests docs only
        the latter merges session-level headers, so the headers installed in
        ``start()`` may not actually be sent -- confirm before relying on it.

        :param weixin_request: the request to execute
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            if weixin_request.need_proxy:
                # Ask the pool for a proxy; fall through when none is given.
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Handle a failed request: bump its failure counter and re-queue it
        while the counter is below ``MAX_FAILED_TIME``.

        :param weixin_request: the request that failed
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Process queued requests until the queue is empty.

        A request is passed to :meth:`error` when no usable response came
        back, when the status is not in ``VALID_STATUSES``, or when the
        callback yielded nothing.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            # Callback that knows how to parse this request's response.
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if not response or response.status_code not in VALID_STATUSES:
                self.error(weixin_request)
                continue
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue
            for result in results:
                print('New Result', type(result))
                if isinstance(result, WeixinRequest):
                    # A new request: add it to the queue.
                    self.queue.add(result)
                elif isinstance(result, dict):
                    # A parsed record: store it in MySQL.
                    self.mysql.insert('articles', result)

    def run(self):
        """Entry point: seed the queue, then process it until empty."""
        self.start()
        self.schedule()
Esempio n. 4
0
class Spider():
    """Queue-driven crawler for Sogou's WeChat article search.

    ``start`` seeds ``queue`` with the first results page; ``schedule`` pops
    requests, sends them through ``session`` and passes accepted responses to
    ``handleResponse200``, which runs the request's callback and routes what
    it yields (``WeixinRequest`` -> queue, ``dict`` -> ``articles`` table).
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # NOTE(review): the Cookie value is a hard-coded capture of a browser
    # session; presumably it expires and should come from configuration.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'ABTEST=0|1553070123|v1; SNUID=344B814B7773F2F4278902CD77E51845; IPLOC=CN1100; SUID=423DF73C4631990A000000005C91F82B; SUID=266252D25F20940A000000005C91F82B; JSESSIONID=aaaIO6MIIk4aC_XhZM-Lw; SUV=002F49023CF73D425C91F82BA9EB8957',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()
    # Seconds to wait for the proxy pool before giving up on a proxy.
    proxy_timeout = 5

    def update_cookie(self):
        """Replace the ``Cookie`` header with the value from ``get_cookie()``.

        This writes into the class-level ``headers`` dict, so the change is
        visible to every instance. It does not touch ``session.headers``;
        call it before :meth:`start` copies the headers into the session.
        """
        self.headers['Cookie'] = get_cookie()

    def get_proxy(self):
        """Fetch a proxy address from the proxy pool.

        :return: the whitespace-stripped response body on HTTP 200, or
            ``None`` when the pool cannot be reached, does not answer within
            ``proxy_timeout`` seconds, answers with another status, or
            returns an empty body.
        """
        try:
            # A timeout keeps a hung proxy pool from stalling the scheduler.
            response = requests.get(PROXY_POOL_URL,
                                    timeout=self.proxy_timeout)
        except (requests.ConnectionError, requests.Timeout):
            return None
        if response.status_code != 200:
            return None
        proxy = response.text.strip()
        print('Get Proxy', proxy)
        return proxy or None

    def start(self):
        """Initialise the crawl: install the default headers on the session
        and schedule the first search-results request."""
        self.session.headers.update(self.headers)
        params = {
            'query': self.keyword,
            'type': 2,
            'sut': 7956,
            # Stored decoded: urlencode() percent-encodes the commas itself.
            # Passing the already-encoded '1%2C...' form would double-encode
            # it to '%252C'.
            'lkt': '1,1553052272863,1553052272863',
            's_from': 'input',
            '_sug_': 'y',
            'sst0': '1553052272967',
            'ie': 'utf8',
            'w': '01019900',
            'dr': '1',
        }
        start_url = self.base_url + '?' + urlencode(params)
        # Schedule the first request.
        self.queue.add(WeixinRequest(url=start_url,
                                     callback=self.parse_index,
                                     need_proxy=True))

    def parse_index(self, response):
        """Parse a search-results (index) page.

        :param response: response of an index page
        :return: generator of ``WeixinRequest`` objects -- one per article
            link, followed by one for the next index page when present
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            if not url:
                # An anchor without href cannot be turned into a request.
                continue
            yield WeixinRequest(url=url, callback=self.parse_detail)
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            # The href is appended to base_url as-is.
            yield WeixinRequest(url=self.base_url + str(next_href),
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """Parse an article (detail) page.

        :param response: response of an article page
        :return: generator yielding a single dict with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        yield {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }

    def request(self, weixin_request):
        """Execute a request through the shared session.

        Redirects are never followed. A proxy from the pool is used when the
        request asks for one and the pool supplies one; otherwise the request
        is sent directly.

        NOTE(review): the request is prepared with its own ``prepare()``
        rather than ``Session.prepare_request()``; per the requests docs only
        the latter merges session-level headers, so the headers installed in
        ``start()`` may not actually be sent -- confirm before relying on it.

        :param weixin_request: the request to execute
        :return: the response, or ``None`` on a connection error or read
            timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return None

    def error(self, weixin_request):
        """Handle a failed request: bump its failure counter and re-queue it
        while the counter is below ``MAX_FAILED_TIME``.

        :param weixin_request: the request that failed
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Process queued requests until the queue is empty.

        Responses whose status is in ``VALID_STATUSES`` are handed to
        :meth:`handleResponse200`; everything else (including redirects,
        which are not followed) is treated as a failure.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                self.handleResponse200(response, weixin_request)
            else:
                self.error(weixin_request)

    def handleResponse200(self, response, weixin_request):
        """Run the request's callback on ``response`` and route its results.

        Yielded ``WeixinRequest`` objects are queued and yielded dicts are
        inserted into the ``articles`` table. When the callback yields
        nothing the request is treated as failed.

        :param response: the accepted response
        :param weixin_request: the request that produced ``response``
        """
        results = list(weixin_request.callback(response))
        if not results:
            self.error(weixin_request)
            return
        for result in results:
            print('New Result', type(result))
            if isinstance(result, WeixinRequest):
                self.queue.add(result)
            elif isinstance(result, dict):
                self.mysql.insert('articles', result)

    def run(self):
        """Entry point: seed the queue, then process it until empty."""
        self.start()
        self.schedule()
Esempio n. 5
0
class Spider():
    """Queue-driven crawler for Sogou's WeChat article search.

    ``session`` executes requests, ``queue`` holds pending requests and
    ``mysql`` stores parsed articles. Each request carries the callback that
    parses its response; callbacks yield follow-up ``WeixinRequest`` objects
    (queued) or ``dict`` records (inserted into the ``articles`` table).
    """
    # No trailing '?': start() and parse_index() add the query separator
    # themselves. The host matches the 'Host' and 'Referer' headers below.
    base_url = 'https://weixin.sogou.com/weixin'
    keyword = KEYWORD
    # NOTE(review): the Cookie value is a hard-coded capture of a browser
    # session; presumably it expires and should come from configuration.
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate, br',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection':
        'keep-alive',
        'Cookie':
        'CXID=68999D20535A955E54EEB369EEBDAA87; SUID=7D0481DF3565860A5B922DAB00041476; '
        'SUV=00724ADFDF81047D5B9390FE3CE03520; ad=Ukllllllll2b6ALrlllllVmUX@1lllllTc99Kyllll'
        '9llllljylll5@@@@@@@@@@; IPLOC=CN5101; ABTEST=0|1536564030|v1; weixinIndexVisited=1; '
        'SNUID=6AEE6B35EBEE9D9F5957A098EBEC0DF0; sct=1; JSESSIONID=aaaqsTn37HldSeg_akWyw; '
        'ppinf=5|1538793682|1540003282|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo0NTol'
        'RTYlODUlQTIlRTYlODUlQTIlRTYlODUlQTIlRTYlOTclQjYlRTUlODUlODl8Y3J0OjEwOjE1Mzg3OTM2ODJ8cmVm'
        'bmljazo0NTolRTYlODUlQTIlRTYlODUlQTIlRTYlODUlQTIlRTYlOTclQjYlRTUlODUlODl8dXNlcmlkOjQ0Om85d'
        'DJsdURabHBHRjJ1TF9vbGtrV01MbTlHWFFAd2VpeGluLnNvaHUuY29tfA; pprdig=YXVgbs0p9dU4aBgDw7V_id'
        'ljKjCcGiXgeUpafLd_FO65GO0AMS3VWq_ogoKBR7XpAChV9r3DxwwMN_lwgpTwjbT4al7JXyKKOua-q3IoMvfo2KwI1'
        'sXoNQKlyuxomXov9kuvMJkAHq4x6HCYOtsNhkW92H_acgTIeDo65hnDIbc; sgid=15-37413245-AVu4ININKITuO'
        '1IBrovHceA; ppmdig=153880606700000019649cd69fcbff1cb91d0c6884906b6b; LSTMV=469%2C259; LCLKINT=5007',
        'Host':
        'weixin.sogou.com',
        'Upgrade-Insecure-Requests':
        '1',
        # Sent to get past hot-link protection: the server checks whether
        # the Referer points at its own site.
        'Referer':
        'https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page=17&ie=utf8',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Shared collaborators: request execution, request queue, storage.
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()
    # Seconds to wait for the proxy pool before giving up on a proxy.
    proxy_timeout = 5

    def get_proxy(self):
        """Fetch a proxy address from the proxy pool.

        :return: the whitespace-stripped response body on HTTP 200, or
            ``None`` when the pool cannot be reached, does not answer within
            ``proxy_timeout`` seconds, answers with another status, or
            returns an empty body.
        """
        try:
            # A timeout keeps a hung proxy pool from stalling the scheduler.
            response = requests.get(PROXY_POOL_URL,
                                    timeout=self.proxy_timeout)
        except (requests.ConnectionError, requests.Timeout):
            return None
        if response.status_code != 200:
            return None
        proxy = response.text.strip()
        print('Get Proxy', proxy)
        return proxy or None

    def start(self):
        """Initialise the crawl: install the default headers on the session
        and schedule the first search-results request."""
        self.session.headers.update(self.headers)
        # Build the start URL from the keyword.
        query = urlencode({
            'query': self.keyword,
            'type': 2
        })
        start_url = self.base_url + '?' + query
        # The first request is parsed by parse_index() and asks for a proxy.
        self.queue.add(WeixinRequest(url=start_url,
                                     callback=self.parse_index,
                                     need_proxy=True))

    def parse_index(self, response):
        """Parse a search-results (index) page.

        :param response: response of an index page
        :return: generator of ``WeixinRequest`` objects -- one per article
            link, followed by one for the next index page when present
        """
        doc = pq(response.text)
        # Every article link on this page becomes a detail request.
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            url = item.attr('href')
            if not url:
                # An anchor without href cannot be turned into a request.
                continue
            yield WeixinRequest(url=url, callback=self.parse_detail)
        # Link to the next index page, if the page has one.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            # The href is appended to base_url as-is.
            yield WeixinRequest(url=self.base_url + str(next_href),
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """Parse an article (detail) page.

        :param response: response of an article page
        :return: generator yielding a single dict with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``; the scheduler
            inserts dict results into the database
        """
        doc = pq(response.text)
        yield {
            'title':
            doc('.rich_media_title').text(),
            'content':
            doc('.rich_media_content').text(),
            'date':
            doc('#publish_time').text(),
            'nickname':
            doc('#meta_content > span.rich_media_meta.rich_media_meta_text').
            text(),
            'wechat':
            doc('#profileBt > #js_name').text()
        }

    def request(self, weixin_request):
        """Execute a request through the shared session.

        Redirects are never followed. A proxy from the pool is used when the
        request asks for one and the pool supplies one; in every other case
        (no proxy wanted, or none available) the request is sent directly.

        NOTE(review): the request is prepared with its own ``prepare()``
        rather than ``Session.prepare_request()``; per the requests docs only
        the latter merges session-level headers, so the headers installed in
        ``start()`` may not actually be sent -- confirm before relying on it.

        :param weixin_request: the request to execute
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            # Direct send. This must sit outside the need_proxy branch,
            # otherwise requests that do not want a proxy are never sent.
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Handle a failed request: bump its failure counter and re-queue it
        while the counter is below ``MAX_FAILED_TIME``."""
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Process queued requests until the queue is empty.

        A request is passed to :meth:`error` when no usable response came
        back, when the status is not in ``VALID_STATUS``, or when the
        callback yielded nothing.
        """
        # NOTE(review): empty is called as a method here; this assumes
        # RedisQueue.empty is a method rather than a property -- confirm.
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if not response or response.status_code not in VALID_STATUS:
                self.error(weixin_request)
                continue
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue
            for result in results:
                print('New Result', result)
                if isinstance(result, WeixinRequest):
                    # Follow-up page: back into the queue.
                    self.queue.add(result)
                elif isinstance(result, dict):
                    # Parsed article: persist it.
                    self.mysql.insert('articles', result)

    def run(self):
        """Entry point: seed the queue, then process it until empty."""
        self.start()
        self.schedule()
Esempio n. 6
0
class Spider():
    """
    Queue-driven crawler for WeChat articles found through Sogou's WeChat search.

    ``start()`` queues the first search-result page.  ``schedule()`` then pops
    requests one at a time, executes them with ``request()`` and passes each
    response to the request's callback (``parse_index`` or ``parse_detail``).
    Callbacks yield follow-up requests, which are queued, or article dicts,
    which are inserted into the ``articles`` table.
    """
    # Search endpoint; query strings and "next page" links are appended to it.
    base_url = 'http://weixin.sogou.com/weixin'
    # Search term sent as the ``query`` parameter.
    keyword = 'NBA'
    # Headers (including a hard-coded Cookie) merged into the session by start().
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Cookie':
        'CXID=1B446AEB1D516FD5C765CE37868DCA1D; SUID=8C9E10743565860A5B4C06780003B8C5; SUV=00504AE374109E5F5B4D680A993BF652; ABTEST=0|1531819884|v1; IPLOC=CN4451; SNUID=D71998F3888DF6A614EEF7B3889DAD9A; JSESSIONID=aaahykvTDvD8R-ZaY7Gsw; ad=Iyllllllll2bFlBclllllVH2CFYlllllNxjW0lllll9lllll9ylll5@@@@@@@@@@; weixinIndexVisited=1; sct=1; ppinf=5|1531884187|1533093787|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTowOnxjcnQ6MTA6MTUzMTg4NDE4N3xyZWZuaWNrOjA6fHVzZXJpZDo0NDpvOXQybHVDdzFUdmVWN2tVaTEyRk5WcndlLWV3QHdlaXhpbi5zb2h1LmNvbXw; pprdig=JKKPygKv7H7Co6cneyKbsXQ8QUUG1LwoV5MgwIaMBBFBV2rEuyGbkhtgql1UoHqJz5SyY5mCnPPUWZqtD6pgw--LWoYRTXTqCeyNamhAEW4EVYs5XW_MLh0OcUkXgv7DjiwCqNj3F7bxIpOjyE_RKMiBAP_OHlBre9MtNgwQwOs; sgid=25-36102587-AVtOsptgowqxnSJe555xWxw; ppmdig=153190021100000074f921db30dd23e8562dd7af293cd8fb',
        'Host':
        'weixin.sogou.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:54.0) Gecko/20100101 Firefox/54.0'
    }
    # Class-level collaborators: created once and shared by every instance.
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """
        Ask the proxy pool for a proxy address.

        :return: the body of the ``PROXY_POOL_URL`` response when the pool
            answers with HTTP 200; ``None`` for any other status or when the
            pool cannot be reached
        """
        try:
            response = requests.get(PROXY_POOL_URL)
        except requests.ConnectionError:
            return None
        if response.status_code != 200:
            return None
        print('Get Proxy', response.text)
        return response.text

    def parse_index(self, response):
        """
        Parse a search-result page.

        :param response: response of an index-page request
        :return: generator of requests, one per ``h3 a`` link (handled by
            ``parse_detail``) followed by one for the ``#sogou_next`` link
            (handled by ``parse_index``) when the page has one
        """
        doc = BeautifulSoup(response.text, 'lxml')
        for link in doc.select('h3 a'):
            href = link.get('href')
            # Anchors without a target cannot be crawled; skip them.
            if href:
                yield WeixinRequset(url=href, callback=self.parse_detail)

        # The last result page has no "next" link.  Indexing [0]
        # unconditionally raised IndexError there and aborted the whole crawl.
        next_links = doc.select('#sogou_next')
        next_href = next_links[0].get('href') if next_links else None
        print(next_href)
        if next_href:
            yield WeixinRequset(url=self.base_url + str(next_href),
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """
        Parse an article page.

        :param response: response of a detail-page request
        :return: generator yielding one dict with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``; a field is an
            empty string when its selector matches nothing
        """
        doc = BeautifulSoup(response.text, 'lxml')

        def text_of(selector):
            # Calling a BeautifulSoup object (``doc(selector)``) is a tag-name
            # search returning a list, so the original ``doc(...).text()``
            # raised AttributeError.  Use CSS selection and join the text of
            # every match instead.
            # NOTE(review): the ``:nth-child`` selector below needs a Beautiful
            # Soup version whose select() supports it - confirm.
            return ' '.join(
                node.get_text().strip() for node in doc.select(selector))

        yield {
            'title': text_of('.rich_media_title'),
            'content': text_of('.rich_media_content'),
            'date': text_of('#post-date'),
            'nickname': text_of('#js_profile_qrcode > div > strong'),
            'wechat':
            text_of('#js_profile_qrcode > div > p:nth-child(3) > span'),
        }

    def start(self):
        """
        Prepare the session and queue the first search request.

        :return: None
        """
        # Apply the class-level headers to every request sent by the session.
        self.session.headers.update(self.headers)
        query = urlencode({'query': self.keyword, 'type': 2})
        start_url = self.base_url + '?' + query
        # Seed the queue with the first index page.
        self.queue.add(
            WeixinRequset(url=start_url,
                          callback=self.parse_index,
                          need_proxy=True))

    def request(self, weixin_request):
        """
        Execute a request through the shared session.

        A proxy from ``get_proxy()`` is used when the request asks for one and
        the pool supplies one; otherwise the request is sent directly.
        Redirects are not followed.

        :param weixin_request: request to execute
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            send_kwargs = {
                'timeout': weixin_request.timeout,
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    send_kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy,
                    }
            return self.session.send(weixin_request.prepare(), **send_kwargs)
        # The original caught only the bare ``ConnectionError`` name, so a
        # read timeout propagated out of schedule() and stopped the crawl.
        # The ``requests.`` spellings are listed as well because it is not
        # visible here whether the bare name is the requests exception or the
        # unrelated builtin.
        except (ConnectionError, requests.ConnectionError,
                requests.ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        Count a failed attempt and re-queue the request while retries remain.

        :param weixin_request: the request that failed; its ``fail_time``
            counter is incremented in place
        :return: None
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        Process queued requests until the queue is empty.

        :return: None
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            print(response)

            # request() returns False on network errors, so test truthiness
            # before reading the status code.
            if not (response and response.status_code in VALID_STATUSES):
                self.error(weixin_request)
                continue

            results = list(callback(response))
            if not results:
                # Nothing parsed: count it as a failure so it can be retried.
                self.error(weixin_request)
                continue

            for result in results:
                print('New Result', type(result))
                if isinstance(result, WeixinRequset):
                    self.queue.add(result)
                if isinstance(result, dict):
                    self.mysql.insert('articles', result)

    def run(self):
        """
        Entry point: call ``start()`` and then ``schedule()``.

        :return: None
        """
        self.start()
        self.schedule()
Esempio n. 7
0
class Spider():
    """
    Queue-driven crawler for WeChat articles found through Sogou's WeChat search.

    ``start()`` queues the first search-result page.  ``schedule()`` then pops
    requests one at a time, downloads them and passes each response to the
    request's callback (``parse_index`` or ``parse_detail``).  Callbacks yield
    follow-up requests, which are queued, or article dicts, which are inserted
    into the ``articles`` table.
    """
    # Search endpoint; query strings and "next page" links are appended to it.
    base_url = 'http://weixin.sogou.com/weixin'
    # Search term sent as the ``query`` parameter.
    keyword = '吃鸡'
    # Headers (including a hard-coded Cookie) merged into the session by start().
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language':
        'zh-CN,zh;q=0.9',
        'Connection':
        'keep-alive',
        'Cookie':
        'ABTEST=0|1531205382|v1; IPLOC=CN1100; SUID=4DFF7B7B2930990A000000005B445706; SUID=4DFF7B7B6119940A000000005B445706; weixinIndexVisited=1; SUV=000927B97B7BFF4D5B445710B98DF777; sct=1; SNUID=E250D5D4AEAADE3039217104AF02D5BA; JSESSIONID=aaaYi9nbXUS5thab6Mgrw',
        'Host':
        'weixin.sogou.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
    }
    # Class-level collaborators: created once and shared by every instance.
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """
        Ask the proxy pool for a proxy address.

        :return: the body of the ``PROXY_POOL_URL`` response when the pool
            answers with HTTP 200; ``None`` for any other status or when the
            pool cannot be reached
        """
        try:
            response = requests.get(PROXY_POOL_URL)
        except requests.ConnectionError:
            return None
        if response.status_code != 200:
            return None
        print('Get Proxy', response.text)
        return response.text

    def start(self):
        """
        Prepare the session and queue the first search request.

        :return: None
        """
        # Apply the class-level headers to every request sent by the session.
        self.session.headers.update(self.headers)
        query = urlencode({'query': self.keyword, 'type': 2})
        start_url = self.base_url + '?' + query
        # Seed the queue with the first index page.
        self.queue.add(
            WeixinRequest(url=start_url,
                          callback=self.parse_index,
                          need_proxy=False,
                          timeout=15))

    def parse_index(self, response):
        """
        Parse a search-result page.

        :param response: response of an index-page request
        :return: generator of requests, one per article link (handled by
            ``parse_detail``) followed by one for the ``#sogou_next`` link
            (handled by ``parse_index``) when the page has one
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            yield WeixinRequest(url=item.attr('href'),
                                callback=self.parse_detail)

        next_href = doc('#sogou_next').attr('href')
        if next_href:
            yield WeixinRequest(url=self.base_url + str(next_href),
                                callback=self.parse_index,
                                need_proxy=False)

    def parse_detail(self, response):
        """
        Parse an article page.

        The publication date is taken from the first ``publish_time = "..."``
        assignment found in the page source; when there is none, the text of
        the ``#publish_time`` element is used instead.

        :param response: response of a detail-page request
        :return: generator yielding one dict with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        matches = re.findall('publish_time = "(.*?)"', response.text, re.S)
        date = matches[0] if matches else doc('#publish_time').text()
        # The fallback branch used to only print its dict and never yield it,
        # so the article was lost and schedule() treated the page as a failed
        # request.  Both paths now yield.
        yield {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': date,
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
        }

    def request(self, weixin_request):
        """
        Execute a request through the shared session.

        A proxy from ``get_proxy()`` is used when the request asks for one and
        the pool supplies one; otherwise the request is sent directly.
        Redirects are not followed.

        :param weixin_request: request to execute
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            send_kwargs = {
                'timeout': weixin_request.timeout,
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    # The proxy mapping used to be built and then dropped
                    # (``proxies=proxies`` was commented out), so requests
                    # flagged ``need_proxy`` still went out directly.
                    send_kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy,
                    }
            return self.session.send(weixin_request.prepare(), **send_kwargs)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        Count a failed attempt and re-queue the request while retries remain.

        :param weixin_request: the request that failed; its ``fail_time``
            counter is incremented in place
        :return: None
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        Process queued requests until the queue is empty.

        Pages are downloaded with a plain ``requests.get`` call rather than
        ``request()``, so the shared session (and the headers ``start()`` put
        on it) is not used for the download.

        :return: None
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            try:
                response = requests.get(url=weixin_request.url, timeout=50)
            except (requests.ConnectionError, requests.Timeout) as e:
                # A single unreachable page must not abort the whole crawl;
                # report it and let error() decide whether to retry.
                print(e.args)
                response = False

            if not (response and response.status_code in VALID_STATUSES):
                self.error(weixin_request)
                continue

            results = list(callback(response))
            if not results:
                # Nothing parsed: count it as a failure so it can be retried.
                self.error(weixin_request)
                continue

            for result in results:
                print('New Result', type(result))
                if isinstance(result, WeixinRequest):
                    self.queue.add(result)
                if isinstance(result, dict):
                    self.mysql.insert('articles', result)

    def run(self):
        """
        Entry point: call ``start()`` and then ``schedule()``.

        :return: None
        """
        self.start()
        self.schedule()
Esempio n. 8
0
class Spider(object):
    """
    Queue-driven crawler for WeChat articles found through Sogou's WeChat search.

    ``start()`` queues the first search-result page.  ``schedule()`` then pops
    requests one at a time, executes them with ``request()`` and passes each
    response to the request's callback (``parse_index`` or ``parse_detail``).
    Callbacks yield follow-up requests, which are queued, or article dicts,
    which are inserted into the ``articles`` table.
    """
    # Search endpoint; query strings and "next page" links are appended to it.
    base_url = 'http://weixin.sogou.com/weixin'
    # Search term sent as the ``query`` parameter.
    keyword = 'NBA'
    # Headers (including a hard-coded Cookie) merged into the session by start().
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'SUV=0019178D7139E4CF5A98E24886E85289; SUID=9AF49C733120910A000000005AAA1750; ABTEST=8|1530090681|v'
                  '1;IPLOC=CN4210; weixinIndexVisited=1; SNUID=612CF6BECECAA052EA3A6F50CF791601; JSESSIONID=aaa0_Q-S5EK'
                  '7BcKF4S7qw; ppinf=5|1530098114|1531307714|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTQlQk'
                  'QlOTglRTUlODElQTUlRTUlQTQlQUJ8Y3J0OjEwOjE1MzAwOTgxMTR8cmVmbmljazoyNzolRTQlQkQlOTglRTUlODElQTUlRTUlQT'
                  'QlQUJ8dXNlcmlkOjQ0Om85dDJsdU9JbjdQV1F4UC1wQV90UDhDR2dLdE1Ad2VpeGluLnNvaHUuY29tfA; pprdig=RnSWD7qnomx'
                  'sf-V3yOXjt7Jk9zZwYYiXZKsByre9tciFYGNqAreHjU1paH2_7j9yUJDAxxdJZ4rfTI8EwIRhK_rDckoa0PcwrKB2UzA2ou--Ddl'
                  'KELNDgr-2EOPJ5BdDFBgJh84r7fsZC2SQBEGzB0kqxVboX6ZzSXdGS86hlRE; sgid=23-35736769-AVszccJNic84GSkZY3bek'
                  'BMY; ppmdig=15300981140000009a7459d5a937b96a96b9db16b616914a; sct=3',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/59.0.3071.115 Safari/537.36'
    }
    # Class-level collaborators: created once and shared by every instance.
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """
        Ask the proxy pool for a proxy address.

        :return: the body of the ``PROXY_POOL_URL`` response when the pool
            answers with HTTP 200; ``None`` for any other status or when the
            pool cannot be reached
        """
        try:
            response = requests.get(PROXY_POOL_URL)
        except requests.ConnectionError:
            return None
        if response.status_code != 200:
            return None
        print('Get Proxy', response.text)
        return response.text

    def start(self):
        """
        Prepare the session and queue the first search request.

        :return: None
        """
        # Apply the class-level headers to every request sent by the session.
        self.session.headers.update(self.headers)
        query = urlencode({'query': self.keyword, 'type': 2})
        start_url = self.base_url + '?' + query
        # Seed the queue with the first index page.
        self.queue.add(WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True))

    def parse_index(self, response):
        """
        Parse a search-result page.

        :param response: response of an index-page request
        :return: generator of requests, one per article link (handled by
            ``parse_detail``) followed by one for the ``#sogou_next`` link
            (handled by ``parse_index``) when the page has one
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            yield WeixinRequest(url=item.attr('href'), callback=self.parse_detail)

        next_href = doc('#sogou_next').attr('href')
        if next_href:
            yield WeixinRequest(url=self.base_url + str(next_href), callback=self.parse_index, need_proxy=True)

    def parse_detail(self, response):
        """
        Parse an article page.

        :param response: response of a detail-page request
        :return: generator yielding one dict with the keys ``title``,
            ``content``, ``data``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        yield {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            # The selector was '#post-data', which looks like a typo for the
            # publication-date element '#post-date'.
            # NOTE(review): the key 'data' is probably meant to be 'date' too,
            # but it names a column of the ``articles`` table that is not
            # visible here - confirm against the schema before renaming.
            'data': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }

    def request(self, weixin_request):
        """
        Execute a request through the shared session.

        A proxy from ``get_proxy()`` is used when the request asks for one and
        the pool supplies one; otherwise the request is sent directly.
        Redirects are not followed.

        :param weixin_request: request to execute
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            send_kwargs = {
                'timeout': weixin_request.timeout,
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    send_kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
            return self.session.send(weixin_request.prepare(), **send_kwargs)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        Count a failed attempt and re-queue the request while retries remain.

        :param weixin_request: the request that failed; its ``fail_time``
            counter is incremented in place
        :return: None
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        Process queued requests until the queue is empty.

        :return: None
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)

            # request() returns False on network errors, so test truthiness
            # before reading the status code.
            if not (response and response.status_code in VALID_STATUSES):
                self.error(weixin_request)
                continue

            results = list(callback(response))
            if not results:
                # Nothing parsed: count it as a failure so it can be retried.
                self.error(weixin_request)
                continue

            for result in results:
                print('New Result', type(result))
                if isinstance(result, WeixinRequest):
                    self.queue.add(result)
                if isinstance(result, dict):
                    self.mysql.insert('articles', result)

    def run(self):
        """
        Entry point: call ``start()`` and then ``schedule()``.

        :return: None
        """
        self.start()
        self.schedule()