Code example #1
    def fetch_ori_page(self, page_url):
        """拿到单个文章页面,在文章url里加上参数f=json可以直接得到
        json格式的数据。
        """
        # First resolve the Sogou redirect to the real WeChat article URL
        pre_r = get(page_url, headers=self.headers)
        wechat_url = pre_r.url.split('#')[0] + '&f=json'

        if 'mp.weixin' not in wechat_url:
            return

        r = get(wechat_url, headers=self.headers)
        self.logger.info(wechat_url)
        if self.col.find_one(dict(nick_name=self.name, url=wechat_url)):
            raise DocumentExistsException("article exist")
        if r.status_code != 200:
            return

        o = json.loads(r.text)
        self.col.update(
            {
                '_id': gid(),
                'nick_name': self.name,
                'url': wechat_url,
            }, {'$set': {
                'json': o
            }},
            upsert=True)
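
The f=json trick above can be tried in isolation. A minimal sketch, assuming the requests library; fetch_article_json and its argument handling are illustrative, not the project's API:

    import json

    import requests

    def fetch_article_json(article_url, headers=None):
        # Drop any #fragment, then ask the server for the JSON rendering.
        # Assumes the URL already carries a query string, as the Sogou redirects do.
        json_url = article_url.split('#')[0] + '&f=json'
        r = requests.get(json_url, headers=headers or {})
        r.raise_for_status()
        return json.loads(r.text)
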
Code example #2
File: sg.py  Project: PegasusWang/wechannel
    def search(self, retries=3):
        """搜索搜狗微信公众号并返回公众号文章列表页面,返回列表格式如下
        http://weixin.sogou.com/gzh?openid=oIWsFt2uCBiQ3mWa2BSUtmdKD3gs&ext=p8lVKENENbkGdvuPNCqHoUqzuLEPtZheP6oyzp3YVsY_-OJEvXMz4yk2nJytyUxY
        """
        query_url = 'http://weixin.sogou.com/weixin?type=1&' + urlencode({'query': self.name})
        self.logger.info('query_url: %s', query_url)

        while retries > 0:
            self.logger.info('retry search %s %d' % (self.name, retries))
            html = get(query_url, headers=self.headers).text
            soup = BeautifulSoup(html, 'html.parser')  # explicit parser avoids the bs4 warning
            a_tag_list = soup.find_all(attrs={'uigs': re.compile('account_name')})
            href = None
            try:
                for a_tag in a_tag_list:
                    if a_tag and a_tag.text.lower() == self.name.lower():
                        href = a_tag.get('href')
                        break
            except Exception:
                self.logger.info('search for %s failed' % self.name)
                retries -= 1  # decrement before retrying, otherwise a parse error loops forever
                continue

            if href is not None:
                break
            else:
                self.update_headers()
                time.sleep(random.randint(30, 60))
            retries -= 1

        return href
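
The account link is found by matching the custom uigs attribute against a regex, which BeautifulSoup supports for any attribute. A standalone sketch (the sample HTML is made up):

    import re

    from bs4 import BeautifulSoup

    html = '<a uigs="account_name_0" href="/gzh?openid=demo">demo</a>'
    soup = BeautifulSoup(html, 'html.parser')
    for a_tag in soup.find_all(attrs={'uigs': re.compile('account_name')}):
        print(a_tag.get('href'))  # -> /gzh?openid=demo
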
Code example #3
    def fetch_article_list(self, url, update=False):
        """ 微信号列表页面获取文章列表,返回json格式的数据,请求如下
        http://weixin.sogou.com/gzhjs?openid=oIWsFt0qY9YvyYESHey3MOPfbNy0&ext=lA5I5al3X8CtYOmsUDOgMhZWHWk6xQhEnWXQ_8nrROTPnk351KTH-rcTJUTGDdZq&cb=sogou.weixin_gzhcb&page=3
        """
        query_url = 'http://weixin.sogou.com/gzhjs?cb=sogou.weixin_gzhcb&'
        query_dict = dict(urlparse.parse_qsl(urlparse.urlsplit(url).query))

        while True:
            page = self.page
            if update and page > 2:
                page = 1
            if not update and page > 10:
                break
            self.logger.info('crawling: %s page: %d' % (self.name, page))
            query_dict['page'] = page
            json_url = query_url + urlencode(query_dict)
            json_str = get(json_url, headers=self.headers).text
            try:
                url_list, total_pages = self.parse_list_page(json_str.strip())
            except Exception:
                traceback.print_exc()
                self.update_headers()
                continue

            if not url_list or page > min(10, total_pages):  # without login only 10 pages (100 articles) are available
                self.logger.info('%s crawl finished' % self.name)
                break

            try:
                for page_url in url_list:
                    time.sleep(random.randint(3, 10))  # Sogou rate limit
                    self.logger.info(page_url)
                    self.fetch_page(page_url)
                    # self.fetch_ori_page(page_url)
            except (DocumentExistsException, DocumentExpireException):
                self.logger.info("更新完毕")
                break
            except Exception:
                traceback.print_exc()
                self.update_headers()
                continue

            page += 1
            self.page = page
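
fetch_article_list rebuilds the gzhjs query string from the list-page URL. The same round-trip in Python 3 names (the original uses the Python 2 urlparse module):

    from urllib.parse import parse_qsl, urlencode, urlsplit

    url = 'http://weixin.sogou.com/gzh?openid=demo&ext=demo'
    query_dict = dict(parse_qsl(urlsplit(url).query))
    query_dict['page'] = 3
    print('http://weixin.sogou.com/gzhjs?cb=sogou.weixin_gzhcb&'
          + urlencode(query_dict))
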
Code example #4
    def fetch_channel_json(self, channel_json_url):
        time.sleep(random.randint(30, 60))
        self.logger.info(channel_json_url)
        res = get(channel_json_url, headers=self.headers)
        # http://stackoverflow.com/questions/24027589/how-to-convert-raw-javascript-object-to-python-dictionary
        html = res.text.strip()
        o = ast.literal_eval(html)
        if not o:
            self.logger.info(pprint.pformat(html))
            self.logger.info('fetch channel_json_url: %s failed',
                             channel_json_url)
            change_ip()
            return
        nick_name = o['nick_name']
        general_msg_list = o['general_msg_list']
        # general_msg_list is itself a serialized object, so evaluate it again
        article_list = ast.literal_eval(general_msg_list)['list']
        article_dict_list = []
        for article in article_list:
            app_msg_ext_info = article['app_msg_ext_info']
            comm_msg_info = article['comm_msg_info']
            ori_create_time = comm_msg_info['datetime']

            article_dict_list.append(
                self._get_articel_info(app_msg_ext_info, nick_name,
                                       ori_create_time))
            if app_msg_ext_info['is_multi']:
                for article_info in app_msg_ext_info[
                        'multi_app_msg_item_list']:
                    article_dict_list.append(
                        self._get_articel_info(article_info, nick_name,
                                               ori_create_time))

        article_dict_list = self.get_remove_too_old_days_article(
            article_dict_list)
        article_dict_list = self.get_remove_mongodb_already_has_article(
            nick_name, article_dict_list)

        for article_dict in article_dict_list:
            article_dict['link'] = self.get_permanent_wechat_article_url(
                article_dict['link'])
        self.logger.info(pprint.pformat(article_dict_list))
        self.save_article_dict_list(nick_name, article_dict_list)
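
ast.literal_eval appears twice above because the endpoint returns a JavaScript-style object whose general_msg_list field is itself a serialized object. A small sketch with made-up data:

    import ast

    raw = "{'nick_name': 'demo', 'general_msg_list': \"{'list': []}\"}"
    o = ast.literal_eval(raw)  # safely parses the quasi-JSON literal
    inner = ast.literal_eval(o['general_msg_list'])  # second, nested eval
    print(o['nick_name'], inner['list'])  # -> demo []
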
Code example #5
File: sg.py  Project: PegasusWang/wechannel
    def fetch_channel_json(self, channel_json_url):
        time.sleep(random.randint(60, 120))
        self.logger.info(channel_json_url)
        res = get(channel_json_url, headers=self.headers)
        # http://stackoverflow.com/questions/24027589/how-to-convert-raw-javascript-object-to-python-dictionary
        html = res.text.strip()
        o = ast.literal_eval(html)
        if not o:
            self.logger.debug(pprint.pformat(html))
            self.logger.info(
                'fetch channel_json_url: %s failed', channel_json_url
            )
            change_ip()
            return
        nick_name = o['nick_name']
        general_msg_list = o['general_msg_list']
        article_list = ast.literal_eval(general_msg_list)['list']
        article_dict_list = []
        for article in article_list:
            app_msg_ext_info = article['app_msg_ext_info']
            comm_msg_info = article['comm_msg_info']
            ori_create_time = comm_msg_info['datetime']

            article_dict_list.append(
                self._get_articel_info(
                    app_msg_ext_info, nick_name, ori_create_time
                )
            )
            if app_msg_ext_info['is_multi']:
                for article_info in app_msg_ext_info['multi_app_msg_item_list']:
                    article_dict_list.append(
                        self._get_articel_info(
                            article_info, nick_name, ori_create_time
                        )
                    )

        article_dict_list = self.get_remove_too_old_days_article(
            article_dict_list)
        article_dict_list = self.get_remove_mongodb_already_has_article(
            nick_name, article_dict_list)

        self.logger.info(pprint.pformat(article_dict_list))
        self.save_article_dict_list(nick_name, article_dict_list)
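
get_remove_too_old_days_article is not shown in these examples; presumably it filters on ori_create_time, mirroring the 10-day cutoff in fetch_page (example #10 below). A hedged sketch where the name, signature, and cutoff are all assumptions:

    import time

    def remove_too_old(article_dict_list, too_old_days=10):
        # Keep only articles whose timestamp falls inside the window.
        now = time.time()
        return [a for a in article_dict_list
                if (now - a['ori_create_time']) / 86400.0 <= too_old_days]
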
Code example #6
    def search(self, retries=3):
        """搜索搜狗微信公众号并返回公众号文章列表页面,返回列表格式如下
        http://weixin.sogou.com/gzh?openid=oIWsFt2uCBiQ3mWa2BSUtmdKD3gs&ext=p8lVKENENbkGdvuPNCqHoUqzuLEPtZheP6oyzp3YVsY_-OJEvXMz4yk2nJytyUxY
        """
        if not self.name:
            return
        if self.page > 10:
            self.logger.info("抓取前10页结束: %s" % self.name)
            return None
        query_url = 'http://weixin.sogou.com/weixin?type=1&' + \
            urlencode({'query': self.name})
        self.logger.info('query_url: %s', query_url)

        while retries > 0:
            self.logger.info('retry search %s %d' % (self.name, retries))
            html = get(query_url, headers=self.headers).text
            soup = BeautifulSoup(html, 'html.parser')  # explicit parser avoids the bs4 warning
            item_tag_li = soup.find_all('div',
                                        class_="wx-rb bg-blue wx-rb_v1 _item")
            href = None
            try:
                for item_tag in item_tag_li:
                    _href = item_tag.get('href')
                    _title = item_tag.find(class_='txt-box').h3.text
                    if (_title.strip() == self.name.strip()
                            and '最近文章' in item_tag.get_text()):
                        href = _href
                        break
            except Exception:
                self.logger.info('search for %s failed' % self.name)
                retries -= 1  # decrement before retrying to avoid an infinite loop
                continue

            if href is not None:
                break
            else:
                self.update_headers()
                time.sleep(3)
            retries -= 1

        return href
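
This variant matches result items by their full class string, which BeautifulSoup treats as an exact match on the class attribute. A standalone sketch with made-up markup:

    from bs4 import BeautifulSoup

    html = ('<div class="wx-rb bg-blue wx-rb_v1 _item" href="/gzh?openid=demo">'
            '<div class="txt-box"><h3>demo</h3></div>最近文章</div>')
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', class_="wx-rb bg-blue wx-rb_v1 _item"):
        print(item.get('href'), item.find(class_='txt-box').h3.text)
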
Code example #7
    def get_cookie_str(cls):
        """生成一个搜狗微信的cookie并返回
        """
        while True:
            time.sleep(5)
            url = 'http://weixin.sogou.com/weixin?query=%s' % \
                random.choice('abcdefghijklmnopqrstuvwxyz')

            # Fetch the SNUID cookie
            resp = get(url, headers=cls.get_headers())
            headers = resp.headers
            try:
                cookie_str = headers.get('Set-Cookie') + '; ' + \
                    SougouWechat.getSUV()
            except Exception:
                cookie_str = None

            cls.logger.info('cookie_str: %s' % cookie_str)
            # Skip responses that did not set SNUID
            if cookie_str and 'SUID' in cookie_str and 'SNUID' in cookie_str:
                return cookie_str
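
Instead of splicing the raw Set-Cookie header, requests also exposes the parsed cookie jar; a sketch of checking for SNUID that way (the project's getSUV helper is not reproduced here):

    import requests

    resp = requests.get('http://weixin.sogou.com/weixin?query=a')
    snuid = resp.cookies.get('SNUID')  # None if the server did not set it
    print(snuid)
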
Code example #8
File: sg.py  Project: PegasusWang/wechannel
    def get_cookie_str(cls):
        """生成一个搜狗微信的cookie并返回
        """
        while True:
            time.sleep(random.randint(30, 60))
            url = 'http://weixin.sogou.com/weixin?query=%s' % \
                random.choice('abcdefghijklmnopqrstuvwxyz')

            # Fetch the SNUID cookie
            resp = get(url, headers=cls.get_headers())
            headers = resp.headers
            try:
                cookie_str = headers.get('Set-Cookie') + '; ' + \
                    SougouWechat.getSUV()
            except Exception:
                cookie_str = None

            cls.logger.info('cookie_str: %s' % cookie_str)
            # Skip responses that did not set SNUID
            if cookie_str and 'SUID' in cookie_str and 'SNUID' in cookie_str:
                return cookie_str
Code example #9
File: morningstar.py  Project: sjl421/pyhome
def fetch_parse():
    url = 'http://cn.morningstar.com/handler/fundranking.ashx?date=2016-04-08&fund=&category=mix_radical&rating=&company=&cust=&sort=Return2Year&direction=desc&tabindex=1&pageindex=1&pagesize=10000&randomid=0.043611296370827723'
    html = get(url).text
    parse_html(html)
Code example #10
    def fetch_page(self, page_url):
        """拿到单个文章页面,在文章url里加上参数f=json可以直接得到json格式
        的数据,处理json拿到需要的字段。
        """
        if self.col.find(dict(nick_name=self.name)).count() > self.limit:
            oldest_doc = list(
                self.col.find(dict(nick_name=self.name)).sort([
                    ('ori_create_time', 1)
                ]).limit(1))[0]
            oldest_doc_id = oldest_doc.get('_id')
            self.col.remove({'_id': oldest_doc_id})
            self.logger.info(
                "%s:删除:%s : %s\n" %
                (self.name, oldest_doc.get('title'),
                 datestr_from_stamp(oldest_doc.get('ori_create_time'),
                                    '%Y-%m-%d')))

        # First resolve the Sogou redirect to the real WeChat article URL
        pre_r = get(page_url, headers=self.headers)
        wechat_url = pre_r.url.split('#')[0] + '&f=json'

        if 'mp.weixin' not in wechat_url:
            return

        r = get(wechat_url, headers=self.headers)
        self.logger.info(wechat_url)
        if self.col.find_one(dict(nick_name=self.name, url=wechat_url)):
            raise DocumentExistsException("article exist")

        if r.status_code != 200:
            return

        o = json.loads(r.text)
        if o.get('title') is None:  # missing when the article was taken down after a complaint; skip
            return

        fields = {
            'cdn_url', 'nick_name', 'title', 'content', 'desc', 'link',
            'ori_create_time'
        }
        media_fields = {'round_head_img', 'nick_name', 'signature'}
        media_dict = {k: o.get(k) for k in media_fields}
        article_dict = {k: o.get(k) for k in fields}

        if self.col.find_one(dict(nick_name=self.name, title=o['title'])):
            raise DocumentExistsException("article exist")

        too_old_days = 10
        if days_from_now(o['ori_create_time']) > too_old_days:  # skip articles older than 10 days
            self.logger.info('%s skipping article older than %d days, title: %s\n',
                             self.name, too_old_days, o['title'])
            raise DocumentExpireException("expire")

        if o['title'] and o['content']:
            o_date = datestr_from_stamp(o.get('ori_create_time'), '%Y-%m-%d')
            self.logger.info('%s saving article, title: %s %s\n', self.name,
                             o['title'], o_date)

            article_dict['nick_name'] = self.name
            article_dict['url'] = wechat_url
            article_dict['tag_id'] = self.tag_id
            del article_dict['content']
            self.col.update({'_id': gid()}, {'$set': article_dict}, True)

        # http://mp.weixin.qq.com/s?__biz=MjM5NjAxMDc4MA==&mid=404900944&idx=1&sn=fe2d53ce562ee51e7163a60d4c95484a#rd
        biz = extract('__biz=', '==', article_dict['link'])
        self.media_col.update({'_id': biz}, {'$set': media_dict}, True)
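
extract('__biz=', '==', ...) is a project helper that is not shown; presumably it pulls the substring between the two markers. A regex equivalent of that assumption, using the sample URL from the comment above:

    import re

    link = ('http://mp.weixin.qq.com/s?__biz=MjM5NjAxMDc4MA=='
            '&mid=404900944&idx=1&sn=fe2d53ce562ee51e7163a60d4c95484a#rd')
    m = re.search(r'__biz=(.+?)==', link)
    print(m.group(1))  # -> MjM5NjAxMDc4MA
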
Code example #11
    def get(self, *args, **kwargs):
        return get(*args, **kwargs)  # use web_util get