예제 #1
0
    def getArticleList(self, num=None):
        """
        Collect article records for the account named by ``self.ori_nickname``.

        :param num: when truthy, only the ``num`` most recently published
            articles (sorted by ``p_date`` descending) are returned; when
            ``None`` or ``0`` every stored article is returned.
        :return: a list with one dict per article containing every key listed
            in ``l1llll1lll_wcplus_``; a key missing from the stored document
            is filled with the placeholder value ``-2``.
        """
        from pymongo import DESCENDING
        col = DB(self.ori_nickname)
        if num:
            # A cursor is not callable, so the former ``.sort(...)()[:num]``
            # raised TypeError.  Let the server cap the sorted result instead.
            article_datas = col.table.find().sort(
                'p_date', DESCENDING).limit(int(num))
        else:
            article_datas = col.get()
        begin_time = time.time()
        article_list = []
        for doc in article_datas:
            # 'id' is derived from the article URL before the keys are copied,
            # so it is available if 'id' is one of the requested keys.
            doc['id'] = doc['content_url']
            article_list.append(
                {key: doc.get(key, -2) for key in l1llll1lll_wcplus_})

        logger.info('解析文章文本用时 %.3f' % (time.time() - begin_time))
        return article_list
예제 #2
0
 def __init__(self):
     """
     Load the request parameter sets and derive the per-request delay.

     The delay is a fixed budget of 3.0 divided by the number of parameter
     sets returned by ``rd.articles_detail()``, rounded to 3 decimals
     (presumably seconds -- confirm against the code that reads
     ``self.delay``).
     """
     details = rd.articles_detail()
     self.l1l11ll1l_wcplus_ = details
     # The nickname of the first parameter set names the DB handle below.
     self.nickname = details[0]['nickname']
     budget = 3.0
     self.l11ll1l1ll_wcplus_ = budget
     detail_count = len(details)
     self.l11ll1l111_wcplus_ = detail_count
     self.delay = round(budget / detail_count, 3)
     self.l11lll11l_wcplus_ = list()
     self.l11ll111l_wcplus_ = DB(self.nickname)
     self.l11ll1lll1_wcplus_ = time.time()
예제 #3
0
 def __init__(self):
     """
     Load the request parameter sets and initialise the crawl state.

     ``self.delay`` is ``self.delay_criterion`` divided by the number of
     parameter sets returned by ``rd.articles_detail()``, rounded to 3
     decimals.  ``self.length`` starts at 0, as before.
     """
     self.articles_detail = rd.articles_detail()
     self.nickname = self.articles_detail[0]['nickname']
     self.delay_criterion = 2.0
     # The original stored this count in ``self.length`` and then reset
     # ``self.length`` to 0 twice, losing the value.  Keep it under its own
     # private name so it survives the counter reset.
     self._account_count = len(self.articles_detail)
     self.delay = round(self.delay_criterion / self._account_count, 3)
     # Counter, starts at zero (the duplicated assignment was redundant).
     self.length = 0
     self.data = []
     self.nickname_table = DB(self.nickname)
     self.time_now = time.time()
예제 #4
0
def l1l1llll1ll_wcplus_(nickname, md5):
    """
    Open the locally saved HTML copy of an article in the default browser.

    :param nickname: account nickname; selects the DB handle and the
        sub-directory below ``web_server/static/html``.
    :param md5: article id; the saved page is expected at ``<md5>.html``.
    :return: ``('', 204)`` after the browser was asked to open the page,
        otherwise a user-facing message saying why nothing was opened
        (article not stored in the DB, or HTML file missing on disk).
    """
    import os
    # Import the module rather than ``open`` so the builtin is not shadowed.
    import webbrowser
    from cmp.db.mongodb import DB

    # Previously this message was returned after a *successful* open on
    # Windows, and the not-stored case fell off the end returning None.
    if not DB(nickname).count(id=md5, comment_id={'$exists': True}):
        return '未保存该文章 请先采集'

    # NOTE(review): nickname and md5 are joined into a filesystem path as-is;
    # if they can come from an untrusted request they should be validated.
    file_name = os.path.join(
        os.getcwd(), 'web_server', 'static', 'html', nickname, md5 + '.html')
    if not os.path.isfile(file_name):
        return '找不到该文章 可能是没有迁移到新版本的WCplus 请先从旧版本的WCplus中复制或移动到新版本的WCplus的web_server/static/html目录下'

    if os_version == 'win':
        webbrowser.open(file_name)
    else:
        webbrowser.open('file://' + file_name)
    return ('', 204)
예제 #5
0
def l1ll1111l_wcplus_(_1l1l1ll1l_wcplus_, l1ll11lll_wcplus_=128, process=None):
    """
    Reset the module-level state for the given nickname and run one job.

    Stores ``process``, a fresh empty list, the nickname and a
    ``DB(nickname)`` handle in module globals, then creates an
    ``l1ll11lll1_wcplus_`` object and calls its preparation method.  If that
    method returns a falsy value nothing else happens; otherwise the two
    follow-up methods and ``l1l1ll1l11_wcplus_.l1l11ll11l_wcplus_()`` run in
    order.  Always returns ``None``.
    """
    global l11ll111l_wcplus_, l1l1l1111l_wcplus_, l1ll1l11ll_wcplus_, nickname
    l1ll1l11ll_wcplus_ = process
    l1l1l1111l_wcplus_ = list()
    nickname = _1l1l1ll1l_wcplus_
    l11ll111l_wcplus_ = DB(nickname)
    runner = l1ll11lll1_wcplus_()
    prepared = runner.l1l1l1l11l_wcplus_(
        nickname,
        l1ll11lll_wcplus_=l1ll11lll_wcplus_,
        l1l1111ll1_wcplus_=1,
        l1l1llll11_wcplus_=l1l11lll11_wcplus_(),
    )
    if prepared:
        runner.l1ll1l11l1_wcplus_()
        runner.l1l111ll11_wcplus_()
        l1l1ll1l11_wcplus_.l1l11ll11l_wcplus_()
예제 #6
0
def l1l1ll1ll_wcplus_(l1ll11lll_wcplus_=128, process=None):
    """
    Reset the module-level state for the current account and run one job.

    The nickname is read from the first entry of ``rd.articles_detail()``.
    ``process``, a fresh empty list, the nickname and a ``DB(nickname)``
    handle are stored in module globals, then an ``l1ll11lll1_wcplus_``
    object is created and its preparation method is called.  If that method
    returns a falsy value nothing else happens; otherwise the two follow-up
    methods and ``l1l1ll1l11_wcplus_.l1l11ll11l_wcplus_()`` run in order.
    Always returns ``None``.
    """
    global l11ll111l_wcplus_, l1l1l1111l_wcplus_, l1ll1l11ll_wcplus_, nickname
    l1ll1l11ll_wcplus_ = process
    l1l1l1111l_wcplus_ = list()
    from instance import rd
    first_detail = rd.articles_detail()[0]
    nickname = first_detail['nickname']
    l11ll111l_wcplus_ = DB(nickname)
    runner = l1ll11lll1_wcplus_()
    prepared = runner.l1l1l1l11l_wcplus_(
        nickname,
        l1ll11lll_wcplus_=l1ll11lll_wcplus_,
        l1l1111ll1_wcplus_=1,
        l1l1llll11_wcplus_=l1l11lll11_wcplus_(),
    )
    if prepared:
        runner.l1ll1l11l1_wcplus_()
        runner.l1l111ll11_wcplus_()
        l1l1ll1l11_wcplus_.l1l11ll11l_wcplus_()
예제 #7
0
# uncompyle6 version 3.2.6
# Python bytecode 3.6 (3379)
# Decompiled from: Python 3.6.6 (default, Mar 29 2019, 00:03:27)
# [GCC 4.8.5 20150623 (Red Hat 4.8.5-36)]
# Embedded file name: instance\__init__.py
"""
Define the global (shared) objects.
"""
# NOTE(review): imports are interleaved with object construction; the order
# may be significant (e.g. to avoid circular imports) -- confirm before
# regrouping them at the top of the module.
from cmp.db.mongodb import DB
# DB handles constructed with the names 'crawler_log' and 'req_data'.
crawler_log_table_instance = DB('crawler_log')
req_data_table_instance = DB('req_data')
from app.crawl_manag.params_handle import HandleParam
# Shared HandleParam instance; imported elsewhere as ``rd``.
rd = HandleParam()
from app.api.settings import l11l111ll_wcplus_
# Shared instance of the settings class from app.api.settings.
l1l1111ll_wcplus_ = l11l111ll_wcplus_()
from app.crawl_manag import CrawlManage
# Shared CrawlManage instance.
crawler_manager = CrawlManage()
from utils.base import osVersion
# Result of osVersion(), evaluated once at import time.
os_version = osVersion()
예제 #8
0
# uncompyle6 version 3.2.6
# Python bytecode 3.6 (3379)
# Decompiled from: Python 3.6.6 (default, Mar 29 2019, 00:03:27)
# [GCC 4.8.5 20150623 (Red Hat 4.8.5-36)]
# Embedded file name: app\api\settings.py
"""
Provides the settings-data API.
"""
from cmp.db.mongodb import DB

# Module-level DB handle constructed with the name 'settings'; created once
# at import time.
setting_table_instance = DB('settings')


class l11l111ll_wcplus_:
    # NOTE(review): this is named ``__int__`` (the hook used by ``int(obj)``),
    # not ``__init__``.  With an empty body it looks like a typo for the
    # constructor -- confirm before renaming.  As written it has no effect on
    # object construction.
    def __int__(self):
        pass

    def get(self):
        """
        :return: 获取所有的设置字段{}
        """
        sd = setting_table_instance.get()
        datas_dict = {}
        for s in sd:
            datas_dict[s['key']] = s['value']

        from cmp.protect import Passport
        from utils.network import getLocalIp
        datas_dict['uuid'] = Passport.getUUid()
        passport_expire_time = Passport.examplePassport()
        if not passport_expire_time:
예제 #9
0
 def l111l1l1l_wcplus_(self):
     """Return the result of ``get()`` on a ``DB`` handle for ``self.nickname``."""
     return DB(self.nickname).get()
예제 #10
0
class CrawlHistoryAricleList:
    """
    Crawl the complete history article list of one official account.

    Request parameter sets come from ``rd.articles_detail()``; requests are
    rotated across those sets and every batch is stored through the ``DB``
    handle named after the account's nickname.
    """

    def __init__(self):
        self.articles_detail = rd.articles_detail()
        self.nickname = self.articles_detail[0]['nickname']
        self.delay_criterion = 2.0
        # Number of parameter sets to rotate over.  The decompiled source kept
        # this in ``self.length`` and then reset ``self.length`` to 0, which
        # made ``cnt % self.length`` a modulo-by-zero on the first request.
        self._account_count = len(self.articles_detail)
        self.delay = round(self.delay_criterion / self._account_count, 3)
        # Articles fetched so far, counted before filtering.
        self.length = 0
        # Articles that survived filtering so far.
        self._kept_count = 0
        self.data = []
        self.nickname_table = DB(self.nickname)
        self.time_now = time.time()

    def parseHandleArticleList(self, filter=None, process=None):
        """
        Fetch the history article list batch by batch and store it.

        :param filter: filter dict understood by :meth:`checkFIlter` (by
            count or by time range); ``None`` means no filtering.
        :param process: optional progress reporter; when given, its
            ``reportCrawlNum`` is called after every batch.
        :return: None.  Crawling stops when the server reports no further
            messages, when the filter says stop, or when ``save`` returns
            ``'UPDATE'``.
        """
        if 'load_more' not in self.articles_detail[0]:
            logger.warning('没有上滑加载更多历史文章')
            return

        offset = 0
        cnt = 0
        can_continue = 1
        while can_continue:
            # Throttle: wait until at least ``self.delay`` has elapsed since
            # the previous request.
            while time.time() - self.time_now <= self.delay:
                time.sleep(0.05)
            self.time_now = time.time()

            params = self.articles_detail[cnt % self._account_count]
            article_list = Crawler(offset, params).run()
            article_list = self.check(article_list, offset, cnt)
            # Keep the server's flag separate from the filter verdict; the
            # original overwrote it, so the loop never saw "no more messages".
            can_continue = int(article_list['des']['can_msg_continue'])
            offset = int(article_list['des']['next_offset'])
            cnt += 1

            self.data = article_list['data']
            # Count the batch once before filtering (the count filter reads
            # ``self.length``) and once after filtering into its own counter;
            # the original added both into ``self.length``.
            self.length += len(self.data)
            keep_going = self.checkFIlter(filter)
            self._kept_count += len(self.data)
            crawler_log_table_instance.insert('id', {'id':self.nickname, 'num':self.length, 'nickname':self.nickname, 'time':datetime.now()})
            if process is not None:
                process.reportCrawlNum(self._kept_count)
            if self.save(self.data) == 'UPDATE':
                break
            if not keep_going or not can_continue:
                break
            time.sleep(self.delay)

    def save(self, l1l11111ll_wcplus_):
        """
        Store one batch of articles.

        :param l1l11111ll_wcplus_: the batch to insert.
        :return: whatever ``self.nickname_table.insert('id', ...)`` returns.
        """
        return self.nickname_table.insert('id', l1l11111ll_wcplus_)

    def checkFIlter(self, filter):
        """
        Decide whether crawling should continue after the current batch.

        :param filter: ``None`` (no filtering), or a dict.  With
            ``filter['type'] == 'true'`` the crawl is limited to
            ``filter['num']`` articles (``0`` means unlimited).  Otherwise
            ``self.data`` is reduced to the articles whose ``p_date``
            timestamp lies within ``[start_time, end_time]``.
        :return: True to continue, False to stop.
        """
        if filter is None:
            return True

        if filter['type'] == 'true':
            limit = int(filter['num'])
            return limit == 0 or self.length < limit

        kept = []
        keep_going = True
        for article in self.data:
            published = article['p_date'].timestamp()
            if filter['start_time'] <= published <= filter['end_time']:
                kept.append(article)
            elif published < filter['start_time']:
                # An article older than the window was seen: stop afterwards.
                keep_going = False

        self.data = kept
        return keep_going

    def check(self, flag, offset, cnt):
        """
        Validate a request result and retry until a usable one is obtained.

        :param flag: the value returned by ``Crawler(...).run()``.
        :param offset: offset of the request, reused for the retries.
        :param cnt: request counter (unused here, kept for the interface).
        :return: the first result that is not ``'req_data_error'``.  The
            original recursed but discarded the recursive result, so a
            failed retry was handed back to the caller.
        """
        while flag == 'req_data_error':
            crawler_manager.check({'crawler': '历史文章列表', 'msg': 'req_data_error'})
            self.articles_detail = rd.articles_detail()
            if not self.articles_detail:
                from utils.front import sendNotification
                while not self.articles_detail:
                    sendNotification('没有发现参数', '参数错误', _type='error')
                    time.sleep(3)
                    self.articles_detail = rd.articles_detail()
            # The parameter list was reloaded; keep the rotation modulus in
            # step with it so later indexing cannot run past the end.
            self._account_count = len(self.articles_detail)
            flag = Crawler(offset, self.articles_detail[0]).run()

        crawler_manager.check({'crawler': '历史文章列表', 'msg': 'success'})
        return flag
예제 #11
0
class l1l1ll11l_wcplus_:
    """
    Collect per-article data for one account, rotating across the request
    parameter sets returned by ``rd.articles_detail()``.
    """

    def __init__(self):
        self.l1l11ll1l_wcplus_ = rd.articles_detail()
        self.nickname = self.l1l11ll1l_wcplus_[0]['nickname']
        self.l11ll1l1ll_wcplus_ = 3.0
        # Number of parameter sets; used as the rotation modulus.
        self.l11ll1l111_wcplus_ = len(self.l1l11ll1l_wcplus_)
        self.delay = round(self.l11ll1l1ll_wcplus_ / self.l11ll1l111_wcplus_,
                           3)
        # Task list of [index, content_url, comment_id] entries.
        self.l11lll11l_wcplus_ = []
        self.l11ll111l_wcplus_ = DB(self.nickname)
        # Time of the previous request, used for throttling.
        self.l11ll1lll1_wcplus_ = time.time()

    def l1l1l1l11_wcplus_(self, process=None, mov=10):
        """
        Fetch data for every stored article that has no ``read_num`` yet.

        :param process: optional progress reporter; when given, its
            ``l11l1lll1_wcplus_`` is called after every article.
        :param mov: upper bound for the stored ``mov`` field (the original
            docstring gives the range 10~17).
        :return: None.  Each result is stored with ``insert('id', ...)``.
        """
        if 'getappmsgext' not in self.l1l11ll1l_wcplus_[0]:
            logger.warning('点击查看该公众号的任意一篇文章且出现阅读量')
            return

        pending = self.l11ll111l_wcplus_.table.find({
            '$and': [{
                'read_num': {
                    '$exists': False
                }
            }, {
                'mov': {
                    '$lte': int(mov)
                }
            }]
        })
        cnt = 0
        for a in pending:
            if 'mp.weixin.qq.com' in a['content_url']:
                # A missing comment_id is represented as 0.
                self.l11lll11l_wcplus_.append(
                    [cnt, a['content_url'], a.get('comment_id', 0)])
                cnt += 1

        total = len(self.l11lll11l_wcplus_)
        for task in self.l11lll11l_wcplus_:
            index, url, comment_id = task
            # Throttle: wait until ``self.delay`` has elapsed since the
            # previous request.
            while time.time() - self.l11ll1lll1_wcplus_ <= self.delay:
                time.sleep(0.05)

            self.l11ll1lll1_wcplus_ = time.time()
            params = self.l1l11ll1l_wcplus_[index % self.l11ll1l111_wcplus_]
            result = l11lll1l1l_wcplus_(url, comment_id, params).run()
            result = self.check(result, task)
            result['id'] = encryptString(url)
            self.l11ll111l_wcplus_.insert('id', result)
            # ``process`` defaults to None; only report when one was given.
            if process is not None:
                process.l11l1lll1_wcplus_(index + 1, total, self.delay)

    def save(self, l1ll1l1l1_wcplus_):
        """
        Placeholder for storing a result; currently does nothing.

        :param l1ll1l1l1_wcplus_: the result that would be stored.
        :return: None.
        """
        pass

    def l11111l11_wcplus_(self):
        """
        Yield one task dict per queued article (for the threaded mode).

        :return: generator of ``{'index': ..., 'url': ...}`` dicts.
        """
        for item in self.l11lll11l_wcplus_:
            yield {'index': item[0], 'url': item[1]}

    def l1llllll1l_wcplus_(self, task):
        """
        Task handler for the threaded mode.

        :param task: a dict as yielded by ``l11111l11_wcplus_``.
        :return: None; the result of ``run()`` is not used.
        """
        # NOTE(review): every other call site passes three arguments
        # (url, comment_id, params); here only url and params are passed.
        # Confirm against the constructor's signature.
        l11lll1l1l_wcplus_(
            task['url'],
            self.l1l11ll1l_wcplus_[task['index'] %
                                   self.l11ll1l111_wcplus_]).run()

    def check(self, l1ll1l1l1_wcplus_, item):
        """
        Validate a request result and retry until a usable one is obtained.

        :param l1ll1l1l1_wcplus_: the value returned by ``run()``.
        :param item: the ``[index, content_url, comment_id]`` task, reused
            for the retries.
        :return: the first result that is not ``'req_data_error'``.  The
            original recursed but discarded the recursive result, so a
            failed retry was handed back to the caller.
        """
        result = l1ll1l1l1_wcplus_
        while result == 'req_data_error':
            crawler_manager.check({'crawler': '阅读数据', 'msg': 'req_data_error'})
            self.l1l11ll1l_wcplus_ = rd.articles_detail()
            if not self.l1l11ll1l_wcplus_:
                from utils.front import sendNotification
                while not self.l1l11ll1l_wcplus_:
                    sendNotification('没有发现参数', '参数错误', _type='error')
                    time.sleep(3)
                    self.l1l11ll1l_wcplus_ = rd.articles_detail()
            # The parameter list was reloaded; keep the rotation modulus in
            # step with it so later indexing cannot run past the end.
            self.l11ll1l111_wcplus_ = len(self.l1l11ll1l_wcplus_)
            result = l11lll1l1l_wcplus_(
                item[1], item[2], self.l1l11ll1l_wcplus_[0]).run()

        crawler_manager.check({'crawler': '阅读数据', 'msg': 'success'})
        return result