Ejemplo n.º 1
0
 def search(self):
     """
     Execute the search against Elasticsearch.

     Builds the request body from the shared search template and the
     preprocessed query data, applies optional ``_source`` filtering
     and from/size pagination, then searches ``self.index_list``
     (all indices when the list is empty).

     :return: the ``hits`` section of the Elasticsearch response, or
              False when the search fails (e.g. a missing index).
     """
     st = deepcopy(search_template)
     st.update(self.search_data_preprocess())
     # restrict returned fields when a _source filter is configured
     if self.source is not None:
         st["_source"] = self.source
     # pagination: copy from/size only when both keys are available
     # (narrowed from a bare except so unrelated errors propagate)
     try:
         st["from"] = self.from_size["from"]
         st["size"] = self.from_size["size"]
     except (KeyError, TypeError):
         logger.warning("from_size字段错误 %s"%(str(self.from_size)))
     # an empty index list means: search every index
     indices = self.index_list if self.index_list else '*'
     try:
         return es_instance.search(index=indices, body=st)['hits']
     except Exception as e:
         print(e)
         logger.error("搜索错误 可能是有指定了不存在的搜索范围没有建立索引%s"%(str(indices)))
         return False
Ejemplo n.º 2
0
 def post(self):
     """
     Handle the search POST request.

     Maps the ``fields`` code (1=title, 2=digest, 3=article, anything
     else=all three) onto a concrete field list, expands ``range``
     into an index pattern and delegates to the search implementation.

     :return: the search result, or a timeout warning string on failure
     """
     args = parser.parse_args()
     # translate the numeric fields code into actual field names;
     # flattened the original nested else/if pyramid into elif
     if args['fields'] == '1':
         args['fields'] = ['title']
     elif args['fields'] == '2':
         args['fields'] = ['digest']
     elif args['fields'] == '3':
         args['fields'] = ['article']
     else:
         args['fields'] = ['title', 'digest', 'article']
     # expand the official-account range into an index pattern
     if args['range'] == '全部':
         args['range'] = 'gzh_*'
     else:
         args['range'] = 'gzh_' + args['range']
     from app.search.search import l1ll111l1_wcplus_
     try:
         result = (l1ll111l1_wcplus_(l1lll1l11l_wcplus_=args['search_data'], l1l1l1lll_wcplus_=args['range'],
           fields=args['fields'],
           _1lll1l1l1_wcplus_=int(args['from']),
           _1lll11ll1_wcplus_=int(args['size']))).get_result()
         return result
     except Exception:
         # narrowed from a bare except so KeyboardInterrupt propagates
         from utils.base import logger
         logger.warning('搜索请求超时 建议多次尝试')
         return '搜索请求超时 建议多次尝试'
Ejemplo n.º 3
0
    def l1lll111l_wcplus_(self, filter=None, process=None):
        """
        Fetch every page of the account's historical article list.

        Rotates through the WeChat request parameter sets until the
        server reports no more pages, the filter stops the crawl, or
        an already-saved page is encountered.

        :param filter: filter spec, e.g. by time or by article count
        :param process: front-end progress reporter instance
        :return: None; pages are persisted via ``self.save``
        """
        offset = 0
        l11ll1l11l_wcplus_ = 1  # loop flag fed from 'can_msg_continue'
        cnt = 0
        # 'load_more' present means the user triggered "load more"
        if 'load_more' in self.l1l11ll1l_wcplus_[0]:
            while l11ll1l11l_wcplus_:
                # throttle: at most one request per `delay` seconds
                while time.time() - self.l11ll1lll1_wcplus_ <= self.delay:
                    time.sleep(0.05)

                self.l11ll1lll1_wcplus_ = time.time()
                # round-robin over the available request parameter sets
                l1l11111ll_wcplus_ = l11lll1l1l_wcplus_(offset, self.l1l11ll1l_wcplus_[cnt % self.l11ll1l111_wcplus_]).run()
                # only use data that passed the sanity check
                l1l11111ll_wcplus_ = self.check(l1l11111ll_wcplus_, offset, cnt)
                l11ll1l11l_wcplus_ = int(l1l11111ll_wcplus_['des']['can_msg_continue'])
                offset = int(l1l11111ll_wcplus_['des']['next_offset'])
                cnt += 1
                self.l11ll1ll1l_wcplus_ = l1l11111ll_wcplus_['data']
                self.l11ll11ll_wcplus_ += len(self.l11ll1ll1l_wcplus_)
                l11lll111l_wcplus_ = self.l11ll1ll11_wcplus_(filter)
                self.l11lll1111_wcplus_ += len(self.l11ll1ll1l_wcplus_)
                # record crawl progress keyed by the account nickname
                l1l1l11l1_wcplus_.insert('id', {'id':self.nickname,  'num':self.l11lll1111_wcplus_,  'nickname':self.nickname,  'time':datetime.now()})
                process.l11l1ll1l_wcplus_(self.l11lll1111_wcplus_)
                # 'UPDATE' means this page was already stored: stop here
                if self.save(self.l11ll1ll1l_wcplus_) == 'UPDATE':
                    break
                if not l11lll111l_wcplus_:
                    break
                time.sleep(self.delay)

        else:
            logger.warning('没有上滑加载更多历史文章')
Ejemplo n.º 4
0
 def l1l11l111_wcplus_(cls, q=True):
     """
     Validate the license certificate.

     :param q: quiet flag; when False, log diagnostic messages
     :return: False when the certificate is missing, invalid or
              expired; otherwise the expiry date string
              'YYYY-MM-DD HH:MM:SS'.
     """
     # mac / pt are read from the license file helper
     mac, pt = cls.l1lll1ll1l1_wcplus_()
     if not mac and not pt:
         return False
     else:
         l1lll1l1l11_wcplus_ = cls.l11l111l1_wcplus_()
         l1lll1l1ll1_wcplus_ = l1lll1l1lll_wcplus_()
         if l1lll1l1l11_wcplus_ == 1:
             return False
         # licensed mac must be among this machine's identifiers
         # (presumably MAC/UUID values — see the named siblings)
         if int(mac) not in l1lll1l1ll1_wcplus_:
             if not q:
                 logger.warning('证书错误')
             return False
         # pt encodes mac + expiry timestamp + a fixed salt
         end_time = int(pt) - int(mac) - 12874767561234
         l1lll1ll111_wcplus_ = l1lll1l11ll_wcplus_()
         # no external time source available: cannot validate
         if not l1lll1ll111_wcplus_:
             return False
         l1lll1l11l1_wcplus_ = end_time - l1lll1l11ll_wcplus_()
         if l1lll1l11l1_wcplus_ <= 0:
             if not q:
                 logger.warning('证书过期')
             return False
         l1lll1l1l1l_wcplus_ = datetime.utcfromtimestamp(end_time).strftime(
             '%Y-%m-%d %H:%M:%S')
         if not q:
             logger.info('证书有效至' + l1lll1l1l1l_wcplus_)
         return l1lll1l1l1l_wcplus_
Ejemplo n.º 5
0
 def check_password(cls, q=True):
     """
     Certificate validity check (currently bypassed).

     :param q: quiet flag; when False, diagnostics would be logged
     :return: the fixed expiry date string '2099-12-31 00:00:00'
     """
     # NOTE(review): the real license verification (reading mac/pt
     # from the certificate, comparing against the machine's MAC
     # addresses and internet time) was dead code sitting after this
     # unconditional return and has been removed; restore it from
     # version control if the check must be re-enabled.
     return '2099-12-31 00:00:00'
Ejemplo n.º 6
0
 def examplePassport(cls, q=True):
     """
     Validate the license certificate.

     :param q: quiet flag; when False, log diagnostics
     :return: False when the certificate is absent, invalid or
              expired; otherwise the expiry date string
              'YYYY-MM-DD HH:MM:SS'.
     """
     mac, pt = cls.getMacUUid()
     if not mac and not pt:
         return False
     uid = cls.getUUid()
     machine_ids = getUuidIp()
     if uid == 1:
         return False
     # the licensed mac must be among this machine's identifiers
     if int(mac) not in machine_ids:
         if not q:
             logger.warning('证书错误')
         return False
     # the passport value encodes mac + expiry + a fixed salt
     end_time = int(pt) - int(mac) - 12874767561234
     baidu_time = getBaiduTime()
     if not baidu_time:
         return False
     # expired when the expiry timestamp is not in the future
     if end_time - getBaiduTime() <= 0:
         if not q:
             logger.warning('证书过期')
         return False
     end_time = datetime.utcfromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S')
     if not q:
         logger.info('证书有效至' + end_time)
     return end_time
Ejemplo n.º 7
0
    def parseHandleArticleList(self, filter=None, process=None):
        """
        Fetch every page of the account's historical article list.

        :param filter: filter spec, e.g. by time or by article count
        :param process: front-end progress reporter instance
        :return: None; pages are persisted via ``self.save``
        """
        offset = 0
        flag = 1  # loop flag fed from 'can_msg_continue'
        cnt = 0
        # 'load_more' present means the user triggered "load more"
        if 'load_more' in self.articles_detail[0]:
            while flag:
                # throttle: at most one request per `delay` seconds
                while time.time() - self.time_now <= self.delay:
                    time.sleep(0.05)

                self.time_now = time.time()
                # round-robin over the available request parameter sets
                article_list = Crawler(offset, self.articles_detail[cnt % self.length]).run()
                article_list = self.check(article_list, offset, cnt)
                flag = int(article_list['des']['can_msg_continue'])
                offset = int(article_list['des']['next_offset'])
                cnt += 1
                self.data = article_list['data']
                # NOTE(review): self.length is used both as the modulo
                # divisor above and as a running counter here; sibling
                # implementations keep these as separate attributes.
                # Looks like a decompilation artifact — verify before
                # relying on this method.
                self.length += len(self.data)
                # NOTE(review): this overwrites the can_msg_continue
                # flag with the filter result, unlike the sibling
                # version which tracks them separately — confirm.
                flag = self.checkFIlter(filter)
                self.length += len(self.data)
                # record crawl progress keyed by the account nickname
                crawler_log_table_instance.insert('id', {'id':self.nickname, 'num':self.length, 'nickname':self.nickname, 'time':datetime.now()})
                process.reportCrawlNum(self.length)
                # 'UPDATE' means this page was already stored: stop
                if self.save(self.data) == 'UPDATE':
                    break
                if not flag:
                    break
                time.sleep(self.delay)

        else:
            logger.warning('没有上滑加载更多历史文章')
Ejemplo n.º 8
0
 def post(self):
     """
     Handle the search POST request.

     Resolves the fields code (1=title, 2=digest, 3=article,
     otherwise all three), expands the account range into an index
     pattern and runs the search.

     :return: the search result, or a timeout warning string
     """
     args = parser.parse_args()
     # dispatch table: fields code -> concrete field names
     field_map = {'1': ['title'], '2': ['digest'], '3': ['article']}
     args['fields'] = field_map.get(args['fields'], ['title', 'digest', 'article'])
     # expand the official-account range into an index pattern
     args['range'] = 'gzh_*' if args['range'] == '全部' else 'gzh_' + args['range']
     from app.search.search import GZHSearch
     try:
         return GZHSearch(search_data=args['search_data'],
                          gzhs=args['range'],
                          fields=args['fields'],
                          _from=int(args['from']),
                          _size=int(args['size'])).get_result()
     except:
         from utils.base import logger
         logger.warning('搜索请求超时 建议多次尝试')
         return '搜索请求超时 建议多次尝试'
Ejemplo n.º 9
0
    def get_all_reading_data(self, filter=None, process=None):
        """
        Collect reading statistics for every article that has none yet.

        Rotates through the WeChat request parameter sets in
        ``self.wx_req_data_list`` and stores each article's reading
        data under an id derived from its url.

        :param filter: unused here; kept for interface symmetry
        :param process: front-end progress reporter
        """
        if 'getappmsgext' not in self.wx_req_data_list[0]:
            logger.warning('点击查看该公众号的任意一篇文章且出现阅读量')
            return

        # cache [index, url, comment_id] triples for articles that
        # have no read_num yet
        pending = self.col_data.get(read_num={'$exists': False})
        idx = 0
        for doc in pending:
            if 'mp.weixin.qq.com' not in doc['content_url']:
                continue
            # comment_id may be missing when reading data is crawled
            # before the article body
            if 'comment_id' not in doc:
                doc['comment_id'] = 0
            self.articles.append([idx, doc['content_url'], doc['comment_id']])
            idx += 1

        for task in self.articles:
            # throttle: at most one request per `delay` seconds
            while time.time() - self.pre_crawl_time <= self.delay:
                time.sleep(0.05)
            self.pre_crawl_time = time.time()
            record = Crawler(task[1], task[2],
                             self.wx_req_data_list[task[0] % self.wx_num]).run()
            # only keep data that passed the sanity check
            record = self.check(record, task)
            record['id'] = get_md5(task[1])
            self.col_data.insert('id', record)
            # push progress to the front end
            process.new_reading_data(task[0] + 1, len(self.articles), self.delay)
Ejemplo n.º 10
0
    def search(self):
        """
        Execute the search against Elasticsearch.

        Builds the query body from the shared template plus the
        preprocessed search data, applies optional ``_source``
        filtering and from/size pagination, then searches the
        configured index list (all indices when the list is empty).

        :return: the ``hits`` section of the response, or False on
                 failure (e.g. a non-existent index).
        """
        indices = []
        st = deepcopy(search_template)
        l111ll11l1_wcplus_ = self.l11l1111ll_wcplus_()
        st.update(l111ll11l1_wcplus_)
        # restrict returned fields when a _source filter is set
        if self.source != None:
            st['_source'] = self.source
        # pagination: copy from/size when they are well-formed
        try:
            st['from'] = self.l1lll1l1ll_wcplus_['from']
            st['size'] = self.l1lll1l1ll_wcplus_['size']
        except:
            logger.warning('from_size字段错误 %s' % str(self.l1lll1l1ll_wcplus_))

        # an empty index list means: search every index
        if not self.index_list:
            indices = '*'
        else:
            indices = self.index_list
        try:
            result = (l11l111ll1_wcplus_.search(index=indices, doc_type=self.doc_type, body=st))['hits']
            return result
        except Exception as e:
            print(e)
            logger.error('搜索错误 可能是有指定了不存在的搜索范围没有建立索引%s' % str(indices))
            return False
Ejemplo n.º 11
0
 def l1lll1ll1l1_wcplus_(cls):
     """
     Read the license certificate from ./license.ca.

     :return: tuple (mac, passport) parsed as ints from lines 71 and
              92 of the file, or (None, None) when the file is
              missing or malformed.
     """
     try:
         with open('./license.ca', 'r', encoding='utf-8') as f:
             data = f.readlines()
         # strip the trailing newline before converting to int
         mac = int(data[70][:-1])
         l11l11l1l_wcplus_ = int(data[91][:-1])
         return (mac, l11l11l1l_wcplus_)
     except Exception:
         # missing file, short file or non-numeric content;
         # narrowed from a bare except so Ctrl-C still propagates
         logger.warning('未能找到授权证书license.ca')
         return (None, None)
Ejemplo n.º 12
0
 def getMacUUid(cls):
     """
     Read the license certificate from ./license.ca.

     :return: tuple (mac, uuid) parsed as ints from lines 71 and 92
              of the file, or (None, None) when the file is missing
              or malformed.
     """
     try:
         with open('./license.ca', 'r', encoding='utf-8') as f:
             data = f.readlines()
         # strip the trailing newline before converting; the local
         # was renamed from `uuid` to avoid shadowing the stdlib module
         mac = int(data[70][:-1])
         uid = int(data[91][:-1])
         return (mac, uid)
     except Exception:
         # narrowed from a bare except so Ctrl-C still propagates
         logger.warning('未能找到授权证书license.ca')
         return (None, None)
Ejemplo n.º 13
0
 def read_password(cls):
     """
     Load the (mac, passport) pair from ./license.ca.

     :return: ints parsed from lines 71 and 92 of the certificate,
              or (None, None) on any failure.
     """
     try:
         with open('./license.ca', 'r', encoding='utf-8') as fh:
             content = fh.readlines()
         # drop each trailing newline, then parse as integers
         mac_value, passport_value = (int(content[idx][:-1]) for idx in (70, 91))
         return (mac_value, passport_value)
     except:
         logger.warning('未能找到授权证书license.ca')
         return (None, None)
Ejemplo n.º 14
0
 def run(self):
     """
     Delete every piece of stored data for this official account.

     Removes the article collection, crawler log, cached html and
     the search index, then notifies the front end.  A failure in
     any delete step is logged as a warning and does not abort the
     final notification.
     """
     try:
         self.delete_collection()
         self.delete_crawler_log()
         self.delete_html()
         self.delete_index()
     except Exception:
         # narrowed from a bare except so Ctrl-C still propagates
         from utils.base import logger
         logger.warning('删除数据遇到一个警告')
     from utils.front import notification
     notification(self.nickname, '删除完成 刷新页面公众号消失', 'success')
Ejemplo n.º 15
0
    def run(self):
        """
        Delete all stored data for this account, then notify the
        front end that deletion finished.
        """
        try:
            # the four helpers each drop one kind of stored data
            # (collection / crawler log / html / index, judging by
            # the named sibling implementation — confirm)
            self.l11llll1l_wcplus_()
            self.l11lllll1_wcplus_()
            self.l1l111111_wcplus_()
            self.l1l1lll11_wcplus_()
        except:
            from utils.base import logger
            logger.warning('删除数据遇到一个警告')

        # front-end notification helper
        from utils.front import l1l11111l_wcplus_
        l1l11111l_wcplus_(self.nickname, '删除完成 刷新页面公众号消失', 'success')
Ejemplo n.º 16
0
    def l1l1l1l11_wcplus_(self, process=None, mov=10):
        """
        Collect reading data for articles positioned at or below `mov`.

        Rotates through the WeChat request parameter sets and stores
        each article's reading data keyed by a hash of its url.

        :param process: front-end progress reporter
        :param mov: position threshold (10~17)
        """
        if 'getappmsgext' in self.l1l11ll1l_wcplus_[0]:
            # articles with no read_num yet whose position is <= mov
            l11l1ll11l_wcplus_ = self.l11ll111l_wcplus_.table.find({
                '$and': [{
                    'read_num': {
                        '$exists': False
                    }
                }, {
                    'mov': {
                        '$lte': int(mov)
                    }
                }]
            })
            cnt = 0
            # cache [index, url, comment_id] triples up front so the
            # db cursor is not held open for the whole crawl
            for a in l11l1ll11l_wcplus_:
                if 'mp.weixin.qq.com' in a['content_url']:
                    # comment_id may be missing when reading data is
                    # collected before the article body
                    if 'comment_id' not in a:
                        a['comment_id'] = 0
                    self.l11lll11l_wcplus_.append(
                        [cnt, a['content_url'], a['comment_id']])
                    cnt += 1

            for l11l1ll111_wcplus_ in self.l11lll11l_wcplus_:
                # throttle: at most one request per `delay` seconds
                while time.time() - self.l11ll1lll1_wcplus_ <= self.delay:
                    time.sleep(0.05)

                self.l11ll1lll1_wcplus_ = time.time()
                # round-robin over the available request parameter sets
                l1ll1l1l1_wcplus_ = l11lll1l1l_wcplus_(
                    l11l1ll111_wcplus_[1], l11l1ll111_wcplus_[2],
                    self.l1l11ll1l_wcplus_[l11l1ll111_wcplus_[0] %
                                           self.l11ll1l111_wcplus_]).run()
                # only keep data that passed the sanity check
                l1ll1l1l1_wcplus_ = self.check(l1ll1l1l1_wcplus_,
                                               l11l1ll111_wcplus_)
                l1ll1l1l1_wcplus_['id'] = l11llll11_wcplus_(
                    l11l1ll111_wcplus_[1])
                self.l11ll111l_wcplus_.insert('id', l1ll1l1l1_wcplus_)
                # push progress to the front end
                process.l11l1lll1_wcplus_(l11l1ll111_wcplus_[0] + 1,
                                          len(self.l11lll11l_wcplus_),
                                          self.delay)

        else:
            logger.warning('点击查看该公众号的任意一篇文章且出现阅读量')
Ejemplo n.º 17
0
 def l1ll111ll1_wcplus_(ip):
     """
     Decide whether a failed request from `ip` warrants a new IP.

     An IP is only rotated once its failure count reaches the number
     of workers; the failure counter is then reset.

     :param ip: the IP that just had a failed request
     :return: True when a new IP should be requested, else False
     """
     # read shared state, then write it straight back (presumably a
     # lock/refresh handshake — confirm against the store's API)
     ts = l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()
     l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_(ts)
     l1l1lll1ll_wcplus_ = ts['ips'][ip]['failed']
     l1ll11lll_wcplus_ = ts['worker_num']
     logger.warning('ip:%s 失败次数:%d 任务总数:%d' %
                    (ip, l1l1lll1ll_wcplus_, l1ll11lll_wcplus_))
     if l1l1lll1ll_wcplus_ < l1ll11lll_wcplus_:
         return False
     else:
         # reset the failure counter before requesting a new IP
         ts = l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()
         ts['ips'][ip]['failed'] = 0
         l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_(ts)
         return True
Ejemplo n.º 18
0
 def get_all_reading_data(self, process=None, mov=10):
     """
     Collect reading data for articles positioned at or below `mov`.

     Rotates through the WeChat request parameter sets in
     ``self.wx_req_data_list`` and stores each article's reading
     data under an id derived from its url.

     :param process: front-end progress reporter
     :param mov: position threshold (10~17)
     """
     if 'getappmsgext' not in self.wx_req_data_list[0]:
         logger.warning('点击查看该公众号的任意一篇文章且出现阅读量')
         return

     # articles with no reading data yet whose position is <= mov
     pending = self.col_data.table.find({"$and": [
         {"read_num": {"$exists": False}},
         {"mov": {"$lte": int(mov)}},
     ]})
     # crawling takes a long time, so cache [index, url, comment_id]
     # triples up front instead of holding the db cursor open
     idx = 0
     for doc in pending:
         if "mp.weixin.qq.com" not in doc['content_url']:
             continue
         # reading data may be crawled before the article body, in
         # which case there is no comment_id yet
         if 'comment_id' not in doc:
             doc['comment_id'] = 0
         self.articles.append([idx, doc['content_url'], doc['comment_id']])
         idx += 1

     # crawl one article at a time (parallel crawling was tried and
     # gets rate-limited)
     for task in self.articles:
         # throttle: at most one request per `delay` seconds
         while time.time() - self.pre_crawl_time <= self.delay:
             time.sleep(0.05)
         self.pre_crawl_time = time.time()
         record = Crawler(task[1], task[2],
                          self.wx_req_data_list[task[0] % self.wx_num]).run()
         # only keep data that passed the sanity check
         record = self.check(record, task)
         record['id'] = get_md5(task[1])
         self.col_data.insert('id', record)
         # push progress to the front end
         process.new_reading_data(task[0] + 1, len(self.articles), self.delay)
Ejemplo n.º 19
0
    def get_all_article_list(self, filter=None, process=None):
        """
        Fetch every page of the account's historical article list.

        Rotates through the WeChat request parameter sets in
        ``self.wx_req_data_list`` until the server reports no more
        pages, the filter stops the crawl, or an already-saved page
        is encountered.

        :param filter: filter spec, e.g. by time or by article count
        :param process: front-end progress reporter instance
        :return: None; pages are persisted via ``self.save``
        """
        offset = 0
        can_msg_continue = 1
        cnt = 0
        # 'load_more' present means the user triggered "load more"
        if 'load_more' in self.wx_req_data_list[0]:
            while can_msg_continue:
                # throttle: at most one request per `delay` seconds
                while time.time() - self.pre_crawl_time <= self.delay:
                    time.sleep(0.05)

                self.pre_crawl_time = time.time()
                # round-robin over the available request parameter sets
                list_data = Crawler(
                    offset, self.wx_req_data_list[cnt % self.wx_num]).run()
                # only use data that passed the sanity check
                list_data = self.check(list_data, offset, cnt)
                can_msg_continue = int(list_data['des']['can_msg_continue'])
                offset = int(list_data['des']['next_offset'])
                cnt += 1
                self.current_article_list = list_data['data']
                self.article_num += len(self.current_article_list)
                filter_res = self.filter_check(filter)
                self.all_article_num += len(self.current_article_list)
                # record crawl progress keyed by the account nickname
                col_crawler_log.insert(
                    'id', {
                        'id': self.nickname,
                        'num': self.all_article_num,
                        'nickname': self.nickname,
                        'time': datetime.now()
                    })
                process.new_article_list(self.all_article_num)
                # 'UPDATE' means this page was already stored: stop
                if self.save(self.current_article_list) == 'UPDATE':
                    break
                if not filter_res:
                    break
                time.sleep(self.delay)

        else:
            logger.warning('没有上滑加载更多历史文章')
Ejemplo n.º 20
0
    def act_request(self):
        """
        Execute the POST request for article reading data.

        Retries until a response is obtained; gives up after 3
        consecutive failures.

        :return: the response object, or None when all attempts failed
        """
        resp = None
        proxy_err_cnt = 0
        while not resp:
            if proxy_err_cnt >= 3:
                logger.warning('获取历史文章阅读数据发生错误%s 次数太多 放弃' % self.url)
                break
            # BUG FIX: the try block was unreachable (indented under
            # the break), so the loop never issued a request and spun
            # forever; it now executes on every iteration.
            try:
                resp = requests.post(url=self.req['url'],
                                     data=self.req['body'],
                                     headers=self.req['headers'],
                                     timeout=self.timeout,
                                     verify=True)
            except Exception as e:
                proxy_err_cnt += 1
                logger.warning('获取文章阅读数据发生错误%s %s' % (self.url, str(e)))

        return resp
Ejemplo n.º 21
0
    def l1ll1ll11l_wcplus_(self):
        """
        Execute the POST request; return the string "error" on failure.

        Retries with a 5 second pause between attempts; gives up after
        3 consecutive failures.

        :return: the response object, or the string 'error'
        """
        resp = None
        l11lllll11_wcplus_ = 0
        while not resp:
            if l11lllll11_wcplus_ >= 3:
                logger.warning('获取历史文章阅读数据发生错误%s 次数太多 放弃' % self.url)
                return 'error'
            # BUG FIX: the try block was unreachable (indented under
            # the return), so the loop never issued a request and spun
            # forever; it now executes on every iteration.
            try:
                resp = requests.post(url=self.req['url'],
                  data=self.req['body'],
                  headers=self.req['headers'],
                  timeout=self.timeout,
                  verify=True)
            except Exception as e:
                l11lllll11_wcplus_ += 1
                logger.warning('获取文章阅读数据发生错误 5秒钟之后再次尝试 %s %s' % (self.url, str(e)))
                time.sleep(5)

        return resp
Ejemplo n.º 22
0
    def l1ll1ll11l_wcplus_(self):
        """
        Execute the GET request for a history article list page.

        1. issue the request
        2. on exception, count the failure and retry
        3. give up after 3 consecutive failures

        :return: the response object, or None when all attempts failed
        """
        resp = None
        l11lllll11_wcplus_ = 0
        while not resp:
            if l11lllll11_wcplus_ >= 3:
                logger.warning('获取历史文章列表发生错误%s 次数太多 放弃' % self.offset)
                break
            # BUG FIX: the try block was unreachable (indented under
            # the break), so the loop never issued a request and spun
            # forever; it now executes on every iteration.
            try:
                resp = requests.get(url=self.req['url'],
                                    headers=self.req['headers'],
                                    timeout=self.timeout,
                                    verify=True)
            except Exception as e:
                l11lllll11_wcplus_ += 1
                logger.warning('获取历史文章列表发生错误%s %s' % (self.offset, str(e)))

        return resp
Ejemplo n.º 23
0
    def getHistoryArticleList(self):
        """
        Execute the GET request for a history article list page.

        1. issue the request
        2. on exception, count the failure and retry
        3. give up after 3 consecutive failures

        :return: the response object, or None when all attempts failed
        """
        resp = None
        request_times = 0
        while not resp:
            if request_times >= 3:
                logger.warning('获取历史文章列表发生错误%s 次数太多 放弃' % self.offset)
                break
            # BUG FIX: the try block was unreachable (indented under
            # the break), so the loop never issued a request and spun
            # forever; it now executes on every iteration.
            try:
                resp = requests.get(url=self.req['url'],
                                    headers=self.req['headers'],
                                    timeout=self.timeout,
                                    verify=True)
            except Exception as e:
                request_times += 1
                logger.warning('获取历史文章列表发生错误%s %s' % (self.offset, str(e)))

        return resp
Ejemplo n.º 24
0
 def act_request(self):
     """
     Execute the request; return the string "error" when it fails.

     Retries with a 5 second pause between attempts; gives up after
     3 consecutive failures.

     :return: the response object, or the string 'error'
     """
     resp = None
     proxy_err_cnt = 0
     while not resp:
         # too many consecutive failures: give up
         if proxy_err_cnt >= 3:
             logger.warning("获取历史文章阅读数据发生错误%s 次数太多 放弃" % (self.url))
             return 'error'
         try:
             resp = requests.post(url=self.req['url'],
                                  data=self.req['body'],
                                  headers=self.req['headers'],
                                  timeout=self.timeout,
                                  verify=True)
         except Exception as e:
             proxy_err_cnt += 1
             # wait 5 seconds before the next attempt
             logger.warning("获取文章阅读数据发生错误 5秒钟之后再次尝试 %s %s" %
                            (self.url, str(e)))
             time.sleep(5)
     return resp
Ejemplo n.º 25
0
    def act_request(self):
        """
        Execute the GET request for a history article list page.

        1. issue the request
        2. on exception, count the failure and retry
        3. give up after 3 consecutive failures

        :return: the response object, or None when all attempts failed
        """
        resp = None
        proxy_err_cnt = 0
        while not resp:
            if proxy_err_cnt >= 3:
                logger.warning('获取历史文章列表发生错误%s 次数太多 放弃' % self.offset)
                break
            # BUG FIX: the try block was unreachable (indented under
            # the break), so the loop never issued a request and spun
            # forever; it now executes on every iteration.
            try:
                resp = requests.get(url=self.req['url'],
                                    headers=self.req['headers'],
                                    timeout=self.timeout,
                                    verify=True)
            except Exception as e:
                proxy_err_cnt += 1
                logger.warning('获取历史文章列表发生错误%s %s' % (self.offset, str(e)))

        return resp