Example No. 1
 def run(self):
     # load the data URLs
     LoadParam.load_url()
     spider_thread_list = []
     # start the main spider first
     # spider_thread = Spider()
     # spider_thread_list.append(spider_thread)
     # spider_thread.start()
     # print('main spider_thread is {0}'.format(spider_thread.status))
     # time.sleep(10)
     for s in range(MAX_SPIDER_THREAD_NUM):
         time.sleep(5)
         spider_thread = Spider()
         spider_thread_list.append(spider_thread)
         spider_thread.start()
         logger.info('spider_thread{0} is {1}'.format(
             s, spider_thread.status))
     # Monitor spider status: a spider stops on failure (HTTP 403 or crawl finished); if it was blocked, close it and start a replacement
     while True:
         # iterate over a copy so removing a stopped spider does not skip the next one
         for Thread_spider in list(spider_thread_list):
             if Thread_spider.status == 'stop':
                 spider_thread_list.remove(Thread_spider)
                 logger.info('Thread_spider----{0} status is {1}'.format(
                     Thread_spider, Thread_spider.status))
                 # restart a replacement spider
                 spider_thread = Spider()
                 spider_thread_list.append(spider_thread)
                 spider_thread.start()
             else:
                 continue
         time.sleep(360)
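The Spider class itself is not included in these snippets; the manager loop above only relies on it being a threading.Thread subclass that exposes a status attribute and sets it to 'stop' when it finishes or gets blocked. A minimal sketch under that assumption (the crawl() body is a placeholder, not the project's real logic):

import threading
import time


class Spider(threading.Thread):
    # Hypothetical skeleton of the worker thread assumed by run() above.

    def __init__(self):
        super().__init__(daemon=True)
        self.status = 'init'      # polled by the manager loop

    def run(self):
        self.status = 'running'
        try:
            self.crawl()          # placeholder for the real fetch/parse loop
        finally:
            self.status = 'stop'  # tells the manager to start a replacement

    def crawl(self):
        time.sleep(1)             # stand-in for the crawling work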
Example No. 2
def show_schools():
    status = '000000'
    if len(school_data) == 0:
        # all_sch = db.distinct("user_schools")
        all_985 = University985.objects.all()
        all_sch = []
        for name in all_985:
            all_sch.append(name['univ_name'])
        obj = {}
        count = 0
        for b in range(len(all_sch)):
            num = db.find({'user_schools': {'$regex': all_sch[b]}}).count()
            count = count + num
            if all_sch[b] == '':
                obj.update({"未知": num})
            else:
                obj.update({all_sch[b]: num})
        all_count = db.find({}).count()

        # sort obj by value (count) and keep the top 20 schools
        objs_list = (sorted(CommUtils.dict2list(obj),
                            key=lambda x: x[1],
                            reverse=True))[0:20]
        y_data = []
        series_data = []
        for tup in objs_list:
            y_data.append(tup[0])
            series_data.append(tup[1])
        logger.info('Top 20 "985" universities: {0}'.format(y_data))
        logger.info('Top 20 "985" university user counts: {0}'.format(series_data))
        retn_data = {'status': status, 'ydata': y_data, 'data': series_data}
        school_data.update(retn_data)
    else:
        retn_data = school_data
    return jsonify(retn_data)
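CommUtils.dict2list is not shown here; since the result is sorted with key=lambda x: x[1], it presumably turns the counter dict into (name, count) tuples, so a plausible helper is simply:

class CommUtils:

    @staticmethod
    def dict2list(dic):
        # Assumed helper: {'school': count, ...} -> [('school', count), ...]
        return list(dic.items())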
Example No. 3
def delete_info():
    print(type(request.form))
    # < class 'werkzeug.datastructures.ImmutableMultiDict'>
    _id = request.form["_id"]
    if CommUtils.check_params(_id):
        db.remove({'_id': ObjectId(_id)})
        logger.info("删除 {0} 成功!".format(_id))
        return jsonify([])
    else:
        logger.info('_id is None; failed to delete the user detail record!')
        return jsonify([])
Example No. 4
 def run(self):
     while True:
         logger.info('FetchParseThread is running !')
         try:
             if (validate_proxy_pool.qsize() < MIN_VALIDATE_POOL_SIZE
                     and unchecked_ip_pool.qsize() < MIN_UNCHECKED_SIZE):
                 self.add_unchecked_to_queue()
                 time.sleep(180)
             else:
                 time.sleep(30)
         except Exception as err:
             logger.info('FetchParseThread err is :{0}'.format(err))
             self.status = 'error'
Example No. 5
def show_following():
    # top 15 users by following count
    status = '000000'
    # mongoengine does not support the raw sort() here; use order_by() instead
    user_data = UserInfoData.objects.order_by('-user_following').skip(0).limit(
        15)
    series_data = []
    y_data = []
    for data in user_data:
        y_data.append(data['user_name'])
        series_data.append(data['user_following'])
    logger.info("关注数前15的用户:{0}和关注数:{1}".format(y_data, series_data))
    retn_data = {'status': status, 'ydata': y_data, 'data': series_data}
    return jsonify(retn_data)
Example No. 6
def show_answers():
    # top 15 users by answer count
    status = '000000'
    # mongoengine does not support the raw sort() here; use order_by() instead
    user_data = UserInfoData.objects.order_by('-user_answer_count').skip(
        0).limit(15)
    series_data = []
    x_data = []
    for data in user_data:
        x_data.append(data['user_name'])
        series_data.append(data['user_answer_count'])
    logger.info("问答问题数前15的用户:{0}和回答问题数:{1}".format(x_data, series_data))
    retn_data = {'status': status, 'data': series_data, 'xdata': x_data}
    return jsonify(retn_data)
Example No. 7
def begin_spider():
    print("start spider")
    status = '000000'
    if pid_queue.empty():
        try:
            logger.info('Parent process {0}'.format(os.getpid()))
            logger.info('spider_process is started now :')
            process = Process(target=spider_process,
                              args=(
                                  'spider_process',
                                  pid_queue,
                              ))
            process.start()
            # join() blocks until the child process exits; normally used for inter-process synchronisation
            # process.join()
            time.sleep(3)
            if not pid_queue.empty():
                spider_status = 1
            else:
                spider_status = 0
        except Exception:
            spider_status = 0
            status = '999999'
            logger.debug('Failed to start the spider process!')
    else:
        spider_status = 1
        logger.info('Process already exists; Zhihu info is already being crawled!')
    retn_data = {'status': status, 'spiderStatus': spider_status}
    logger.info('Return status: {0}, spider start status: {1}'.format(status, spider_status))
    return jsonify(retn_data)
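pid_queue is shared between begin_spider, close_spider (Example No. 15) and the child process; it is presumably created once at module level as a multiprocessing queue, roughly:

from multiprocessing import Queue

# Assumed module-level, process-shared queue: the child puts its PID here
# so the Flask views can detect whether the spider is running and kill it.
pid_queue = Queue()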
Example No. 8
def query_detail():
    _id = request.args.get("_id")
    data_list = []
    if CommUtils.check_params(_id):
        user_info = db.find({'_id': ObjectId(_id)})
        for data in user_info:
            data['_id'] = str(data['_id'])
            data['create_time'] = data['create_time'].strftime(
                "%Y-%m-%d %H:%M:%S")
            data_list.append(data)
        logger.info("用户:{0} 的所有明细信息:{1}".format(_id, data_list))
        return jsonify(data_list)
    else:
        logger.info('_id is None; failed to fetch the user detail info!')
        return jsonify(data_list)
Example No. 9
def show_spider_status():
    try:
        # URLs already crawled
        all_url = db_f.find({}).count()
        # URLs already parsed
        had_parsed = db_f.find({'queueUrl': 'none'}).count()
        # URLs parsed successfully
        parsed_success = db.find({}).count()
        # URLs that failed to parse
        parsed_failure = had_parsed - parsed_success
        # success rate
        rate = str(
            Decimal(str((parsed_success / had_parsed * 100))).quantize(
                Decimal('0.00')))
        # total number of users
        all_users = parsed_success
        # Spider status: 1 = crawling, 0 = stopped (check whether the spider process exists)
        if not pid_queue.empty():
            spider_status = 1
        else:
            spider_status = 0
        logger.info(
            "all_url:{0}, had_parsed:{1}, parsed_success:{2}, parsed_failure:{3}, rate:{4}%, all_users:{5}"
            .format(all_url, had_parsed, parsed_success, parsed_failure, rate,
                    all_users))
        data_list = [{
            'all_url': all_url,
            'had_parsed': had_parsed,
            'parsed_success': parsed_success,
            'parsed_failure': parsed_failure,
            'rate': rate,
            'all_users': all_users,
            'spider_status': spider_status
        }]
        retn_dict = {
            'status': '000000',
            'total': 1,
            'pageSize': 10,
            'curPage': 1,
            'spiderStatus': spider_status,
            'list': data_list
        }
        return jsonify(retn_dict)
    except Exception:
        logger.info("Failed to fetch spider status info!")
        error_dict = {'status': '999999'}
        return jsonify(error_dict)
Example No. 10
 def parse_page_html(self, page_html):
     try:
         soup_page = BeautifulSoup(page_html, 'html5lib')
         # Parse the pagination page and extract the user-info JSON embedded in <div id="data">
         data = soup_page.find('div', attrs={
             'id': 'data'
         }).attrs['data-state']
         if data is not None:
             # convert the extracted string into a JSON object, data_json
             data_json = json.loads(str(data))
             # all user data on the current page
             all_user_data = data_json['entities']['users']
             self.add_following_url(all_user_data)
         else:
             logger.info('parse_page_html data is none!')
     except Exception as err:
         logger.debug("parse_page_html error ! {0}".format(err))
Example No. 11
def show_education():
    status = '000000'
    # all_sch = db.distinct("user_schools")
    if len(education_data) == 0:
        all_211 = University211.objects.all()
        all_sch = []
        for name in all_211:
            all_sch.append(name['univ_name'])
        # not filled in
        all_sch.append('')
        sch211_count = 0
        sch_none = 0
        for b in range(len(all_sch)):
            num = db.find({'user_schools': {'$regex': all_sch[b]}}).count()
            if all_sch[b] == '':
                sch_none = sch_none + num
            else:
                sch211_count = sch211_count + num
        all_count = db.find({}).count()
        # other universities that were filled in (not 211)
        sch_others = all_count - sch211_count - sch_none
        retn_keys = ['211高校', '其他学校', '未知学校']
        retn_objs = [{
            'name': '211高校',
            'value': sch211_count
        }, {
            'name': '其他学校',
            'value': sch_others
        }, {
            'name': '未知学校',
            'value': sch_none
        }]
        retn_data = {
            'status': status,
            'keysData': retn_keys,
            'data': retn_objs
        }
        education_data.update(retn_data)
        logger.info('Education level response: {0}'.format(retn_data))
    else:
        retn_data = education_data

    return jsonify(retn_data)
Example No. 12
 def get_proxy_ip_html(self, current_page):
     self.session.headers = header_ip
     # while current_page <= MAX_PROXY_PAGE:
     url = base_url + str(current_page)
     try:
         res = self.session.get(url, timeout=MAX_TIME_OUT)
         # parse the page to obtain proxy IPs
         if res is not None and res.status_code == 200:
             return res.text
             # all_ip_info = self.parse_proxy_ip(res.text)
             # # validate IP availability and deduplicate
             # for i in range(len(all_ip_info)):
             #     if self.validate_IP.is_validate_ip(all_ip_info[i]):
             #         # check whether this IP already exists in the in-memory valid-proxy list
             #         # TODO: or handle via a queue (store into the global valid proxy pool)
             #         validate_ip_list.append(all_ip_info[i])
             #     else:
         elif res.status_code == 403:
             logger.info('403 proxy ip web forbidden')
             return None
         else:
             logger.info('proxy ip web return code is :{0}'.format(
                 res.status_code))
             return None
     except Exception as err:
         logger.info('get_proxy_ip_html err: {0}!'.format(err))
         return None
Example No. 13
 def get_parse_url():
     if not queue_follow_url.empty():
         using_url = queue_follow_url.get()
     else:
         # load already-crawled URLs and previously queued URLs from the database
         all_query_set = FollowingUrl.objects.all()
         for followingUrl in all_query_set:
             try:
                 # load every crawled URL (re-check for duplicates and delete duplicates from the database)
                 if followingUrl.urlToken not in had_url:
                     had_url.add(followingUrl.urlToken)
                     if followingUrl.queueUrl != 'none':
                         # re-queue URLs that were still queued when the program last stopped
                         queue_follow_url.put(followingUrl.queueUrl)
                     else:
                         continue
                 else:
                     logger.info("删除重复的 followingUrl:{0} 的 _id:{1}".format(
                         followingUrl._data, followingUrl.id))
                     followingUrl.delete({'_id': str(followingUrl.id)})
                     continue
             except Exception as err:
                 logger.debug('get_parse_url err :{0}'.format(err))
                 logger.info('error happened in reload urls from mongodb!')
                 continue
         # loading finished; try the queue again
         if not queue_follow_url.empty() and len(had_url) > 0:
             using_url = queue_follow_url.get()
         elif queue_follow_url.empty() and len(had_url) == 0:
             # spider entry point
             using_url = follow_url_into
         else:
             logger.info("爬取结束了!")
             return
     return using_url
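get_parse_url (and load_url in Example No. 23) reads urlToken and queueUrl off FollowingUrl documents; the model is not part of these snippets, but a minimal mongoengine definition consistent with that usage would be:

from mongoengine import Document, StringField


class FollowingUrl(Document):
    # Assumed schema: urlToken stores the crawled profile URL; queueUrl holds
    # the same URL while it is still queued and is set to the literal string
    # 'none' once the page has been parsed.
    urlToken = StringField(required=True)
    queueUrl = StringField(default='none')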
Example No. 14
 def get_proxies_ip(self):
     try:
         # the proxy pool must hold more than MIN_PROXY_IP_POOL proxies; otherwise wait for the proxy thread to fetch more
         if (not validate_proxy_pool.empty()
                 and validate_proxy_pool.qsize() > MIN_PROXY_IP_POOL):
             self.proxy_ip_info = validate_proxy_pool.get()
             self.proxies_ip = {
                 self.proxy_ip_info.get('protocol'):
                 'http://' + self.proxy_ip_info.get('ip') + ':' +
                 self.proxy_ip_info.get('port')
             }
             time.sleep(1)
             # return the proxy IP to the queue so other spiders can reuse it
             validate_proxy_pool.put(self.proxy_ip_info)
         else:
             logger.info("等待代理ip线程获取代理或其他爬虫归还可用代理!")
             # if no proxy is available, access directly with the local IP
             time.sleep(MAX_WAIT_PROXY_TIME)
             # validate_proxy_pool.put(123)
             self.proxies_ip = 1
             logger.info('Using the local IP or a purchased stable IP! proxies_ip = {0}'.format(
                 self.proxies_ip))
             # If proxies are plentiful and stable, recurse here and wait for one indefinitely (the most reliable option)
             # self.get_proxies_ip()
             # TODO: skip the proxy (or use a purchased stable proxy IP)
         return self.proxies_ip
     except Exception as err:
         logger.info('get_proxies_ip err is :{0}'.format(err))
         return None
Example No. 15
def close_spider():
    try:
        # get the process-shared PID from the queue
        if not pid_queue.empty():
            value = pid_queue.get()
            logger.info('kill pid {0}'.format(value))
            os.kill(value, signal.SIGTERM)
        else:
            logger.info('The spider process does not exist')
            # a = os.popen('taskkill.exe/pid:' + str(value)+'-t -f')
    except OSError:
        logger.info('No such process!')
    logger.info('spider_process is closed!')

    return 'none'
Example No. 16
    def is_validate_ip(self, ip_info):
        # def is_validate_ip(self):
        # if ip_info is None:
        #     return False
        # extract the IP value
        # ip_str = str(ip_info.get('http')).replace('http://', '')
        # real_ip = ip_str[0:ip_str.index(":")]
        # print('real_ip', real_ip)

        # assemble the proxy IP
        ip = ip_info.get("ip")
        port = ip_info.get('port')
        protocol = ip_info.get('protocol')
        proxy_ip = {protocol: protocol + '://' + ip + ':' + port}
        logger.info('check proxy_ip :{0}'.format(proxy_ip))
        # proxy_ip = {'http': 'http//202.121.96.33:8086'}

        session = requests.session()
        session.headers = headerr
        session.proxies = proxy_ip
        retry_time = 0
        while retry_time < max_try_time:
            try:
                response = session.get(v_url, timeout=max_time_out)
                # print(response.status_code)
                # print(response.text)
                if response.status_code == 200:
                    match_list = re.findall(r'[0-9]+(?:\.[0-9]+){3}',
                                            response.text)
                    print(match_list)
                    if len(match_list) > 0:
                        current_ip = match_list.pop()
                        logger.info('current_ip:{0}'.format(current_ip))
                        if current_ip is not None and current_ip == ip:
                            logger.info(
                                'this is validate ip------------> {0}'.format(
                                    current_ip))
                            return True
                        else:
                            retry_time += 1
                            continue
                    else:
                        retry_time += 1
                        continue
                else:
                    # a non-200 response: count it as a failed attempt so the loop can end
                    retry_time += 1
            except Exception as err:
                logger.info('is_validate_ip err is :{0}'.format(err))
                return False
        return False
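A quick usage sketch for is_validate_ip, assuming the ip_info dict shape produced by parse_proxy_ip (Example No. 24) and that the method lives on a ValidateIP-style helper class (the class name and the address below are placeholders):

checker = ValidateIP()  # hypothetical owner class of is_validate_ip()
ip_info = {'ip': '203.0.113.10', 'port': '8080', 'protocol': 'http'}
if checker.is_validate_ip(ip_info):
    # keep only proxies that echo back their own IP through the check URL
    validate_proxy_pool.put(ip_info)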
Example No. 17
 def add_following_url(follow_user__data):
     try:
         base_url = 'https://www.zhihu.com/people/'
         # iterate over the dict's keys and values as below (could also use .keys() and .values() directly)
         for key, value in follow_user__data.items():
             new_url = base_url + (str(value['urlToken']))
             if new_url in had_url:
                 continue
             elif str(value['urlToken']) == 'None':
                 continue
             else:
                 # record in had_url
                 had_url.add(new_url)
                 # put into the queue: after had_url deduplication every queued URL is unique and not yet parsed
                 logger.info('new following url is :{0}'.format(new_url))
                 queue_follow_url.put(new_url)
                 # persist the crawled URL and the queued URL to MongoDB (already deduplicated)
                 db_url = FollowingUrl()
                 db_url.urlToken = new_url
                 db_url.queueUrl = new_url
                 db_url.save()
     except Exception as err:
         logger.debug('add_following_url has err :{0}'.format(err))
Example No. 18
def spider_process(name, queue):
    logger.info('begin spider_process! ')
    logger.info('Run child process :{0} {1}..'.format(name, os.getpid()))
    pid = os.getpid()
    logger.info('subprocess pid is {0}'.format(pid))
    queue.put(pid)
    # run the spider system
    start_spider = ZhiHuUser()
    start_spider.start()
Example No. 19
def show_voted_more():
    # top 10 users by upvote count
    status = '000000'
    # mongoengine does not support the raw sort() here; use order_by() instead
    user_data = UserInfoData.objects.order_by('-user_vote_up_count').skip(
        0).limit(10)
    x_data = []
    voted_data = []
    thanked_data = []
    favorite_data = []
    for data in user_data:
        x_data.append(data['user_name'])
        voted_data.append(data['user_vote_up_count'])
        thanked_data.append(data['user_thanked_count'])
        favorite_data.append(data['user_favorite_count'])
    series_data = {
        'votedData': voted_data,
        'thankedData': thanked_data,
        'favoriteData': favorite_data
    }
    logger.info("赞同数数前10的用户:{0}和对应的赞同数:{1},感谢数:{2},和收藏数:{3}".format(
        x_data, voted_data, thanked_data, favorite_data))
    retn_data = {'status': status, 'xdata': x_data, 'data': series_data}
    return jsonify(retn_data)
Example No. 20
def show_business():
    print('business_data :', business_data)
    if len(business_data) == 0:
        all_business = db.distinct("user_business")
        all_count = db.find({}).count()
        business_objs = {}
        # result.fromkeys(business)
        for b in range(len(all_business)):
            num = db.find({'user_business': all_business[b]}).count()
            if all_business[b] == '':
                business_objs.update({"未知": num})
            else:
                business_objs.update({all_business[b]: num})
        # sort business_objs by value and keep the top 20 industries
        objs_list = (sorted(CommUtils.dict2list(business_objs),
                            key=lambda x: x[1],
                            reverse=True))[0:20]
        retn_keys = []
        retn_objs = []
        count = 0
        for tup in objs_list:
            retn_keys.append(tup[0])
            retn_objs.append({'name': tup[0], 'value': tup[1]})
            count = count + tup[1]
        # all remaining industries grouped as "other"
        retn_keys.append('其他')
        other_count = all_count - count
        retn_objs.append({'name': '其他', 'value': other_count})
        logger.info("排行前20 的行业{0}".format(retn_keys))
        logger.info("排行前20 的行业 占比{0}".format(retn_objs))

        return_data = {'keysData': retn_keys, 'data': retn_objs}
        business_data.update(return_data)
    else:
        return_data = business_data
    return jsonify(return_data)
Example No. 21
 def parse_html_info(self, source):
     try:
         soup = BeautifulSoup(source, "html5lib")
         # The profile info lives in <div id="data" data-state="...">; grab it and parse the user-info JSON
         data_div = soup.find('div', attrs={'id': 'data'})
         if data_div is not None:
             data = data_div.attrs['data-state']
             # convert the extracted string into a JSON object, data_json
             data_json = json.loads(str(data))
             # all users on the profile page
             all_users_data = data_json['entities']['users']
             if len(all_users_data) > 0 and all_users_data is not None:
                 # extract the user name from the URL
                 url_user_name = self.url.split("/")[-1]
                 user_data = all_users_data[url_user_name]
                 if len(user_data) > 0 and user_data is not None:
                     # parse the user info and save it to the database
                     self.fetch_user_info.parsed_user_info(
                         user_data, self.url)
                     # mark this URL as parsed by setting its queue value to 'none'
                     self.change_queue_url2none(self.url)
                     # find all pagination buttons on the page
                     pages_html = soup.find_all(
                         'button',
                         attrs={
                             'class':
                             'Button PaginationButton Button--plain'
                         })
                     # get the total page count
                     if len(pages_html) > 0:
                         total_page = int(pages_html[-1].contents[0])
                     else:
                         total_page = 1
                     # Crawl every page's following URLs with async IO (when max_page is 1, parse the first page's following URLs directly to save a page=1 request)
                     if max_page > 1:
                         self.getFollowingUrl.get_other_page_following(
                             self.url, total_page)
                     else:
                         self.getFollowingUrl.add_following_url(
                             all_users_data)
                 else:
                     logger.info('user_data is none!')
             else:
                 logger.info('all_users_data is none!')
         else:
             logger.info(
                 'data_div is none!(NoneType object has no attribute attrs)'
             )
             self.change_queue_url2none(self.url)
     except Exception as err:
         logger.debug('parse_html_info err is : {0}'.format(err))
         self.change_queue_url2none(self.url)
Example No. 22
 def run(self):
     while True:
         logger.info('CheckValidateProxyPool is running !')
         try:
             self.get_old_ip()
             if not old_ip_pool.empty():
                 logger.info(
                     "begin scan validate ip ,old_ip_pool size is :{0}".
                     format(old_ip_pool.qsize()))
                 for i in range(old_ip_pool.qsize()):
                     old_ip = old_ip_pool.get()
                     if self.validate_ip.is_validate_ip(old_ip):
                          # the IP is still valid, so put it back into the usable queue
                         validate_proxy_pool.put(old_ip)
                     else:
                         continue
             else:
                 logger.info('CheckValidateProxyPool is over!')
             # re-check every 240 seconds
             time.sleep(240)
         except Exception as err:
             logger.info('CheckValidateProxyPool is err :{0}'.format(err))
             self.status = 'error'
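The pools these threads pass proxies through (unchecked_ip_pool, validate_proxy_pool, old_ip_pool) are presumably module-level thread-safe queues imported everywhere; a minimal assumed setup:

import queue

# Assumed shared pools used by the proxy threads above.
unchecked_ip_pool = queue.Queue()    # freshly parsed, not yet validated proxies
validate_proxy_pool = queue.Queue()  # proxies that passed is_validate_ip()
old_ip_pool = queue.Queue()          # proxies set aside for periodic re-checking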
Example No. 23
 def load_url():
     logger.info("load url begin!")
     # load already-crawled URLs and previously queued URLs from the database
     all_query_set = FollowingUrl.objects.all()
     for followingUrl in all_query_set:
         try:
             # load every crawled URL (re-check for duplicates and delete duplicates from the database)
             if followingUrl.urlToken not in had_url:
                 had_url.add(followingUrl.urlToken)
                 if followingUrl.queueUrl != 'none':
                     # re-queue URLs that were still queued when the program last stopped
                     queue_follow_url.put(followingUrl.queueUrl)
                 else:
                     continue
             else:
                 logger.info("删除重复的 followingUrl:{0} 的 _id:{1}".format(
                     followingUrl._data, followingUrl.id))
                 followingUrl.delete({'_id': str(followingUrl.id)})
                 continue
         except Exception as err:
             logger.debug('load url err :{0}'.format(err))
             logger.info('error happened in load urls from mongodb!')
             continue
     logger.info("load url end!")
Example No. 24
 def parse_proxy_ip(web_data):
     if web_data is not None:
         try:
             soup = BeautifulSoup(web_data, 'html5lib')
             ips_tr = soup.find_all('tr')
             ip_info_list = []
             for i in range(1, len(ips_tr)):
                 ip_info = ips_tr[i]
                 tds = ip_info.find_all('td')
                 ip = str(tds[1].text)
                 port = str(tds[2].text)
                 protocol = str(tds[5].text).lower()
                 # wrap the proxy IP info
                 ip_info = {'ip': ip, 'port': port, 'protocol': protocol}
                 ip_info_list.append(ip_info)
             logger.info('xicidaili get http ip proxy list is:{0}'.format(
                 ip_info_list))
             return ip_info_list
         except Exception as err:
             logger.info('parse_proxy_ip err is :{0}'.format(err))
             return None
     else:
         logger.info('parse_proxy_ip fail web_data is None!')
         return None
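FetchParseThread.add_unchecked_to_queue (referenced in Example No. 4 but not shown) presumably chains get_proxy_ip_html and parse_proxy_ip and feeds the result into the unchecked pool; a sketch under that assumption, with current_page as a hypothetical counter attribute:

 def add_unchecked_to_queue(self):
     # Assumed wiring: fetch one listing page, parse it, and queue every
     # candidate proxy for later validation. current_page is not a name
     # taken from the snippets above.
     page_html = self.get_proxy_ip_html(self.current_page)
     ip_info_list = parse_proxy_ip(page_html)  # parse_proxy_ip as defined above
     if ip_info_list:
         for ip_info in ip_info_list:
             unchecked_ip_pool.put(ip_info)
     self.current_page += 1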
Example No. 25
 def run(self):
     try:
         self.add_validate_to_queue()
     except Exception as err:
         logger.info('ValidateIPThread err is:{0}'.format(err))
         self.status = 'error'
Example No. 26
    def parsed_user_info(user_data, user_url):
        #  user_info_categories object
        categories = mongo.UserInfoData()
        try:
            categories.user_url = user_url + "/following"
            logger.info('user_url:{0}'.format(categories.user_url))
            # user name
            if 'name' in user_data.keys():
                categories.user_name = user_data['name'].strip()
                logger.info("user_name :{0}".format(categories.user_name))
            else:
                categories.user_name = ''
                logger.info("user_name is none ")
            # user avatar URL
            if 'avatarUrl' in user_data.keys():
                categories.user_avatar_url = user_data['avatarUrl'].strip()
                logger.info('user_avatar_url:{0}'.format(
                    categories.user_avatar_url))
            else:
                categories.user_avatar_url = ''
                logger.info('user_avatar_url is none')

            if 'headline' in user_data.keys():
                categories.user_head_line = user_data['headline'].strip()
                logger.info('user_head_line:{0}'.format(
                    categories.user_head_line))
            else:
                categories.user_head_line = ''
                logger.info('user_head_line is none')
            # TODO: handle links inside the headline
            # -------------------- gender (-1 unknown, 0 female, 1 male) begin --------------------
            if 'gender' in user_data.keys():
                gender = user_data['gender']
                if gender == 1:
                    categories.user_sex = 'man'
                elif gender == 0:
                    categories.user_sex = 'female'
                else:
                    categories.user_sex = 'none'
                logger.info("user_sex:{0}".format(categories.user_sex))
            else:
                categories.user_sex = 'none'
                logger.info('user_sex is none')
            # -------------------- gender (-1 unknown, 0 female, 1 male) end --------------------

            # -------------------- business/industry begin --------------------
            # check whether the 'business' key exists (a missing key would raise a KeyError)
            if 'business' in user_data.keys():
                data_business = user_data['business']
                if len(data_business) > 0 and data_business is not None:
                    categories.user_business = user_data['business']['name']
                    logger.info("user_business:{0}".format(
                        categories.user_business))
                else:
                    categories.user_business = ''
                    logger.info("user_business is none")
            else:
                categories.user_business = ''
                logger.info("business key is none")
            # -------------------- business/industry end --------------------

            # -------------------- user locations begin --------------------
            if 'locations' in user_data.keys():
                data_locations = user_data['locations']
                if len(data_locations) > 0 and data_locations is not None:
                    for i in range(len(data_locations)):
                        categories.user_locations.append(
                            data_locations[i]['name'].strip())
                else:
                    logger.info('user_locations is none')
                logger.info('user_locations:{0}'.format(
                    categories.user_locations))
            else:
                logger.info('user_locations is none')

            # -------------------- user locations end --------------------

            # -------------------- schools and majors begin --------------------
            if 'educations' in user_data.keys():
                data_educations = user_data['educations']
                if len(data_educations) > 0 and data_educations is not None:
                    for i in range(len(data_educations)):
                        if 'school' in data_educations[i].keys():
                            categories.user_schools.append(
                                data_educations[i]['school']['name'].strip())
                        else:
                            categories.user_schools.append('none')
                        if 'major' in data_educations[i].keys():
                            categories.user_majors.append(
                                data_educations[i]['major']['name'].strip())
                        else:
                            categories.user_majors.append('none')
                else:
                    logger.info('data_educations is none')
                logger.info("user_majors:{0}".format(categories.user_majors))
                logger.info("user_schools{0}".format(categories.user_schools))
            else:
                logger.info("data_educations is none")
            # -------------------- schools and majors end --------------------
            # -------------------- companies and job titles (employments) begin --------------------
            if 'employments' in user_data.keys():
                data_employments = user_data['employments']

                if len(data_employments) > 0 and data_employments is not None:
                    for i in range(len(data_employments)):
                        if 'company' in data_employments[i].keys():
                            categories.user_companies.append(
                                data_employments[i]['company']['name'].strip())
                        else:
                            categories.user_companies.append('none')
                        if 'job' in data_employments[i].keys():
                            categories.user_jobs.append(
                                data_employments[i]['job']['name'].strip())
                        else:
                            categories.user_jobs.append('none')
                else:
                    logger.info('data_employments is none')
                logger.info("user_companies:{0}".format(
                    categories.user_companies))
                logger.info("user_jobs:{0}".format(categories.user_jobs))
            else:
                logger.info('data_employments is none')
            # -------------------- companies and job titles (employments) end --------------------
            if 'description' in user_data.keys():
                categories.user_description = user_data['description'].strip()
                logger.info('user_description:{0}'.format(
                    categories.user_description))
            else:
                categories.user_description = ''
                logger.info('user_description is none')
            # TODO: handle HTML tags inside the string
            # upvote count
            if 'voteupCount' in user_data.keys():
                categories.user_vote_up_count = user_data['voteupCount']
                logger.info('user_vote_up_count:{0}'.format(
                    categories.user_vote_up_count))
            else:
                categories.user_vote_up_count = 0
                logger.info('user_vote_up_count is none')
            # favorited count
            if 'favoritedCount' in user_data.keys():
                categories.user_favorite_count = user_data['favoritedCount']
                logger.info('user_favorite_count:{0}'.format(
                    categories.user_favorite_count))
            else:
                categories.user_favorite_count = 0
                logger.info('user_favorite_count is none')
            # thanked count
            if 'thankedCount' in user_data.keys():
                categories.user_thanked_count = user_data['thankedCount']
                logger.info('user_thanked_count:{0}'.format(
                    categories.user_thanked_count))
            else:
                categories.user_thanked_count = 0
                logger.info('user_thanked_count is none')
            # public edits participated in
            if 'logsCount' in user_data.keys():
                categories.user_logs_count = user_data['logsCount']
                logger.info('user_logs_count:{0}'.format(
                    categories.user_logs_count))
            else:
                categories.user_logs_count = 0
                logger.info('user_logs_count is none')
            # following count
            if 'followingCount' in user_data.keys():
                categories.user_following = user_data['followingCount']
                logger.info('user_following:{0}'.format(
                    categories.user_following))
            else:
                categories.user_following = 0
                logger.info('user_following is none')
            # follower count
            if 'followerCount' in user_data.keys():
                categories.user_followers = user_data['followerCount']
                logger.info('user_followers:{0}'.format(
                    categories.user_followers))
            else:
                categories.user_followers = 0
                logger.info('user_followers is none')
            # participated/sponsored Live count
            if 'participatedLiveCount' in user_data.keys():
                categories.user_participated_live_count = user_data[
                    'participatedLiveCount']
                logger.info('user_participated_live_count:{0}'.format(
                    categories.user_participated_live_count))
            else:
                categories.user_participated_live_count = 0
                logger.info('user_participated_live_count is none')
            # followed topic count
            if 'followingTopicCount' in user_data.keys():
                categories.user_following_topic_count = user_data[
                    'followingTopicCount']
                logger.info('user_following_topic_count:{0}'.format(
                    categories.user_following_topic_count))
            else:
                categories.user_following_topic_count = 0
                logger.info('user_following_topic_count is none')
            # followed column count
            if 'followingColumnsCount' in user_data.keys():
                categories.user_following_columns_count = user_data[
                    'followingColumnsCount']
                logger.info('user_following_columns_count:{0}'.format(
                    categories.user_following_columns_count))
            else:
                categories.user_following_columns_count = 0
                logger.info('user_following_columns_count is none')
            # followed question count
            if 'followingQuestionCount' in user_data.keys():
                categories.user_following_question_count = user_data[
                    'followingQuestionCount']
                logger.info('user_following_question_count:{0}'.format(
                    categories.user_following_question_count))
            else:
                categories.user_following_question_count = 0
                logger.info('user_following_question_count is none')
            # followed favourites (favlists) count
            if 'followingFavlistsCount' in user_data.keys():
                categories.user_following_favlists_count = user_data[
                    'followingFavlistsCount']
                logger.info('user_following_favlists_count:{0}'.format(
                    categories.user_following_favlists_count))
            else:
                categories.user_following_favlists_count = 0
                logger.info('user_following_favlists_count is none')
            # answer count
            if 'answerCount' in user_data.keys():
                categories.user_answer_count = user_data['answerCount']
                logger.info('user_answer_count:{0}'.format(
                    categories.user_answer_count))
            else:
                categories.user_answer_count = 0
                logger.info('user_answer_count is none')
            # article/share count
            if 'articlesCount' in user_data.keys():
                categories.user_share_count = user_data['articlesCount']
                logger.info('user_share_count:{0}'.format(
                    categories.user_share_count))
            else:
                categories.user_share_count = 0
                logger.info('user_share_count is none')
            # question count
            if 'questionCount' in user_data.keys():
                categories.user_question_count = user_data['questionCount']
                logger.info('user_question_count:{0}'.format(
                    categories.user_question_count))
            else:
                categories.user_question_count = 0
                logger.info('user_question_count is none')
            # collection count
            if 'favoriteCount' in user_data.keys():
                categories.user_collections = user_data['favoriteCount']
                logger.info('user_collections:{0}'.format(
                    categories.user_collections))
            else:
                categories.user_collections = 0
                logger.info('user_collections is none')
            # badge type (e.g. best answerer) and badge topics (topics the badge covers)
            if 'badge' in user_data.keys():
                badge_data = user_data['badge']
                if len(badge_data) > 0:
                    for i in range(len(badge_data)):
                        categories.badge_description.append(
                            badge_data[i]['description'])
                        if len(badge_data[i]['topics']) > 0:
                            for m in range(len(badge_data[i]['topics'])):
                                categories.badge_topics.append(
                                    badge_data[i]['topics'][m]['name'])
                        else:
                            categories.badge_topics = []
                    logger.info(
                        'badge_description:{0}, badge_topics:{1}'.format(
                            categories.badge_description,
                            categories.badge_topics))
                else:
                    categories.badge_description = []
                    categories.badge_topics = []
                    logger.info('badge description and topics is none')
            else:
                logger.info('badge is none')
            logger.info(
                '--------------------------------------------------------------------'
            )

        except Exception as err:
            logger.info('Exception is :{0}'.format(err))
            logger.info("parsed user info err or ser_data is none!")
        try:
            time.sleep(1)
            categories.save()
        except Exception as err:
            logger.info('save to db err is:{0}'.format(err))
            logger.info('mongodb data save fail!!! url is {0}'.format(
                categories.user_url))
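The mongo.UserInfoData document populated above is not included either; a partial mongoengine sketch consistent with the fields assigned in parsed_user_info (types inferred, so treat the whole definition as an assumption) could look like:

import datetime

from mongoengine import (DateTimeField, Document, IntField, ListField,
                         StringField)


class UserInfoData(Document):
    # Partial, inferred schema -- only a representative subset of the fields
    # assigned in parsed_user_info() is listed here.
    user_url = StringField()
    user_name = StringField()
    user_sex = StringField()
    user_business = StringField()
    user_locations = ListField(StringField())
    user_schools = ListField(StringField())
    user_majors = ListField(StringField())
    user_vote_up_count = IntField(default=0)
    user_answer_count = IntField(default=0)
    user_following = IntField(default=0)
    user_followers = IntField(default=0)
    badge_description = ListField(StringField())
    badge_topics = ListField(StringField())
    create_time = DateTimeField(default=datetime.datetime.now)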
Example No. 27
 def exception_handler(request, exception):
     logger.info('got exception request: {0}, exception {1}'.format(
         request, exception))
Example No. 28
# coding=utf-8
# __author__ = "gLinlf"
import logging.config
import queue
import redis
import configparser
from src.logs.Logger import logger
# from src.logs.SpiderLogger import logs

logger.info('1333' + '123')

t1 = [1, 2]
t2 = []
# t2 = t1.copy()
t2 = (t1).copy()
print(t2)
print(t1)
print(len({}))

section = 'redis_py'
CONFIG = configparser.ConfigParser()
CONFIG.read('redis-py.ini', encoding='utf8')
redis_host = CONFIG.get(section, "REDIS_HOST")
print(redis_host)
redis_port = CONFIG.get(section, "REDIS_PORT")
print(redis_port)
# redis_db = CONFIG.get(section, "REDIS_DB")
# use a connection pool
redis_pool = redis.ConnectionPool(host=redis_host, port=int(redis_port), db=0)
rd = redis.Redis(connection_pool=redis_pool)
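The redis-py.ini file read above is not part of the snippet; for CONFIG.get(section, ...) to work it needs a [redis_py] section along these lines (values are placeholders):

[redis_py]
REDIS_HOST = 127.0.0.1
REDIS_PORT = 6379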
Example No. 29
def get_user_info():
    user_name = request.args.get('user_name')
    badge_topic = request.args.get('badge_topic')
    logger.info('user_name is {0} ,badge_topic is {1}'.format(
        user_name, badge_topic))
    page = int(request.args.get('state'))
    limit_size = int(request.args.get('limit'))
    if CommUtils.check_params(page, limit_size):
        start = (page - 1) * limit_size
        if CommUtils.is_not_empty(
                user_name) and not CommUtils.is_not_empty(badge_topic):
            total_size = db.find({'user_name': user_name}).count()
            user_data = db.find({
                'user_name': user_name
            }).skip(start).limit(limit_size)
        elif not CommUtils.is_not_empty(user_name) and CommUtils.is_not_empty(
                badge_topic):
            total_size = db.find({
                'badge_topics': {
                    '$regex': badge_topic
                }
            }).count()
            user_data = db.find({
                'badge_topics': {
                    '$regex': badge_topic
                }
            }).skip(start).limit(limit_size)
        elif CommUtils.is_not_empty(user_name) and CommUtils.is_not_empty(
                badge_topic):
            total_size = db.find({
                'badge_topics': {
                    '$regex': badge_topic
                },
                'user_name': user_name
            }).count()
            user_data = db.find({
                'badge_topics': {
                    '$regex': badge_topic
                },
                'user_name': user_name
            }).skip(start).limit(limit_size)
        else:
            total_size = db.find({}).count()
            user_data = db.find().skip(start).limit(limit_size)
        print(type(user_data))
        data_list = []
        for data in user_data:
            # _id is an ObjectId and is not JSON-serializable, so convert it to a string
            data['_id'] = str(data['_id'])
            data_list.append(data)
        logger.info('Page {0}, {1} records per page, user info: {2}'.format(
            page, limit_size, data_list))
        succ_dict = {
            'status': '000000',
            'total': total_size,
            'pageSize': limit_size,
            'curPage': page,
            'list': data_list
        }
        return jsonify(succ_dict)
    else:
        error_dict = {'status': '999999'}
        logger.info("参数传递有误!")
        return jsonify(error_dict)
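check_params and is_not_empty come from the same CommUtils module sketched after Example No. 2; judging by how they are called here (and in Examples No. 3 and No. 8), minimal equivalents would be:

class CommUtils:

    @staticmethod
    def check_params(*params):
        # Assumed behaviour: every parameter must be present (not None or '').
        return all(p is not None and p != '' for p in params)

    @staticmethod
    def is_not_empty(value):
        # Assumed behaviour: True for a non-empty, non-None value.
        return value is not None and value != ''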
Example No. 30
    def run(self):
        # thread pool management
        validate_thread_list = []
        # for i in range(PROXY_VALIDATE_THREAD_NUM):
        # start the thread that fetches and parses proxy IPs
        fetch_parse_thread = FetchParseThread()
        fetch_parse_thread.start()
        logger.info('fetch and parse proxy ip thread is:{0}'.format(
            fetch_parse_thread.status))

        # start the valid-proxy-pool scanning thread (re-checks the pool periodically)
        scan_proxy_thread = CheckValidateProxyPool()
        scan_proxy_thread.start()
        logger.info('scan validate proxy ip thread is:{0}'.format(
            scan_proxy_thread.status))

        # start the proxy validation threads (VALIDATE_THREAD_NUM threads to speed up validation)
        for i in range(VALIDATE_THREAD_NUM):
            validate_thread = ValidateIPThread()
            validate_thread_list.append(validate_thread)
            validate_thread.start()
            logger.info('validate_thread---- {0} status is {1}'.format(
                i, validate_thread.status))
        # Thread monitoring: if a thread's status becomes 'error', remove it and start a replacement
        logger.info('unchecked_ip_pool size is :{0}'.format(
            unchecked_ip_pool.qsize()))
        logger.info('validate_proxy_pool size is :{0}'.format(
            validate_proxy_pool.qsize()))
        while True:
            if fetch_parse_thread.status == 'error':
                fetch_parse_thread = FetchParseThread()
                fetch_parse_thread.start()
                logger.info('The proxy-fetching thread failed and has been restarted!')

            if scan_proxy_thread.status == 'error':
                scan_proxy_thread = CheckValidateProxyPool()
                scan_proxy_thread.start()
                logger.info('The valid-proxy scanning thread failed and has been restarted!')

            # iterate over a copy so removal does not skip the next thread
            for thread in list(validate_thread_list):
                if thread.status == 'error':
                    validate_thread_list.remove(thread)
                    validate_thread = ValidateIPThread()
                    validate_thread_list.append(validate_thread)
                    validate_thread.start()
                    logger.info('A proxy-validation thread failed and has been restarted!')
            time.sleep(300)