def run(self):
    # Load the url data
    LoadParam.load_url()
    spider_thread_list = []
    for s in range(MAX_SPIDER_THREAD_NUM):
        time.sleep(5)
        spider_thread = Spider()
        spider_thread_list.append(spider_thread)
        spider_thread.start()
        logger.info('spider_thread{0} is {1}'.format(s, spider_thread.status))
    # Monitor spider status: a thread reports 'stop' when it failed (403 /
    # got blocked) or finished crawling; drop it and start a replacement.
    while True:
        # iterate over a copy so removing items does not skip threads
        for thread_spider in spider_thread_list[:]:
            if thread_spider.status == 'stop':
                spider_thread_list.remove(thread_spider)
                logger.info('thread_spider----{0} status is {1}'.format(
                    thread_spider, thread_spider.status))
                # Restart a fresh spider thread
                spider_thread = Spider()
                spider_thread_list.append(spider_thread)
                spider_thread.start()
        time.sleep(360)
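# A minimal sketch (an assumption, not the project's actual Spider class) of
# the thread contract the monitor loop above depends on: each Spider exposes
# a `status` attribute that flips to 'stop' once it is blocked (403) or has
# finished, which tells the manager to discard and replace it.
import threading

class SpiderSketch(threading.Thread):
    def __init__(self):
        super().__init__()
        self.status = 'running'

    def run(self):
        try:
            pass  # fetch and parse urls here
        finally:
            # blocked, failed or finished: signal the monitor thread
            self.status = 'stop'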
def show_schools():
    status = '000000'
    if len(school_data) == 0:
        # all_sch = db.distinct("user_schools")
        all_985 = University985.objects.all()
        all_sch = [univ['univ_name'] for univ in all_985]
        obj = {}
        for sch in all_sch:
            num = db.find({'user_schools': {'$regex': sch}}).count()
            if sch == '':
                obj.update({'未知': num})
            else:
                obj.update({sch: num})
        # Sort obj by value and keep the top 20 universities
        objs_list = sorted(CommUtils.dict2list(obj),
                           key=lambda x: x[1], reverse=True)[0:20]
        y_data = []
        series_data = []
        for tup in objs_list:
            y_data.append(tup[0])
            series_data.append(tup[1])
        logger.info('Top 20 "985" universities: {0}'.format(y_data))
        logger.info('Top 20 "985" university user counts: {0}'.format(series_data))
        retn_data = {'status': status, 'ydata': y_data, 'data': series_data}
        school_data.update(retn_data)
    else:
        retn_data = school_data
    return jsonify(retn_data)
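# CommUtils.dict2list is assumed here to turn a {key: count} dict into a list
# of (key, count) tuples so it can be sorted by count; a minimal stand-in:
def dict2list_sketch(d):
    return list(d.items())

# sorted(dict2list_sketch({'A': 3, 'B': 7}), key=lambda x: x[1], reverse=True)[:20]
# -> [('B', 7), ('A', 3)]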
def delete_info():
    # request.form is a werkzeug.datastructures.ImmutableMultiDict
    _id = request.form['_id']
    if CommUtils.check_params(_id):
        db.remove({'_id': ObjectId(_id)})
        logger.info('Deleted {0} successfully!'.format(_id))
        return jsonify([])
    else:
        logger.info('_id is None, failed to delete the user detail info!')
        return jsonify([])
def run(self):
    while True:
        logger.info('FetchParseThread is running!')
        try:
            if (validate_proxy_pool.qsize() < MIN_VALIDATE_POOL_SIZE
                    and unchecked_ip_pool.qsize() < MIN_UNCHECKED_SIZE):
                self.add_unchecked_to_queue()
                time.sleep(180)
            else:
                time.sleep(30)
        except Exception as err:
            logger.info('FetchParseThread err is: {0}'.format(err))
            self.status = 'error'
def show_following():
    # Top 15 users by following count
    status = '000000'
    # mongoengine does not support the native sort(); use order_by() instead
    user_data = UserInfoData.objects.order_by('-user_following').skip(0).limit(15)
    series_data = []
    y_data = []
    for data in user_data:
        y_data.append(data['user_name'])
        series_data.append(data['user_following'])
    logger.info('Top 15 users by following count: {0}, counts: {1}'.format(
        y_data, series_data))
    retn_data = {'status': status, 'ydata': y_data, 'data': series_data}
    return jsonify(retn_data)
def show_answers():
    # Top 15 users by answer count
    status = '000000'
    # mongoengine does not support the native sort(); use order_by() instead
    user_data = UserInfoData.objects.order_by('-user_answer_count').skip(0).limit(15)
    series_data = []
    x_data = []
    for data in user_data:
        x_data.append(data['user_name'])
        series_data.append(data['user_answer_count'])
    logger.info('Top 15 users by answer count: {0}, counts: {1}'.format(
        x_data, series_data))
    retn_data = {'status': status, 'data': series_data, 'xdata': x_data}
    return jsonify(retn_data)
def begin_spider():
    logger.info('start spider')
    status = '000000'
    if pid_queue.empty():
        try:
            logger.info('Parent process {0}'.format(os.getpid()))
            logger.info('spider_process is started now:')
            process = Process(target=spider_process,
                              args=('spider_process', pid_queue))
            process.start()
            # join() would block until the child process exits (used for
            # inter-process synchronization), so it is deliberately skipped
            # process.join()
            time.sleep(3)
            if not pid_queue.empty():
                spider_status = 1
            else:
                spider_status = 0
        except Exception:
            spider_status = 0
            status = '999999'
            logger.debug('Failed to start the spider process!')
    else:
        spider_status = 1
        logger.info('Process already exists! Crawling Zhihu user info now!')
    retn_data = {'status': status, 'spiderStatus': spider_status}
    logger.info('Return status: {0}, spider status: {1}'.format(
        status, spider_status))
    return jsonify(retn_data)
def query_detail():
    _id = request.args.get('_id')
    data_list = []
    if CommUtils.check_params(_id):
        user_info = db.find({'_id': ObjectId(_id)})
        for data in user_info:
            data['_id'] = str(data['_id'])
            data['create_time'] = data['create_time'].strftime('%Y-%m-%d %H:%M:%S')
            data_list.append(data)
        logger.info('All detail info of user {0}: {1}'.format(_id, data_list))
        return jsonify(data_list)
    else:
        logger.info('_id is None, failed to fetch the user detail info!')
        return jsonify(data_list)
def show_spider_status():
    try:
        # URLs already crawled
        all_url = db_f.find({}).count()
        # URLs already parsed
        had_parsed = db_f.find({'queueUrl': 'none'}).count()
        # URLs parsed successfully
        parsed_success = db.find({}).count()
        # URLs that failed to parse
        parsed_failure = had_parsed - parsed_success
        # Success rate (guard against division by zero before any url is parsed)
        if had_parsed > 0:
            rate = str(Decimal(str(parsed_success / had_parsed * 100)).quantize(
                Decimal('0.00')))
        else:
            rate = '0.00'
        # Total number of users
        all_users = parsed_success
        # Spider status: 1 = crawling, 0 = stopped (check whether the spider
        # process exists)
        if not pid_queue.empty():
            spider_status = 1
        else:
            spider_status = 0
        logger.info(
            'all_url:{0}, had_parsed:{1}, parsed_success:{2}, parsed_failure:{3}, rate:{4}%, all_users:{5}'
            .format(all_url, had_parsed, parsed_success, parsed_failure, rate,
                    all_users))
        data_list = [{
            'all_url': all_url,
            'had_parsed': had_parsed,
            'parsed_success': parsed_success,
            'parsed_failure': parsed_failure,
            'rate': rate,
            'all_users': all_users,
            'spider_status': spider_status
        }]
        retn_dict = {
            'status': '000000',
            'total': 1,
            'pageSize': 10,
            'curPage': 1,
            'spiderStatus': spider_status,
            'list': data_list
        }
        return jsonify(retn_dict)
    except Exception:
        logger.info('Failed to fetch the spider status info!')
        error_dict = {'status': '999999'}
        return jsonify(error_dict)
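# How the success rate above is rounded: quantize() with a '0.00' template
# keeps exactly two decimal places (banker's rounding by default), e.g.:
from decimal import Decimal

assert str(Decimal(str(2 / 3 * 100)).quantize(Decimal('0.00'))) == '66.67'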
def parse_page_html(self, page_html):
    try:
        soup_page = BeautifulSoup(page_html, 'html5lib')
        # Parse the paginated page: the personal info lives as a JSON string
        # in the data-state attribute of the div with id="data"
        data = soup_page.find('div', attrs={'id': 'data'}).attrs['data-state']
        if data is not None:
            # Convert the embedded state into a JSON object
            data_json = json.loads(str(data))
            # All user records on the current page
            all_user_data = data_json['entities']['users']
            self.add_following_url(all_user_data)
        else:
            logger.info('parse_page_html data is none!')
    except Exception as err:
        logger.debug('parse_page_html error! {0}'.format(err))
def show_education():
    status = '000000'
    # all_sch = db.distinct("user_schools")
    if len(education_data) == 0:
        all_211 = University211.objects.all()
        all_sch = [univ['univ_name'] for univ in all_211]
        # '' stands for users who left the school field blank
        all_sch.append('')
        sch211_count = 0
        sch_none = 0
        for sch in all_sch:
            num = db.find({'user_schools': {'$regex': sch}}).count()
            if sch == '':
                sch_none = sch_none + num
            else:
                sch211_count = sch211_count + num
        all_count = db.find({}).count()
        # Everything else is a school outside the "211" list
        sch_others = all_count - sch211_count - sch_none
        retn_keys = ['211高校', '其他学校', '未知学校']
        retn_objs = [{
            'name': '211高校',
            'value': sch211_count
        }, {
            'name': '其他学校',
            'value': sch_others
        }, {
            'name': '未知学校',
            'value': sch_none
        }]
        retn_data = {'status': status, 'keysData': retn_keys, 'data': retn_objs}
        education_data.update(retn_data)
        logger.info('Education distribution response: {0}'.format(retn_data))
    else:
        retn_data = education_data
    return jsonify(retn_data)
def get_proxy_ip_html(self, current_page):
    self.session.headers = header_ip
    url = base_url + str(current_page)
    try:
        res = self.session.get(url, timeout=MAX_TIME_OUT)
        # Extracting the proxy ips from the html is handled by parse_proxy_ip()
        if res is not None and res.status_code == 200:
            return res.text
        elif res.status_code == 403:
            logger.info('403 proxy ip web forbidden')
            return None
        else:
            logger.info('proxy ip web return code is: {0}'.format(res.status_code))
            return None
    except Exception as err:
        # res may not exist here if the request itself raised
        logger.info('get_proxy_ip_html err: {0}!'.format(err))
        return None
def get_parse_url():
    if not queue_follow_url.empty():
        using_url = queue_follow_url.get()
    else:
        # Reload crawled urls and previously queued urls from the database
        all_query_set = FollowingUrl.objects.all()
        for followingUrl in all_query_set:
            try:
                # Reload every crawled url (re-deduplicate and delete
                # duplicates from the database)
                if followingUrl.urlToken not in had_url:
                    had_url.add(followingUrl.urlToken)
                    if followingUrl.queueUrl != 'none':
                        # Re-queue urls that were still queued when the
                        # program last stopped
                        queue_follow_url.put(followingUrl.queueUrl)
                else:
                    logger.info('Deleting duplicate followingUrl: {0}, _id: {1}'.format(
                        followingUrl._data, followingUrl.id))
                    followingUrl.delete()
            except Exception as err:
                logger.debug('get_parse_url err: {0}'.format(err))
                logger.info('error happened in reloading urls from mongodb!')
                continue
        # Reloading done; take a value from the queue again
        if not queue_follow_url.empty() and len(had_url) > 0:
            using_url = queue_follow_url.get()
        elif queue_follow_url.empty() and len(had_url) == 0:
            # Spider entry point
            using_url = follow_url_into
        else:
            logger.info('The crawl is finished!')
            return
    return using_url
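# The reload logic above assumes a mongoengine document roughly like this
# sketch (field names taken from the code, everything else assumed):
# urlToken stores every crawled url (the dedup key), while queueUrl keeps the
# url while it waits in the in-memory queue and becomes 'none' once the
# profile has been parsed.
from mongoengine import Document, StringField

class FollowingUrlSketch(Document):
    urlToken = StringField(required=True)
    queueUrl = StringField(default='none')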
def get_proxies_ip(self):
    try:
        # Only take a proxy when the pool holds more than the minimum;
        # otherwise wait for the proxy threads to refill it
        if (not validate_proxy_pool.empty()
                and validate_proxy_pool.qsize() > MIN_PROXY_IP_POOL):
            self.proxy_ip_info = validate_proxy_pool.get()
            self.proxies_ip = {
                self.proxy_ip_info.get('protocol'):
                'http://' + self.proxy_ip_info.get('ip') + ':' +
                self.proxy_ip_info.get('port')
            }
            time.sleep(1)
            # Return the proxy ip to the pool so other spiders can reuse it
            validate_proxy_pool.put(self.proxy_ip_info)
        else:
            logger.info('Waiting for the proxy thread to fetch proxies or '
                        'for other spiders to return a usable one!')
            # No proxy available: fall back to the local ip
            time.sleep(MAX_WAIT_PROXY_TIME)
            self.proxies_ip = 1
            logger.info('Using the local ip or a purchased stable ip! '
                        'proxies_ip = {0}'.format(self.proxies_ip))
            # With a large, stable proxy pool, a recursive call that blocks
            # until a proxy shows up would be the most robust option:
            # self.get_proxies_ip()
        # TODO: skip the proxy entirely (or use a purchased stable proxy ip)
        return self.proxies_ip
    except Exception as err:
        logger.info('get_proxies_ip err is: {0}'.format(err))
        return None
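# The proxies dict built above follows the requests convention of mapping a
# scheme to a proxy url; a minimal illustration (the address is made up):
import requests

proxies = {'http': 'http://1.2.3.4:8080'}  # hypothetical proxy
# requests.get('http://example.com', proxies=proxies, timeout=5)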
def close_spider():
    try:
        # Fetch the pid shared through the inter-process queue
        if not pid_queue.empty():
            value = pid_queue.get()
            logger.info('kill pid {0}'.format(value))
            os.kill(value, signal.SIGTERM)
        else:
            logger.info('The process does not exist')
        # a = os.popen('taskkill.exe /pid:' + str(value) + ' -t -f')
    except OSError:
        logger.info('No such process!!!')
    logger.info('spider_process is closed!')
    return 'none'
def is_validate_ip(self, ip_info):
    # Build the proxies dict from the ip record
    ip = ip_info.get('ip')
    port = ip_info.get('port')
    protocol = ip_info.get('protocol')
    proxy_ip = {protocol: protocol + '://' + ip + ':' + port}
    logger.info('check proxy_ip: {0}'.format(proxy_ip))
    session = requests.session()
    session.headers = headerr
    session.proxies = proxy_ip
    retry_time = 0
    while retry_time < max_try_time:
        try:
            response = session.get(v_url, timeout=max_time_out)
            if response.status_code == 200:
                # v_url echoes the caller's ip; if it matches the proxy's ip,
                # the proxy really carried the request
                match_list = re.findall(r'[0-9]+(?:\.[0-9]+){3}', response.text)
                if len(match_list) > 0:
                    current_ip = match_list.pop()
                    logger.info('current_ip: {0}'.format(current_ip))
                    if current_ip is not None and current_ip == ip:
                        logger.info('this is a validate ip ------------> {0}'.format(
                            current_ip))
                        return True
                    else:
                        retry_time += 1
                        continue
                else:
                    retry_time += 1
                    continue
            else:
                # count non-200 responses too, otherwise the loop never ends
                retry_time += 1
                continue
        except Exception as err:
            logger.info('is_validate_ip err is: {0}'.format(err))
            return False
    return False
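# Expected input shape for is_validate_ip as consumed above (values made up;
# the owning class name is assumed):
sample_ip_info = {'ip': '1.2.3.4', 'port': '8080', 'protocol': 'http'}
# ValidateIP().is_validate_ip(sample_ip_info)  # True only if v_url echoes 1.2.3.4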
def add_following_url(follow_user__data):
    try:
        base_url = 'https://www.zhihu.com/people/'
        # A dict's keys and values can be walked together via .items()
        # (or separately via .keys() and .values())
        for key, value in follow_user__data.items():
            new_url = base_url + str(value['urlToken'])
            if new_url in had_url:
                continue
            elif str(value['urlToken']) == 'None':
                continue
            else:
                # Record in had_url for deduplication
                had_url.add(new_url)
                # Enqueue: after dedup every queued url is unique and its
                # user info has not been parsed yet
                logger.info('new following url is: {0}'.format(new_url))
                queue_follow_url.put(new_url)
                # Persist both the crawled url and the queued url to mongodb
                # (already deduplicated)
                db_url = FollowingUrl()
                db_url.urlToken = new_url
                db_url.queueUrl = new_url
                db_url.save()
    except Exception as err:
        logger.debug('add_following_url has err: {0}'.format(err))
def spider_process(name, queue):
    logger.info('begin spider_process!')
    logger.info('Run child process: {0} {1}..'.format(name, os.getpid()))
    pid = os.getpid()
    logger.info('subprocess pid is {0}'.format(pid))
    queue.put(pid)
    # Run the spider system
    start_spider = ZhiHuUser()
    start_spider.start()
def show_voted_more():
    # Top 10 users by upvote count
    status = '000000'
    # mongoengine does not support the native sort(); use order_by() instead
    user_data = UserInfoData.objects.order_by('-user_vote_up_count').skip(0).limit(10)
    x_data = []
    voted_data = []
    thanked_data = []
    favorite_data = []
    for data in user_data:
        x_data.append(data['user_name'])
        voted_data.append(data['user_vote_up_count'])
        thanked_data.append(data['user_thanked_count'])
        favorite_data.append(data['user_favorite_count'])
    series_data = {
        'votedData': voted_data,
        'thankedData': thanked_data,
        'favoriteData': favorite_data
    }
    logger.info('Top 10 users by upvotes: {0}; upvotes: {1}, thanks: {2}, favorites: {3}'.format(
        x_data, voted_data, thanked_data, favorite_data))
    retn_data = {'status': status, 'xdata': x_data, 'data': series_data}
    return jsonify(retn_data)
def show_business():
    logger.info('business_data: {0}'.format(business_data))
    if len(business_data) == 0:
        all_business = db.distinct('user_business')
        all_count = db.find({}).count()
        business_objs = {}
        for business in all_business:
            num = db.find({'user_business': business}).count()
            if business == '':
                business_objs.update({'未知': num})
            else:
                business_objs.update({business: num})
        # Sort business_objs by value and keep the top 20 industries
        objs_list = sorted(CommUtils.dict2list(business_objs),
                           key=lambda x: x[1], reverse=True)[0:20]
        retn_keys = []
        retn_objs = []
        count = 0
        for tup in objs_list:
            retn_keys.append(tup[0])
            retn_objs.append({'name': tup[0], 'value': tup[1]})
            count = count + tup[1]
        # Everything outside the top 20 is grouped as "other"
        retn_keys.append('其他')
        other_count = all_count - count
        retn_objs.append({'name': '其他', 'value': other_count})
        logger.info('Top 20 industries: {0}'.format(retn_keys))
        logger.info('Top 20 industry counts: {0}'.format(retn_objs))
        return_data = {'keysData': retn_keys, 'data': retn_objs}
        business_data.update(return_data)
    else:
        return_data = business_data
    return jsonify(return_data)
def parse_html_info(self, source):
    try:
        soup = BeautifulSoup(source, 'html5lib')
        # The profile page embeds all personal info as a JSON string in the
        # data-state attribute of the div with id="data"
        data_div = soup.find('div', attrs={'id': 'data'})
        if data_div is not None:
            data = data_div.attrs['data-state']
            # Convert the embedded state into a JSON object
            data_json = json.loads(str(data))
            # All users embedded on the page
            all_users_data = data_json['entities']['users']
            if all_users_data is not None and len(all_users_data) > 0:
                # The user's token is the last segment of the profile url
                url_user_name = self.url.split('/')[-1]
                user_data = all_users_data[url_user_name]
                if user_data is not None and len(user_data) > 0:
                    # Parse the user info and persist it
                    self.fetch_user_info.parsed_user_info(user_data, self.url)
                    # Mark the url's queue entry as parsed ('none')
                    self.change_queue_url2none(self.url)
                    # Collect the pagination buttons
                    pages_html = soup.find_all(
                        'button',
                        attrs={'class': 'Button PaginationButton Button--plain'})
                    # Derive the total page count
                    if len(pages_html) > 0:
                        total_page = int(pages_html[-1].contents[0])
                    else:
                        total_page = 1
                    # Crawl every page's following urls with async io; if the
                    # global page cap is 1, crawl the first page's following
                    # urls directly to save the page=1 request
                    if max_page > 1:
                        self.getFollowingUrl.get_other_page_following(
                            self.url, total_page)
                    else:
                        self.getFollowingUrl.add_following_url(all_users_data)
                else:
                    logger.info('user_data is none!')
            else:
                logger.info('all_users_data is none!')
        else:
            logger.info('data_div is none! (NoneType object has no attribute attrs)')
            self.change_queue_url2none(self.url)
    except Exception as err:
        logger.debug('parse_html_info err is: {0}'.format(err))
        self.change_queue_url2none(self.url)
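# Sketch of the page fragment parse_html_info expects: the server-rendered
# profile page of that era embedded the whole state as JSON in a data-state
# attribute (the example below is simplified and made up):
sample_html = ('<div id="data" data-state=\'{"entities": {"users": '
               '{"some-token": {"name": "example", "urlToken": "some-token"}}}}\'>'
               '</div>')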
def run(self):
    while True:
        logger.info('CheckValidateProxyPool is running!')
        try:
            self.get_old_ip()
            if not old_ip_pool.empty():
                logger.info('begin scan validate ip, old_ip_pool size is: {0}'.format(
                    old_ip_pool.qsize()))
                for i in range(old_ip_pool.qsize()):
                    old_ip = old_ip_pool.get()
                    if self.validate_ip.is_validate_ip(old_ip):
                        # The ip is still usable, so put it back into the
                        # validated pool
                        validate_proxy_pool.put(old_ip)
            else:
                logger.info('CheckValidateProxyPool is over!')
            # Re-check every four minutes
            time.sleep(240)
        except Exception as err:
            logger.info('CheckValidateProxyPool is err: {0}'.format(err))
            self.status = 'error'
def load_url():
    logger.info('load url begin!')
    # Reload crawled urls and previously queued urls from the database
    all_query_set = FollowingUrl.objects.all()
    for followingUrl in all_query_set:
        try:
            # Reload every crawled url (re-deduplicate and delete duplicates
            # from the database)
            if followingUrl.urlToken not in had_url:
                had_url.add(followingUrl.urlToken)
                if followingUrl.queueUrl != 'none':
                    # Re-queue urls that were still queued when the program
                    # last stopped
                    queue_follow_url.put(followingUrl.queueUrl)
            else:
                logger.info('Deleting duplicate followingUrl: {0}, _id: {1}'.format(
                    followingUrl._data, followingUrl.id))
                followingUrl.delete()
        except Exception as err:
            logger.debug('load url err: {0}'.format(err))
            logger.info('error happened in loading urls from mongodb!')
            continue
    logger.info('load url end!')
def parse_proxy_ip(web_data):
    if web_data is not None:
        try:
            soup = BeautifulSoup(web_data, 'html5lib')
            ips_tr = soup.find_all('tr')
            ip_info_list = []
            # Skip row 0, which is the table header
            for i in range(1, len(ips_tr)):
                tds = ips_tr[i].find_all('td')
                ip = str(tds[1].text)
                port = str(tds[2].text)
                protocol = str(tds[5].text).lower()
                # Wrap the proxy ip as a record
                ip_info = {'ip': ip, 'port': port, 'protocol': protocol}
                ip_info_list.append(ip_info)
            logger.info('xicidaili get http ip proxy list is: {0}'.format(
                ip_info_list))
            return ip_info_list
        except Exception as err:
            logger.info('parse_proxy_ip err is: {0}'.format(err))
            return None
    else:
        logger.info('parse_proxy_ip fail, web_data is None!')
        return None
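# The parser above assumes the xicidaili listing layout: one <tr> per proxy
# with the ip in the 2nd <td>, the port in the 3rd and the protocol in the
# 6th. A made-up row it would accept:
sample_row = ('<tr><td></td><td>1.2.3.4</td><td>8080</td>'
              '<td></td><td></td><td>HTTP</td></tr>')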
def run(self):
    try:
        self.add_validate_to_queue()
    except Exception as err:
        logger.info('ValidateIPThread err is: {0}'.format(err))
        self.status = 'error'
def parsed_user_info(user_data, user_url):
    # UserInfoData document that collects every parsed field
    categories = mongo.UserInfoData()
    try:
        categories.user_url = user_url + '/following'
        logger.info('user_url: {0}'.format(categories.user_url))
        # User name
        if 'name' in user_data.keys():
            categories.user_name = user_data['name'].strip()
            logger.info('user_name: {0}'.format(categories.user_name))
        else:
            categories.user_name = ''
            logger.info('user_name is none')
        # Avatar url
        if 'avatarUrl' in user_data.keys():
            categories.user_avatar_url = user_data['avatarUrl'].strip()
            logger.info('user_avatar_url: {0}'.format(categories.user_avatar_url))
        else:
            categories.user_avatar_url = ''
            logger.info('user_avatar_url is none')
        # Headline
        if 'headline' in user_data.keys():
            categories.user_head_line = user_data['headline'].strip()
            logger.info('user_head_line: {0}'.format(categories.user_head_line))
        else:
            categories.user_head_line = ''
            logger.info('user_head_line is none')
        # TODO handle links inside the headline
        # ---------------- gender: -1 unknown, 0 female, 1 male begin ----------------
        if 'gender' in user_data.keys():
            gender = user_data['gender']
            if gender == 1:
                categories.user_sex = 'man'
            elif gender == 0:
                categories.user_sex = 'female'
            else:
                categories.user_sex = 'none'
            logger.info('user_sex: {0}'.format(categories.user_sex))
        else:
            # the original mistakenly assigned user_head_line here
            categories.user_sex = 'none'
            logger.info('user_sex is none')
        # ---------------- gender end ----------------
        # ---------------- business begin ----------------
        # Check that the business key exists (a missing key would raise)
        if 'business' in user_data.keys():
            data_business = user_data['business']
            if data_business is not None and len(data_business) > 0:
                categories.user_business = user_data['business']['name']
                logger.info('user_business: {0}'.format(categories.user_business))
            else:
                categories.user_business = ''
                logger.info('user_business is none')
        else:
            categories.user_business = ''
            logger.info('business key is none')
        # ---------------- business end ----------------
        # ---------------- locations begin ----------------
        if 'locations' in user_data.keys():
            data_locations = user_data['locations']
            if data_locations is not None and len(data_locations) > 0:
                for i in range(len(data_locations)):
                    categories.user_locations.append(
                        data_locations[i]['name'].strip())
            else:
                logger.info('user_locations is none')
            logger.info('user_locations: {0}'.format(categories.user_locations))
        else:
            logger.info('user_locations is none')
        # ---------------- locations end ----------------
        # ---------------- schools and majors begin ----------------
        if 'educations' in user_data.keys():
            data_educations = user_data['educations']
            if data_educations is not None and len(data_educations) > 0:
                for i in range(len(data_educations)):
                    if 'school' in data_educations[i].keys():
                        categories.user_schools.append(
                            data_educations[i]['school']['name'].strip())
                    else:
                        categories.user_schools.append('none')
                    if 'major' in data_educations[i].keys():
                        categories.user_majors.append(
                            data_educations[i]['major']['name'].strip())
                    else:
                        categories.user_majors.append('none')
            else:
                logger.info('data_educations is none')
            logger.info('user_majors: {0}'.format(categories.user_majors))
            logger.info('user_schools: {0}'.format(categories.user_schools))
        else:
            logger.info('data_educations is none')
        # ---------------- schools and majors end ----------------
        # ---------------- companies and jobs begin ----------------
        if 'employments' in user_data.keys():
            data_employments = user_data['employments']
            if data_employments is not None and len(data_employments) > 0:
                for i in range(len(data_employments)):
                    if 'company' in data_employments[i].keys():
                        categories.user_companies.append(
                            data_employments[i]['company']['name'].strip())
                    else:
                        categories.user_companies.append('none')
                    if 'job' in data_employments[i].keys():
                        categories.user_jobs.append(
                            data_employments[i]['job']['name'].strip())
                    else:
                        categories.user_jobs.append('none')
            else:
                logger.info('data_employments is none')
            logger.info('user_companies: {0}'.format(categories.user_companies))
            logger.info('user_jobs: {0}'.format(categories.user_jobs))
        else:
            logger.info('data_employments is none')
        # ---------------- companies and jobs end ----------------
        # Description
        if 'description' in user_data.keys():
            categories.user_description = user_data['description'].strip()
            logger.info('user_description: {0}'.format(categories.user_description))
        else:
            categories.user_description = ''
            logger.info('user_description is none')
        # TODO strip html tags inside the string
        # Upvotes received
        if 'voteupCount' in user_data.keys():
            categories.user_vote_up_count = user_data['voteupCount']
            logger.info('user_vote_up_count: {0}'.format(
                categories.user_vote_up_count))
        else:
            categories.user_vote_up_count = 0
            logger.info('user_vote_up_count is none')
        # Favorites received
        if 'favoritedCount' in user_data.keys():
            categories.user_favorite_count = user_data['favoritedCount']
            logger.info('user_favorite_count: {0}'.format(
                categories.user_favorite_count))
        else:
            categories.user_favorite_count = 0
            logger.info('user_favorite_count is none')
        # Thanks received
        if 'thankedCount' in user_data.keys():
            categories.user_thanked_count = user_data['thankedCount']
            logger.info('user_thanked_count: {0}'.format(
                categories.user_thanked_count))
        else:
            categories.user_thanked_count = 0
            logger.info('user_thanked_count is none')
        # Public edits participated in
        if 'logsCount' in user_data.keys():
            categories.user_logs_count = user_data['logsCount']
            logger.info('user_logs_count: {0}'.format(categories.user_logs_count))
        else:
            categories.user_logs_count = 0
            logger.info('user_logs_count is none')
        # Following count
        if 'followingCount' in user_data.keys():
            categories.user_following = user_data['followingCount']
            logger.info('user_following: {0}'.format(categories.user_following))
        else:
            categories.user_following = 0
            logger.info('user_following is none')
        # Follower count
        if 'followerCount' in user_data.keys():
            categories.user_followers = user_data['followerCount']
            logger.info('user_followers: {0}'.format(categories.user_followers))
        else:
            categories.user_followers = 0
            logger.info('user_followers is none')
        # Lives participated in or sponsored
        if 'participatedLiveCount' in user_data.keys():
            categories.user_participated_live_count = user_data[
                'participatedLiveCount']
            logger.info('user_participated_live_count: {0}'.format(
                categories.user_participated_live_count))
        else:
            categories.user_participated_live_count = 0
            logger.info('user_participated_live_count is none')
        # Followed topics
        if 'followingTopicCount' in user_data.keys():
            categories.user_following_topic_count = user_data[
                'followingTopicCount']
            logger.info('user_following_topic_count: {0}'.format(
                categories.user_following_topic_count))
        else:
            categories.user_following_topic_count = 0
            logger.info('user_following_topic_count is none')
        # Followed columns
        if 'followingColumnsCount' in user_data.keys():
            categories.user_following_columns_count = user_data[
                'followingColumnsCount']
            logger.info('user_following_columns_count: {0}'.format(
                categories.user_following_columns_count))
        else:
            categories.user_following_columns_count = 0
            logger.info('user_following_columns_count is none')
        # Followed questions
        if 'followingQuestionCount' in user_data.keys():
            categories.user_following_question_count = user_data[
                'followingQuestionCount']
            logger.info('user_following_question_count: {0}'.format(
                categories.user_following_question_count))
        else:
            categories.user_following_question_count = 0
            logger.info('user_following_question_count is none')
        # Followed favlists
        if 'followingFavlistsCount' in user_data.keys():
            categories.user_following_favlists_count = user_data[
                'followingFavlistsCount']
            logger.info('user_following_favlists_count: {0}'.format(
                categories.user_following_favlists_count))
        else:
            categories.user_following_favlists_count = 0
            logger.info('user_following_favlists_count is none')
        # Answer count
        if 'answerCount' in user_data.keys():
            categories.user_answer_count = user_data['answerCount']
            logger.info('user_answer_count: {0}'.format(
                categories.user_answer_count))
        else:
            categories.user_answer_count = 0
            logger.info('user_answer_count is none')
        # Article (share) count
        if 'articlesCount' in user_data.keys():
            categories.user_share_count = user_data['articlesCount']
            logger.info('user_share_count: {0}'.format(
                categories.user_share_count))
        else:
            categories.user_share_count = 0
            logger.info('user_share_count is none')
        # Question count
        if 'questionCount' in user_data.keys():
            categories.user_question_count = user_data['questionCount']
            logger.info('user_question_count: {0}'.format(
                categories.user_question_count))
        else:
            categories.user_question_count = 0
            logger.info('user_question_count is none')
        # Favorite (collection) count
        if 'favoriteCount' in user_data.keys():
            categories.user_collections = user_data['favoriteCount']
            logger.info('user_collections: {0}'.format(
                categories.user_collections))
        else:
            categories.user_collections = 0
            logger.info('user_collections is none')
        # Badge type (best answerer or not) and badge topics (answered topics)
        if 'badge' in user_data.keys():
            badge_data = user_data['badge']
            if len(badge_data) > 0:
                for i in range(len(badge_data)):
                    categories.badge_description.append(
                        badge_data[i]['description'])
                    if len(badge_data[i]['topics']) > 0:
                        for m in range(len(badge_data[i]['topics'])):
                            categories.badge_topics.append(
                                badge_data[i]['topics'][m]['name'])
                    else:
                        categories.badge_topics = []
                logger.info('badge_description: {0}, badge_topics: {1}'.format(
                    categories.badge_description, categories.badge_topics))
            else:
                categories.badge_description = []
                categories.badge_topics = []
                logger.info('badge description and topics is none')
        else:
            logger.info('badge is none')
        logger.info(
            '--------------------------------------------------------------------')
    except Exception as err:
        logger.info('Exception is: {0}'.format(err))
        logger.info('parsed user info err or user_data is none!')
    # Persist whatever was collected, even after a partial parse
    try:
        time.sleep(1)
        categories.save()
    except Exception as err:
        logger.info('save to db err is: {0}'.format(err))
        logger.info('mongodb data save fail!!! url is {0}'.format(
            categories.user_url))
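# A minimal, made-up user_data payload that exercises the happy path of
# parsed_user_info above; real Zhihu payloads carry many more keys:
sample_user_data = {
    'name': 'example',
    'gender': 1,
    'business': {'name': 'Internet'},
    'educations': [{'school': {'name': 'Some University'},
                    'major': {'name': 'CS'}}],
    'voteupCount': 10,
    'followerCount': 5,
}
# parsed_user_info(sample_user_data, 'https://www.zhihu.com/people/example')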
def exception_handler(request, exception):
    logger.info('got exception request: {0}, exception {1}'.format(
        request, exception))
# __author__ = "gLinlf"
# coding=utf-8
import logging.config
import queue
import redis
import configparser

from src.logs.Logger import logger
# from src.logs.SpiderLogger import logs

logger.info('1333' + '123')

# Scratch check: list.copy() returns a shallow copy, leaving t1 untouched
t1 = [1, 2]
t2 = t1.copy()
print(t2)
print(t1)
print(len({}))

# Read the redis connection settings from redis-py.ini
section = 'redis_py'
CONFIG = configparser.ConfigParser()
CONFIG.read('redis-py.ini', encoding='utf8')
redis_host = CONFIG.get(section, 'REDIS_HOST')
print(redis_host)
redis_port = CONFIG.get(section, 'REDIS_PORT')
print(redis_port)
# redis_db = CONFIG.get(section, "REDIS_DB")

# Share one connection pool across clients
redis_pool = redis.ConnectionPool(host=redis_host, port=redis_port, db=0)
rd = redis.Redis(connection_pool=redis_pool)
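# Minimal use of the pooled client built above (the key name is illustrative):
rd.set('spider:test_key', 'hello')
print(rd.get('spider:test_key'))  # b'hello' (redis-py returns bytes by default)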
def get_user_info():
    user_name = request.args.get('user_name')
    badge_topic = request.args.get('badge_topic')
    logger.info('user_name is {0}, badge_topic is {1}'.format(
        user_name, badge_topic))
    page = int(request.args.get('state'))
    limit_size = int(request.args.get('limit'))
    if CommUtils.check_params(page, limit_size):
        start = (page - 1) * limit_size
        if CommUtils.is_not_empty(user_name) and not CommUtils.is_not_empty(
                badge_topic):
            total_size = db.find({'user_name': user_name}).count()
            user_data = db.find({
                'user_name': user_name
            }).skip(start).limit(limit_size)
        elif not CommUtils.is_not_empty(user_name) and CommUtils.is_not_empty(
                badge_topic):
            total_size = db.find({
                'badge_topics': {'$regex': badge_topic}
            }).count()
            user_data = db.find({
                'badge_topics': {'$regex': badge_topic}
            }).skip(start).limit(limit_size)
        elif CommUtils.is_not_empty(user_name) and CommUtils.is_not_empty(
                badge_topic):
            total_size = db.find({
                'badge_topics': {'$regex': badge_topic},
                'user_name': user_name
            }).count()
            user_data = db.find({
                'badge_topics': {'$regex': badge_topic},
                'user_name': user_name
            }).skip(start).limit(limit_size)
        else:
            total_size = db.find({}).count()
            user_data = db.find().skip(start).limit(limit_size)
        data_list = []
        for data in user_data:
            # ObjectId is not JSON-serializable, so stringify _id first
            data['_id'] = str(data['_id'])
            data_list.append(data)
        logger.info('Page {0}, {1} rows per page, user info: {2}'.format(
            page, limit_size, data_list))
        succ_dict = {
            'status': '000000',
            'total': total_size,
            'pageSize': limit_size,
            'curPage': page,
            'list': data_list
        }
        return jsonify(succ_dict)
    else:
        error_dict = {'status': '999999'}
        logger.info('Invalid request parameters!')
        return jsonify(error_dict)
def run(self):
    # Thread pool management
    validate_thread_list = []
    # Start the thread that fetches and parses proxy ip pages
    fetch_parse_thread = FetchParseThread()
    fetch_parse_thread.start()
    logger.info('fetch and parse proxy ip thread is: {0}'.format(
        fetch_parse_thread.status))
    # Start the thread that re-scans the validated proxy pool periodically
    scan_proxy_thread = CheckValidateProxyPool()
    scan_proxy_thread.start()
    logger.info('scan validate proxy ip thread is: {0}'.format(
        scan_proxy_thread.status))
    # Start the proxy ip validation threads (several in parallel to speed
    # up validation)
    for i in range(VALIDATE_THREAD_NUM):
        validate_thread = ValidateIPThread()
        validate_thread_list.append(validate_thread)
        validate_thread.start()
        logger.info('validate_thread---- {0} status is {1}'.format(
            i, validate_thread.status))
    logger.info('unchecked_ip_pool size is: {0}'.format(
        unchecked_ip_pool.qsize()))
    logger.info('validate_proxy_pool size is: {0}'.format(
        validate_proxy_pool.qsize()))
    # Thread monitoring: when a thread reports status == 'error', drop it
    # and start a replacement
    while True:
        if fetch_parse_thread.status == 'error':
            fetch_parse_thread = FetchParseThread()
            fetch_parse_thread.start()
            logger.info('The proxy-fetching thread failed and has been restarted!')
        if scan_proxy_thread.status == 'error':
            scan_proxy_thread = CheckValidateProxyPool()
            scan_proxy_thread.start()
            logger.info('The pool-scanning thread failed and has been restarted!')
        # iterate over a copy so removing items does not skip threads
        for thread in validate_thread_list[:]:
            if thread.status == 'error':
                validate_thread_list.remove(thread)
                validate_thread = ValidateIPThread()
                validate_thread_list.append(validate_thread)
                # the original never started the replacement thread
                validate_thread.start()
                logger.info('A validation thread failed and has been restarted!')
        time.sleep(300)