def init_parse_user(mid):
    """Seed the detect table by walking the follow graph starting at ``mid``.

    For every visited user the function registers the user for detection,
    fetches the first page (10 entries) of the accounts they follow and of
    their followers, registers every followed account, and then hops to one
    randomly chosen account from the combined lists. The walk stops once
    ``START_NUM`` reaches ``MAX_USER_NUM`` or when the current user yields no
    candidate to hop to.

    Args:
        mid: id of the user to start the walk from.

    Returns:
        None.
    """
    global MAX_USER_NUM, START_NUM, db

    # Iterative rather than recursive: a long walk must not be bounded by
    # the interpreter's recursion limit.
    while True:
        db.insert_or_update(mid, table_sql.insert_user_detect())

        # Following / follower lists used to prepare the next hop.
        user_following_json = prase_content.return_json(
            api.return_user_follow(1, 10, mid), None, return_header())
        user_follower_json = prase_content.return_json(
            api.return_user_fans(1, 10, mid), None, return_header())

        user_following = []
        user_follower = []
        try:
            # ``or []`` guards against a present-but-null "list" field.
            user_following = user_following_json.get('data').get('list') or []
            user_follower = user_follower_json.get('data').get('list') or []
        except AttributeError as aerr:
            # A response (or its "data" field) was None; continue with
            # whatever was extracted so far.
            logging.exception(aerr)

        # Every followed account becomes a crawl target.
        # NOTE(review): the original formatting was lost; the counter update
        # is assumed to happen once per followed account -- confirm.
        for followed in user_following:
            db.insert_or_update(followed.get('mid'),
                                table_sql.insert_user_detect())
            START_NUM = START_NUM + 1
            gl.set_value('current_user', START_NUM)

        # Stop once enough users have been collected.
        if START_NUM >= MAX_USER_NUM:
            return

        # Pick a random account to continue the walk from.
        candidates = user_following + user_follower
        if not candidates:
            logging.warning(
                "no following/follower candidates for mid %s; stopping walk",
                mid)
            return
        mid = random.choice(candidates)['mid']
def update_video():
    """Refresh tracked videos, then look for new uploads of tracked users.

    Phase 1 selects the videos that need an update and calls
    ``update_old_video`` for each, publishing progress through ``gl``
    (``total_video`` / ``current_video``). Phase 2 selects the detect list
    and calls ``insert_new_video`` for each user. A failure on a single
    video or user is logged and does not abort the run.

    Returns:
        None.
    """
    global db
    db.start_sql_engine()
    try:
        # Phase 1: re-crawl the rows the database marks as needing an update.
        need_update_id_list = db.select(table_sql.query_update_video_list())
        if need_update_id_list:
            gl.set_value('total_video', len(need_update_id_list))
            for update_aid in need_update_id_list:
                try:
                    gl.set_value('current_video',
                                 gl.get_value('current_video') + 1)
                    update_old_video(update_aid[0])
                except Exception:
                    # The id must be a %-argument, not the format string,
                    # otherwise logging fails to render the record.
                    logging.exception("update_old_video failed for aid %s",
                                      update_aid[0])
                    continue
            # NOTE(review): the original formatting was lost; this pause is
            # assumed to run once after a non-empty update pass -- confirm.
            time.sleep(120)

        # Phase 2: check every tracked user for newly published videos.
        user_list = db.select(table_sql.query_detect_list(0))
        for mid in user_list:
            try:
                insert_new_video(mid[0])
            except Exception:
                logging.exception("insert_new_video failed for mid %s",
                                  mid[0])
                continue
    finally:
        # Always release the database, even if a query above raised.
        db.close_db()
def parse_user_info(mid):
    """Fetch one user's profile, follow stats and video counts and store them.

    Three API requests are made (user info, follower/following numbers,
    video count), separated by short random pauses. If the user-info
    response carries a non-zero ``code`` the user is removed from the
    detect list. Otherwise the packaged user info and official info are
    written, followed by one row per entry of the ``tlist`` field of the
    video-count response.

    Args:
        mid: id of the user to process.

    Returns:
        None.

    Raises:
        Exception: if any of the three responses is ``None``.
    """
    gl.set_value("current_id", "mid" + str(mid))
    global db

    # Basic user information.
    user_info_json = prase_content.return_json(
        api.return_user_info(mid), None, return_header())
    time.sleep(random.uniform(0.2, 0.4))
    user_info_ff_json = prase_content.return_json(
        api.return_user_follower_following(mid), None, return_header())
    time.sleep(random.uniform(0.2, 0.4))
    user_video_count_json = prase_content.return_json(
        api.return_user_video_count(mid, None), None, return_header())

    responses = (user_info_json, user_info_ff_json, user_video_count_json)
    if any(response is None for response in responses):
        raise Exception(str(mid) + "遭遇反扒")

    # User does not exist, or the account is meaningless: stop tracking it.
    if user_info_json.get('code') != 0:
        db.insert_or_update(item=None, sql=table_sql.delete_detect_user(mid))
        return

    user_video_count = user_video_count_json.get('data').get('count')
    user_info = package.package_user_info(
        user_info_json, user_info_ff_json, user_video_count)
    user_official = package.package_user_official(user_info_json)
    db.insert_or_update(item=user_info.return_tup(),
                        sql=table_sql.insert_user_info())
    db.insert_or_update(item=user_official.return_tup(),
                        sql=table_sql.replace_user_official())

    # Per-category distribution of the user's videos.
    user_video_category = user_video_count_json.get('data').get('tlist')
    if not user_video_category:
        # Covers an empty list/dict as well as a missing or null field.
        return

    # The field is handled both as a mapping and as a list; either way the
    # per-category entries are what gets packaged.
    if isinstance(user_video_category, dict):
        categories = user_video_category.values()
    elif isinstance(user_video_category, list):
        categories = user_video_category
    else:
        return

    owner_mid = user_info_json.get('data').get('mid')
    for category in categories:
        uv_info = package.package_video_count(category, owner_mid)
        db.insert_or_update(item=uv_info.return_tup(),
                            sql=table_sql.replace_uv_count())
def update_parse_user():
    """Run ``parse_user_info`` for every user on the detect list.

    Publishes ``total_user`` before the loop and ``current_user`` after each
    successfully processed user through ``gl``. A failure on one user prints
    the traceback and moves on to the next user.

    Returns:
        None.
    """
    global db
    db.start_sql_engine()
    try:
        user_list = db.select(table_sql.query_detect_list(0))
        gl.set_value('total_user', len(user_list))
        for index, mid in enumerate(user_list):
            try:
                parse_user_info(mid[0])
                gl.set_value('current_user', index + 1)
            except Exception:
                # Best effort: report the failure and keep going.
                print(traceback.format_exc())
                continue
    finally:
        # Always release the database, even if the query above raised.
        db.close_db()
def update_old_video(aid):
    """Re-fetch one tracked video and store its current information.

    If the response carries a non-zero ``code`` (video missing or deleted)
    the video's detection is marked complete. Otherwise the packaged video
    info is inserted and the video's detect time is updated.

    Args:
        aid: id of the video to refresh.

    Returns:
        None.

    Raises:
        Exception: if the API response is ``None``.
    """
    gl.set_value("current_id", "aid" + str(aid))
    global db

    video_json = prase_content.return_json(
        api.return_video_info(aid), None, return_header())
    if video_json is None:
        # Same explicit failure as parse_user_info uses for a None response,
        # instead of an opaque AttributeError on the .get() below.
        raise Exception(str(aid) + "遭遇反扒")

    # Video does not exist or has been deleted: finish its detection.
    if video_json.get('code') != 0:
        db.insert_or_update(
            item=None, sql=table_sql.update_complete_video_detect(aid))
        return

    video_info = package.package_video_info(video_json)
    db.insert_or_update(item=video_info.return_tup(),
                        sql=table_sql.insert_video_info())
    db.insert_or_update(item=None,
                        sql=table_sql.update_video_detect_time(aid))
def update_video():
    """Scheduled job wrapper around ``spider.update_video``.

    Records the current task, status and start time in ``gl``, runs the
    spider, and logs the elapsed time in whole seconds. On any exception the
    traceback is logged, the status is set to ``Error`` and the
    ``'update_video'`` scheduler job is paused.

    Returns:
        None.
    """
    try:
        # gl._init()
        app.logger.debug('开始update_video')
        gl.set_value('current_task', enum.TaskList.update_video.value)
        gl.set_value('status', enum.Status.Continue.value)

        start = datetime.datetime.now()
        gl.set_value('start_time', start)
        spider.update_video()
        end = datetime.datetime.now()

        # end - start, not start - end: a negative timedelta normalises to
        # days=-1 and a large positive .seconds. total_seconds() also keeps
        # runs longer than a day correct.
        runtime = end - start
        app.logger.debug('runtime:' + str(int(runtime.total_seconds())))
    except Exception:
        app.logger.error(traceback.format_exc())
        gl.set_value('status', enum.Status.Error.value)
        scheduler.pause_job('update_video')
def insert_new_video(mid):
    """Register videos a user has published since yesterday's snapshot.

    Compares yesterday's stored video count with the count the API reports
    now. If the count grew, requests a page of up to 100 videos sized to the
    difference and calls ``new_video_detect`` for each returned video,
    updating ``current_video`` / ``total_video`` in ``gl`` as it goes.

    Args:
        mid: id of the user to check.

    Returns:
        None. Returns early when yesterday's count is missing or the count
        did not grow.

    Raises:
        Exception: if an API response is ``None``.
    """
    gl.set_value("current_id", "mid" + str(mid))
    global db

    yesterday_rows = db.select(
        table_sql.query_yesterday_user_video_count(mid))
    if not yesterday_rows:
        # No data stored for that day.
        return
    yesterday_video_count = yesterday_rows[0][0]

    current_video_count_json = prase_content.return_json(
        api.return_user_video_count(mid, None), None, return_header())
    time.sleep(0.2)
    if current_video_count_json is None:
        # Same explicit failure as parse_user_info uses for a None response.
        raise Exception(str(mid) + "遭遇反扒")

    current_video_count = current_video_count_json.get('data').get('count')
    if yesterday_video_count >= current_video_count:
        return

    # Ask only for as many videos as were added, capped at 100 per request.
    pagesize = min(current_video_count - yesterday_video_count, 100)
    current_video_list_json = prase_content.return_json(
        api.return_user_video_count(mid, pagesize), None, return_header())
    if current_video_list_json is None:
        raise Exception(str(mid) + "遭遇反扒")

    video_list = current_video_list_json.get('data').get('vlist') or []
    if len(video_list) < pagesize:
        logging.warning(
            "mid %s: expected %d new videos but the response held %d",
            mid, pagesize, len(video_list))

    # Slice instead of indexing by range(pagesize), so a short list simply
    # yields fewer iterations.
    for video in video_list[:pagesize]:
        try:
            aid = video.get('aid')
            current_video = gl.get_value('current_video') + 1
            gl.set_value('current_video', current_video)
            gl.set_value('total_video', current_video)
            # Start monitoring the new video.
            new_video_detect(aid)
        except IndexError as exc:
            # The id must be a %-argument, not the format string, otherwise
            # logging fails to render the record.
            logging.error("new video detection failed for mid %s: %s",
                          mid, exc)
            continue