def run(self):
    """Poll ``cache_history`` forever and assemble each queued user history.

    Every iteration claims the oldest row of the MySQL ``cache_history``
    table whose ``is_dealing`` is null, stamps it with the current time and
    reads the matching documents from the MongoDB collection
    ``microblog_spider.assemble_factory``.

    * When every piece has arrived (the number of distinct ``current_id``
      values reaches ``total_num``) the pieces are de-duplicated and
      concatenated, a row is inserted into ``accuracy_table`` and, if the
      user's ``update_time`` is empty, the data is passed to
      ``save_data_seperately`` and ``user_info_table`` is updated.
      Otherwise only ``isGettingBlog`` is reset to null.
    * When pieces are missing, ``isGettingBlog`` is reset to null.

    In both cases the task's documents are then removed from
    ``assemble_factory`` and its ``cache_history`` row is deleted.  When no
    task is pending the loop sleeps one second.  This method never returns.

    NOTE(review): the original indentation of this method was lost; the
    nesting of the final clean-up steps (run for every task, complete or
    not) is inferred -- confirm against version control.
    """
    while True:
        start_time = time.time()
        dbi = MySQL_Interface()
        col_info = dbi.get_col_name('cache_history')
        query = 'select * from cache_history where is_dealing is null order by checkin_timestamp limit 1'
        mysql_res = dbi.select_asQuery(query)
        if not mysql_res:
            # cache_history has no pending row: sleep one second and poll again.
            time.sleep(1)
            continue
        mysql_res = mysql_res[0]

        # TODO: remove these debug prints.
        print('debug->start to deal with a new task')
        print('debug->mysql_res: ')
        print(mysql_res)

        container_id = mysql_res[col_info.index('container_id')]
        print('debug->container_id: {cid}'.format(cid=container_id))
        latest_time = mysql_res[col_info.index('latest_time')]

        # Claim the task.  container_id is quoted here like in every other
        # query of this method, so it is always compared as a string.
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        query = "update cache_history set is_dealing='{time}' where container_id='{cid}'".format(
            time=time_stick, cid=container_id)
        # TODO: remove this debug print.
        print('debug->query1 : {q}'.format(q=query))
        dbi.update_asQuery(query)

        # Bound on every path so the summary print at the end of the
        # iteration never sees an unbound or stale value.
        data_final = []

        client = MongoClient('localhost', 27017)
        try:
            db = client['microblog_spider']
            assemble_table = db.assemble_factory
            pieces = list(assemble_table.find(
                {'container_id': container_id},
                {'current_id': 1, 'total_num': 1}))
            id_list = [x['current_id'] for x in pieces]
            # No document at all means nothing has arrived yet; treat it as
            # an incomplete package instead of indexing an empty list.
            num = int(pieces[0]['total_num']) if pieces else 0
            # Duplicated pieces must not hide missing ones, so completeness
            # is judged on distinct ids (same str() key as the de-dup below).
            distinct_ids = {str(x) for x in id_list}

            # TODO: remove these debug prints.
            print('debug->id_list_len: {len}'.format(len=len(id_list)))
            print('debug->num: {n}'.format(n=num))

            # Check whether every piece of the package has arrived.
            check_state = bool(pieces) and len(distinct_ids) >= num
            if not check_state:
                print(
                    'server->HistoryReport:The package is not complete, retry to catch data'
                )

            if check_state:
                # All pieces collected: move the data into the formal MongoDB
                # storage and update update_time / latest_blog /
                # isGettingBlog in MySQL.

                # Fetch this user's row from MySQL.
                user_info = None
                col_name = None
                try:
                    query = "select * from user_info_table where container_id='{cid}'" \
                        .format(cid=container_id)
                    user_info = dbi.select_asQuery(query)[0]
                    # TODO: remove these debug prints.
                    print('task {cid} :debug->query2: {q}'.format(
                        q=query, cid=container_id))
                    print('task {cid} debug->user_info:'.format(
                        cid=container_id))
                    print(user_info)
                    col_name = dbi.get_col_name('user_info_table')
                except Exception as e:
                    print(
                        'task {cid} :Error:server-HistoryReturn:'
                        'No such user in MySQL.user_info_table,Reason:'.format(
                            cid=container_id))
                    print(e)

                # Without the user row nothing below can run; skip straight
                # to the clean-up instead of using unbound/stale values.
                if user_info is not None and col_name is not None:
                    # Pull the payload of every piece out of assemble factory.
                    # Start from empty lists so a failed fetch cannot reuse
                    # the previous task's data.
                    data_list = []
                    id_list = []
                    try:
                        docs = list(assemble_table.find(
                            {'container_id': container_id},
                            {'data': 1, 'current_id': 1}))
                        data_list = [x['data'] for x in docs]
                        id_list = [x['current_id'] for x in docs]
                        docs = None
                        # TODO: remove this debug print.
                        print('task {cid} debug->datalist: {len}'.format(
                            len=len(data_list), cid=container_id))
                    except Exception as e:
                        print(
                            'Error:server-HistoryReturn:'
                            'Unable to get data from MongoDB, assemble factory,Reason:'
                        )
                        print(e)

                    # More pieces than expected means duplicates: keep the
                    # first occurrence of each current_id.
                    if len(id_list) > num:
                        seen_ids = set()
                        unique_data_list = []
                        for piece_id, piece in zip(id_list, data_list):
                            key = str(piece_id)
                            if key in seen_ids:
                                continue
                            seen_ids.add(key)
                            unique_data_list.append(piece)
                        data_list = unique_data_list

                    # Concatenate the pieces.
                    try:
                        for piece in data_list:
                            data_final.extend(piece)
                        # TODO: remove this debug print.
                        print('task {cid} debug->数据拼接完毕,len {len}'.format(
                            len=len(data_final), cid=container_id))
                    except Exception as e:
                        print(
                            'Error:server-HistoryReport:'
                            'Unable to contact the pieces of information,Reason:')
                        print(e)

                    # Record this run in accuracy_table for later analysis.
                    blog_len = len(data_final)
                    # A missing or zero blog_num must not crash the worker.
                    wanted_blog_len = user_info[col_name.index('blog_num')] or 0
                    blog_accuracy = (blog_len / wanted_blog_len
                                     if wanted_blog_len else 0)
                    time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time()))
                    query = "insert into accuracy_table values ({acc},'{t_s}',{num}) ;" \
                        .format(acc=blog_accuracy, t_s=time_stick,
                                num=wanted_blog_len)
                    dbi.insert_asQuery(query)

                    # Store the data in MongoDB and update MySQL.
                    try:
                        if not user_info[col_name.index('update_time')]:
                            # Save into the formal MongoDB collection.
                            save_data_seperately(data_final)
                            # sys.getsizeof is shallow: this is the size of
                            # the list object only, not of its contents.
                            print(
                                'task {cid} Success: Data has saved in Mongodb, size is {size}'
                                .format(size=sys.getsizeof(data_final),
                                        cid=container_id))
                            # Record the key information in MySQL.
                            query = 'update user_info_table set ' \
                                    'update_time=\'{up_time}\',' \
                                    'latest_blog=\'{latest_blog}\',' \
                                    'isGettingBlog=null ' \
                                    'where container_id=\'{cid}\';' \
                                .format(up_time=time_stick,
                                        latest_blog=latest_time,
                                        cid=container_id)
                            dbi.update_asQuery(query)
                            print(
                                'task {cid} Success: insert user into MongoDB, the num of data is {len}'
                                .format(len=blog_len, cid=container_id))
                        else:
                            query = "update user_info_table set isGettingBlog=null where container_id='{cid}'" \
                                .format(cid=container_id)
                            dbi.update_asQuery(query)
                    except Exception as e:
                        print('task {cid} Error:server->HistoryReport:'
                              'Reason:'.format(cid=container_id))
                        print(e)
            else:
                # Pieces are missing: reset isGettingBlog for this user.
                print(
                    'task {cid} :Error: the package is not complete ,{a} of {b}'
                    .format(a=len(distinct_ids), b=num, cid=container_id))
                query = "update user_info_table set isGettingBlog=null where container_id='{cid}'" \
                    .format(cid=container_id)
                dbi.update_asQuery(query)

            # Remove this task's pieces from assemble factory.
            assemble_table.remove({'container_id': container_id})
            print(
                'task {cid} Success: Data has been removed from assemble factory'
                .format(cid=container_id))

            # Delete the cache_history row: the task has been dealt with.
            query = "delete from cache_history where container_id='{cid}'" \
                .format(cid=container_id)
            dbi.update_asQuery(query)

            end_time = time.time()
            deal_time = end_time - start_time
            print(
                'task {cid} :Success : the user {cid} is completed, length is {len}, use {t} seconds'
                .format(cid=container_id, len=len(data_final), t=deal_time))
        finally:
            # One client is opened per task; close it so connections do not
            # pile up while the loop runs forever.
            client.close()
def post(self):
    """Receive one user's blog history over HTTP and record it.

    Reads ``user_history``, ``latest_time``, ``latest_timestamp``,
    ``container_id`` and ``isDivided`` from the request (plus ``block_num``
    and ``current_block`` when ``isDivided`` is ``'1'``) and answers the
    client immediately.  It then looks the user up in ``user_info_table``;
    when the user's ``update_time`` is empty the history is passed to
    ``save_data_inMongo`` and ``update_time`` / ``latest_blog`` are written
    back, otherwise ``isGettingBlog`` is reset to null.  Finally a row is
    inserted into ``accuracy_table``.

    Failures are printed, never raised.  If the request cannot be read, the
    MySQL connection cannot be opened, or the user row cannot be fetched,
    the method stops right there.
    """

    def _sql_text(value):
        # Escape backslashes and single quotes so a request value cannot
        # break out of the quoted SQL literal it is spliced into.
        # NOTE(review): prefer parameterized queries if MySQL_Interface
        # offers them.
        return str(value).replace('\\', '\\\\').replace("'", "''")

    # Read the client's data from the request.
    try:
        user_history = self.get_argument('user_history')
        latest_time = self.get_argument('latest_time')
        # Value unused; the call is kept because it is part of the
        # original request handling.
        self.get_argument('latest_timestamp')
        container_id = self.get_argument('container_id')
        isDivided = self.get_argument('isDivided')
        # SECURITY: eval() executes arbitrary code sent by the client.
        # Kept to preserve the accepted payload format; switch to
        # ast.literal_eval or json.loads once the client format is confirmed.
        user_history = eval(user_history)
        if isDivided in (1, '1'):
            # Values unused; calls kept as in the original.
            self.get_argument('block_num')
            self.get_argument('current_block')
        self.write('success to return user history')
        self.finish()
        print('Success: to get data from web')
    except Exception as e:
        self.write('fail to return user history')
        self.finish()
        print('Error:server-HistoryReturn:'
              'Unable to get value from http package,Reason:')
        print(e)
        return

    # Connect to MySQL.  Nothing below can work without the connection.
    try:
        dbi = MySQL_Interface()
    except Exception as e:
        print('Error:server-HistoryReturn:'
              'Unable to connect to MySQL')
        print(e)
        return

    # Fetch this user's row from MySQL.
    try:
        query = 'select * from user_info_table where container_id=\'{cid}\'' \
            .format(cid=_sql_text(container_id))
        user_info = dbi.select_asQuery(query)[0]
        col_name = dbi.get_col_name('user_info_table')
    except Exception as e:
        print('Error:server-HistoryReturn:'
              'No such user in MySQL.user_info_table,Reason:')
        print(e)
        return

    # Store the data in MongoDB, then record the result in MySQL.
    try:
        blog_len = len(user_history)
        wanted_blog_len = user_info[col_name.index('blog_num')]
        # A zero or NULL blog_num must not abort the whole bookkeeping step.
        blog_accuracy = blog_len / wanted_blog_len if wanted_blog_len else 0
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        if not user_info[col_name.index('update_time')]:
            save_data_inMongo(user_history)
            # NOTE(review): this update does not reset isGettingBlog; a
            # variant that also set isGettingBlog=null was commented out
            # here -- confirm which behavior is intended.
            query = 'update user_info_table set ' \
                    'update_time=\'{up_time}\',' \
                    'latest_blog=\'{latest_blog}\'' \
                    'where container_id=\'{cid}\';' \
                .format(up_time=time_stick,
                        latest_blog=_sql_text(latest_time),
                        cid=_sql_text(container_id))
            dbi.update_asQuery(query)
        else:
            query = 'update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \
                .format(cid=_sql_text(container_id))
            dbi.update_asQuery(query)

        # NOTE(review): run() in this file inserts three values into
        # accuracy_table; confirm the column count expected here.
        query = 'insert into accuracy_table values ({acc},\'{t_s}\') ;' \
            .format(acc=blog_accuracy, t_s=time_stick)
        dbi.insert_asQuery(query)

        print('Success: insert user into MongoDB, the num of data is {len}'
              .format(len=blog_len))
    except Exception as e:
        print('Error:server-HistoryReturn:'
              'Unable to update data in MySQL.user_info_tabe,Reason:')
        print(e)
def run(self):
    """Poll ``cache_history`` forever and assemble each queued user history.

    Every iteration claims the oldest row of the MySQL ``cache_history``
    table whose ``is_dealing`` is null, stamps it with the current time and
    reads the matching documents from the MongoDB collection
    ``microblog_spider.assemble_factory``.

    * When every piece has arrived (the number of distinct ``current_id``
      values reaches ``total_num``) the pieces are de-duplicated and
      concatenated, a row is inserted into ``accuracy_table`` and, if the
      user's ``update_time`` is empty, the data is passed to
      ``save_data_seperately`` and ``user_info_table`` is updated.
      Otherwise only ``isGettingBlog`` is reset to null.
    * When pieces are missing, ``isGettingBlog`` is reset to null.

    In both cases the task's documents are then removed from
    ``assemble_factory`` and its ``cache_history`` row is deleted.  When no
    task is pending the loop sleeps one second.  This method never returns.

    NOTE(review): the original indentation of this method was lost; the
    nesting of the final clean-up steps (run for every task, complete or
    not) is inferred -- confirm against version control.
    """
    while True:
        start_time = time.time()
        dbi = MySQL_Interface()
        col_info = dbi.get_col_name('cache_history')
        query = 'select * from cache_history where is_dealing is null order by checkin_timestamp limit 1'
        mysql_res = dbi.select_asQuery(query)
        if not mysql_res:
            # cache_history has no pending row: sleep one second and poll again.
            time.sleep(1)
            continue
        mysql_res = mysql_res[0]

        # TODO: remove these debug prints.
        print('debug->start to deal with a new task')
        print('debug->mysql_res: ')
        print(mysql_res)

        container_id = mysql_res[col_info.index('container_id')]
        print('debug->container_id: {cid}'.format(cid=container_id))
        latest_time = mysql_res[col_info.index('latest_time')]

        # Claim the task.  container_id is quoted here like in every other
        # query of this method, so it is always compared as a string.
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        query = "update cache_history set is_dealing='{time}' where container_id='{cid}'".format(
            time=time_stick, cid=container_id)
        # TODO: remove this debug print.
        print('debug->query1 : {q}'.format(q=query))
        dbi.update_asQuery(query)

        # Bound on every path so the summary print at the end of the
        # iteration never sees an unbound or stale value.
        data_final = []

        client = MongoClient('localhost', 27017)
        try:
            db = client['microblog_spider']
            assemble_table = db.assemble_factory
            pieces = list(assemble_table.find(
                {'container_id': container_id},
                {'current_id': 1, 'total_num': 1}))
            id_list = [x['current_id'] for x in pieces]
            # No document at all means nothing has arrived yet; treat it as
            # an incomplete package instead of indexing an empty list.
            num = int(pieces[0]['total_num']) if pieces else 0
            # Duplicated pieces must not hide missing ones, so completeness
            # is judged on distinct ids (same str() key as the de-dup below).
            distinct_ids = {str(x) for x in id_list}

            # TODO: remove these debug prints.
            print('debug->id_list_len: {len}'.format(len=len(id_list)))
            print('debug->num: {n}'.format(n=num))

            # Check whether every piece of the package has arrived.
            check_state = bool(pieces) and len(distinct_ids) >= num
            if not check_state:
                print('server->HistoryReport:The package is not complete, retry to catch data')

            if check_state:
                # All pieces collected: move the data into the formal MongoDB
                # storage and update update_time / latest_blog /
                # isGettingBlog in MySQL.

                # Fetch this user's row from MySQL.
                user_info = None
                col_name = None
                try:
                    query = "select * from user_info_table where container_id='{cid}'" \
                        .format(cid=container_id)
                    user_info = dbi.select_asQuery(query)[0]
                    # TODO: remove these debug prints.
                    print('task {cid} :debug->query2: {q}'.format(q=query, cid=container_id))
                    print('task {cid} debug->user_info:'.format(cid=container_id))
                    print(user_info)
                    col_name = dbi.get_col_name('user_info_table')
                except Exception as e:
                    print('task {cid} :Error:server-HistoryReturn:'
                          'No such user in MySQL.user_info_table,Reason:'.format(cid=container_id))
                    print(e)

                # Without the user row nothing below can run; skip straight
                # to the clean-up instead of using unbound/stale values.
                if user_info is not None and col_name is not None:
                    # Pull the payload of every piece out of assemble factory.
                    # Start from empty lists so a failed fetch cannot reuse
                    # the previous task's data.
                    data_list = []
                    id_list = []
                    try:
                        docs = list(assemble_table.find(
                            {'container_id': container_id},
                            {'data': 1, 'current_id': 1}))
                        data_list = [x['data'] for x in docs]
                        id_list = [x['current_id'] for x in docs]
                        docs = None
                        # TODO: remove this debug print.
                        print('task {cid} debug->datalist: {len}'.format(len=len(data_list), cid=container_id))
                    except Exception as e:
                        print('Error:server-HistoryReturn:'
                              'Unable to get data from MongoDB, assemble factory,Reason:')
                        print(e)

                    # More pieces than expected means duplicates: keep the
                    # first occurrence of each current_id.
                    if len(id_list) > num:
                        seen_ids = set()
                        unique_data_list = []
                        for piece_id, piece in zip(id_list, data_list):
                            key = str(piece_id)
                            if key in seen_ids:
                                continue
                            seen_ids.add(key)
                            unique_data_list.append(piece)
                        data_list = unique_data_list

                    # Concatenate the pieces.
                    try:
                        for piece in data_list:
                            data_final.extend(piece)
                        # TODO: remove this debug print.
                        print('task {cid} debug->数据拼接完毕,len {len}'.format(len=len(data_final), cid=container_id))
                    except Exception as e:
                        print('Error:server-HistoryReport:'
                              'Unable to contact the pieces of information,Reason:')
                        print(e)

                    # Record this run in accuracy_table for later analysis.
                    blog_len = len(data_final)
                    # A missing or zero blog_num must not crash the worker.
                    wanted_blog_len = user_info[col_name.index('blog_num')] or 0
                    blog_accuracy = blog_len / wanted_blog_len if wanted_blog_len else 0
                    time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time()))
                    query = "insert into accuracy_table values ({acc},'{t_s}',{num}) ;" \
                        .format(acc=blog_accuracy, t_s=time_stick, num=wanted_blog_len)
                    dbi.insert_asQuery(query)

                    # Store the data in MongoDB and update MySQL.
                    try:
                        if not user_info[col_name.index('update_time')]:
                            # Save into the formal MongoDB collection.
                            save_data_seperately(data_final)
                            # sys.getsizeof is shallow: this is the size of
                            # the list object only, not of its contents.
                            print('task {cid} Success: Data has saved in Mongodb, size is {size}'
                                  .format(size=sys.getsizeof(data_final), cid=container_id))
                            # Record the key information in MySQL.
                            query = 'update user_info_table set ' \
                                    'update_time=\'{up_time}\',' \
                                    'latest_blog=\'{latest_blog}\',' \
                                    'isGettingBlog=null ' \
                                    'where container_id=\'{cid}\';' \
                                .format(up_time=time_stick, latest_blog=latest_time, cid=container_id)
                            dbi.update_asQuery(query)
                            print('task {cid} Success: insert user into MongoDB, the num of data is {len}'
                                  .format(len=blog_len, cid=container_id))
                        else:
                            query = "update user_info_table set isGettingBlog=null where container_id='{cid}'" \
                                .format(cid=container_id)
                            dbi.update_asQuery(query)
                    except Exception as e:
                        print('task {cid} Error:server->HistoryReport:'
                              'Reason:'.format(cid=container_id))
                        print(e)
            else:
                # Pieces are missing: reset isGettingBlog for this user.
                print('task {cid} :Error: the package is not complete ,{a} of {b}'
                      .format(a=len(distinct_ids), b=num, cid=container_id))
                query = "update user_info_table set isGettingBlog=null where container_id='{cid}'" \
                    .format(cid=container_id)
                dbi.update_asQuery(query)

            # Remove this task's pieces from assemble factory.
            assemble_table.remove({'container_id': container_id})
            print('task {cid} Success: Data has been removed from assemble factory'
                  .format(cid=container_id))

            # Delete the cache_history row: the task has been dealt with.
            query = "delete from cache_history where container_id='{cid}'" \
                .format(cid=container_id)
            dbi.update_asQuery(query)

            end_time = time.time()
            deal_time = end_time - start_time
            print('task {cid} :Success : the user {cid} is completed, length is {len}, use {t} seconds'
                  .format(cid=container_id, len=len(data_final), t=deal_time))
        finally:
            # One client is opened per task; close it so connections do not
            # pile up while the loop runs forever.
            client.close()