def Export_Model(self, model_path):
    """Serialize the trained model's state to *model_path* via FI.save_pickle."""
    state = {
        'word_dict': self.word_dict,
        'huffman': self.huffman,
        'vec_len': self.vec_len,
        'learn_rate': self.learn_rate,
        'win_len': self.win_len,
        'model': self.model,
    }
    FI.save_pickle(state, model_path)
def Export_Model(self, model_path):
    """Pickle every parameter needed to restore this model to *model_path*."""
    fields = ('word_dict', 'huffman', 'vec_len', 'learn_rate', 'win_len', 'model')
    data = {name: getattr(self, name) for name in fields}
    FI.save_pickle(data, model_path)
def post(self):
    """Handle a POST carrying 'user_basic_info' and 'user_attends'.

    Persists the attends rows, the user's basic info and the attention-graph
    edges into MySQL; any step that fails falls back to pickling its payload
    to disk so the crawled data is not lost.
    """
    try:
        user_basic_info = self.get_argument('user_basic_info')
        attends = self.get_argument('user_attends')
        # SECURITY NOTE(review): eval() on client-supplied strings executes
        # arbitrary code; this should become ast.literal_eval()/json.loads().
        user_basic_info = eval(user_basic_info)
        attends = eval(attends)
        self.write('success to return user info')
        self.finish()
    except Exception:  # was bare `except:` — don't trap SystemExit/KeyboardInterrupt
        self.write('fail to return user info')
        self.finish()
        return
    try:
        dbi = MySQL_Interface()
    except Exception:  # was bare `except:`; a failed connect surfaces below and is pickled
        print('unable to connect to MySql DB')
    try:
        if len(attends) > 0:  # store attends info
            table_name = 'cache_attends'
            attends_col_info = dbi.get_col_name(table_name)
            keys = attends[0].keys()
            # Re-order every attend dict into the table's column order,
            # blank-filling columns the crawler did not provide.
            attends = [[line[i] if i in keys else '' for i in attends_col_info]
                       for line in attends]
            fans_col_pos = attends_col_info.index('fans_num')
            # Only cache accounts with more than 1000 fans.
            insert_attends = [line for line in attends
                              if line[fans_col_pos] > 1000]
            dbi.insert_asList(table_name, insert_attends, unique=True)
            print('Success : attends of {uid} is stored in {tname}'
                  .format(uid=user_basic_info['uid'], tname=table_name))
    except Exception as e:
        print(e)
        path = "temp\\{uid}_attends.pkl".format(uid=user_basic_info['uid'])
        print('unable to store attends of {uid}, it will be stored '
              .format(uid=user_basic_info['uid']))
        FI.save_pickle(attends, path)
    try:
        atten_num_real = user_basic_info['attends_num']  # declared follow count (currently unused)
        atten_num_get = len(attends)  # rows actually crawled
        # NOTE(review): 'accuracy' stores the raw crawled count, not a ratio
        # of atten_num_get / atten_num_real — confirm that is intended.
        user_basic_info['accuracy'] = atten_num_get
        col_info = dbi.get_col_name('cache_user_info')  # store user basic info
        keys = user_basic_info.keys()
        data = [user_basic_info[i] if i in keys else '' for i in col_info]
        dbi.insert_asList('cache_user_info', [data], unique=True)
        print('Success : basic info of {uid} is stored in cache_user_info'
              .format(uid=user_basic_info['uid']))
    except Exception as e:
        print(e)
        path = 'temp\\{uid}_basic_info.pkl'.format(uid=user_basic_info['uid'])
        print('unable to store basic info of {uid} , it will be stored'
              .format(uid=user_basic_info['uid']))
        FI.save_pickle(user_basic_info, path)
    try:
        if len(attends) > 0:  # store atten connection web
            from_uid = user_basic_info['uid']
            from_fans_num = user_basic_info['fans_num']
            from_blog_num = user_basic_info['blog_num']
            data = [[from_uid, from_fans_num, from_blog_num,
                     str(x[attends_col_info.index('uid')]),
                     str(x[attends_col_info.index('fans_num')]),
                     str(x[attends_col_info.index('blog_num')])]
                    for x in attends]
            dbi.insert_asList('cache_atten_web', data)
            print('Success : conn web of {uid} is stored in cache_atten_web'
                  .format(uid=user_basic_info['uid']))
    except Exception as e:
        print(e)
        path = '{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
        print('unable to store atten web of {uid} , it will be stored'
              .format(uid=user_basic_info['uid']))
        FI.save_pickle(data, path)
# ---- tail of an enclosing top-N helper (its `def` line is outside this chunk):
# returns the first `high` (element, count) pairs either as a dict or a list ----
if ret == 'dict':
    ret_data = {}
    # materialize the (element, count) pairs into a mapping
    for ele, count in temp[:high]:
        ret_data[ele] = count
    return ret_data
else:
    return temp[:high]


if __name__ == '__main__':
    # Demo driver: train a Word2Vec model on pickled sample sentences and
    # persist the learned word vectors.
    text = FI.load_pickle('./static/demo.pkl')
    text = [x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')
    # # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')
    # x = FI.load_pickle('./static/normal_wv.pkl')
    # def cal_simi(data,key1,key2):
    #     return data[key1].dot(data[key2].T)[0][0]
    # keys=list(x.keys())
    # for key in keys:
    #     print(key,'\t',cal_simi(x,'姚明',key))
# Download e-mail contents from wikileaks and build structured data.
# =============== ATTENTION =============== :
# The mail formats vary a lot, so roughly 5% of them cannot be parsed.
base_url = 'https://wikileaks.org/dnc-emails/get/{page}'
base_path = '.\static\{page}.pkl'
gotten_id = os.listdir('.\static')
# Already-downloaded mail ids; a set gives O(1) membership tests in the
# loop below (the original list made every skip-check O(n)).
gotten_id = {int((x.split('.'))[0]) for x in gotten_id}
task_pool = list(range(1, 2000))  # mail ids on wikileaks
while task_pool:  # replaces `while True` + explicit len()==0 break
    task_id = task_pool.pop(0)
    if task_id in gotten_id:
        print('{id} skip'.format(id=task_id))
        continue
    url = base_url.format(page=task_id)
    path = base_path.format(page=task_id)
    try:
        info = getStructedData(url)
        FI.save_pickle(info, path)
        print('{t} succeed'.format(t=task_id))
    except Exception as e:
        # task_pool.append(task_id)  # re-queue disabled: failures are permanent parse errors
        print('{t} failed <--<--<--<--'.format(t=task_id))
        print(e)
print('文件已下载完毕')
# Build the social-network data (requires the networkx package).
generate_data_for_gelphi()
print('gexf文件已经生成,存放路径: {path}\\temp_res'.format(path=os.getcwd()))
# ---- tail of an enclosing top-N helper (its `def` line is outside this chunk) ----
else:
    return temp[:high]


if __name__ == '__main__':
    # text = FI.load_pickle('./static/demo.pkl')
    # text =[ x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    # Input text: a single classical-Chinese passage about the history of fiction.
    data = [
        "小说”一词最早出现于《庄子·外物》:「饰小说以干县令,其于大达亦远矣。」庄子所谓的「小说」,是指琐碎的言论,与今日小说观念相差甚远。直至东汉桓谭《新论》:「小说家合残丛小语,近取譬喻,以作短书,治身理家,有可观之辞。」班固《汉书.艺文志》将「小说家」列为十家之后,其下的定义为:「小说家者流,盖出于稗官,街谈巷语,道听途说[4]之所造也。」才稍与今日小说的意义相近。而中国小说最大的特色,便自宋代开始具有文言小说与白话小说两种不同的小说系统。文言小说起源于先秦的街谈巷语,是一种小知小道的纪录。在历经魏晋南北朝及隋唐长期的发展,无论是题材或人物的描写,文言小说都有明显的进步,形成笔记与传奇两种小说类型。而白话小说则起源于唐宋时期说话人的话本,故事的取材来自民间,主要表现了百姓的生活及思想意识。但不管文言小说或白话小说都源远流长,呈现各自不同的艺术特色。"
    ]
    wv = Word2Vec(vec_len=500)  # vector length 500
    wv.Train_Model(data)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')
    # Normalize the saved vectors after training.
    data = FI.load_pickle('./static/wv.pkl')
    x = {}
    for key in data:
        temp = data[key]['vector']
        temp = preprocessing.normalize(temp)
        x[key] = temp
    FI.save_pickle(x, './static/normal_wv.pkl')
    x = FI.load_pickle('./static/normal_wv.pkl')

    # Similarity between two words (dot product of L2-normalized vectors).
    def cal_simi(data, key1, key2):
        return data[key1].dot(data[key2].T)[0][0]
def post(self):
    """Handle a POST carrying 'user_basic_info' and 'user_attends'.

    Persists the attends rows, the user's basic info and the attention-graph
    edges into MySQL; any step that fails falls back to pickling its payload
    under temp/ so the crawled data is not lost.
    """
    try:
        user_basic_info = self.get_argument('user_basic_info')
        attends = self.get_argument('user_attends')
        # SECURITY NOTE(review): eval() on client-supplied strings executes
        # arbitrary code; this should become ast.literal_eval()/json.loads().
        user_basic_info = eval(user_basic_info)
        attends = eval(attends)
        self.write('success to return user info')
        self.finish()
    except Exception:  # was bare `except:` — don't trap SystemExit/KeyboardInterrupt
        self.write('fail to return user info')
        self.finish()
        return
    try:
        dbi = MySQL_Interface()
    except Exception:  # was bare `except:`; a failed connect surfaces below and is pickled
        print('unable to connect to MySql DB')
    try:
        if len(attends) > 0:  # store attends info
            table_name = 'cache_attends'
            attends_col_info = dbi.get_col_name(table_name)
            keys = attends[0].keys()
            # Re-order every attend dict into the table's column order,
            # blank-filling columns the crawler did not provide.
            attends = [[
                line[i] if i in keys else '' for i in attends_col_info
            ] for line in attends]
            fans_col_pos = attends_col_info.index('fans_num')
            # Only cache accounts with more than 1000 fans.
            insert_attends = [
                line for line in attends if line[fans_col_pos] > 1000
            ]
            dbi.insert_asList(table_name, insert_attends, unique=True)
            print('Success : attends of {uid} is stored in {tname}'.format(
                uid=user_basic_info['uid'], tname=table_name))
    except Exception as e:
        print(e)
        path = "temp" + os.sep + "{uid}_attends.pkl".format(
            uid=user_basic_info['uid'])
        print(
            'unable to store attends of {uid}, it will be stored '.format(
                uid=user_basic_info['uid']))
        FI.save_pickle(attends, path)
    try:
        atten_num_real = user_basic_info['attends_num']  # declared follow count (currently unused)
        atten_num_get = len(attends)  # rows actually crawled
        # NOTE(review): 'accuracy' stores the raw crawled count, not a ratio
        # of atten_num_get / atten_num_real — confirm that is intended.
        user_basic_info['accuracy'] = atten_num_get
        col_info = dbi.get_col_name(
            'cache_user_info')  # store user basic info
        keys = user_basic_info.keys()
        data = [user_basic_info[i] if i in keys else '' for i in col_info]
        dbi.insert_asList('cache_user_info', [data], unique=True)
        print('Success : basic info of {uid} is stored in cache_user_info'.
              format(uid=user_basic_info['uid']))
    except Exception as e:
        print(e)
        path = 'temp' + os.sep + '{uid}_basic_info.pkl'.format(
            uid=user_basic_info['uid'])
        print('unable to store basic info of {uid} , it will be stored'.
              format(uid=user_basic_info['uid']))
        FI.save_pickle(user_basic_info, path)
    try:
        if len(attends) > 0:  # store atten connection web
            from_uid = user_basic_info['uid']
            from_fans_num = user_basic_info['fans_num']
            from_blog_num = user_basic_info['blog_num']
            data = [[
                from_uid, from_fans_num, from_blog_num,
                str(x[attends_col_info.index('uid')]),
                str(x[attends_col_info.index('fans_num')]),
                str(x[attends_col_info.index('blog_num')])
            ] for x in attends]
            dbi.insert_asList('cache_atten_web', data)
            print(
                'Success : conn web of {uid} is stored in cache_atten_web'.
                format(uid=user_basic_info['uid']))
    except Exception as e:
        print(e)
        path = '{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
        print('unable to store atten web of {uid} , it will be stored'.
              format(uid=user_basic_info['uid']))
        FI.save_pickle(data, path)
# ---- tail of read_content_in_mongo(); its `def` line and earlier branches are
# outside this chunk. The branches pick ascending/descending sort, with or
# without a field projection dict `f` (indentation below is reconstructed). ----
            res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.ASCENDING)]
        else:
            res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.DESCENDING)]
    else:
        # a field list was given: build a Mongo projection dict
        f={}
        for item in field:
            f[item]=1
        if sort=='':
            res=[pop_id(x) for x in collection.find(select,f).limit(limit)]
        else:
            if sort_type=='up':
                res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.ASCENDING)]
            else:
                res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.DESCENDING)]
    return res


def pop_id(data):
    # Drop Mongo's ObjectId so the document is picklable/printable.
    data.pop('_id')
    return data


# Script: fetch one user's history, append it to demo.pkl and dump it.
res=read_content_in_mongo('latest_history',{'user_id':'1681029540'},['dealed_text.left_content','created_at','user_name'],-1,'id','down')
data = FI.load_pickle('demo.pkl')
data = data + res
FI.save_pickle(data,'demo.pkl')
for line in res:
    print(line)
print(res.__len__())
# ---- tail of read_content_in_mongo(); its `def` line and earlier branches are
# outside this chunk (indentation below is reconstructed) ----
            if sort_type == 'up':
                res = [
                    pop_id(x) for x in collection.find(select, f).limit(limit).sort(
                        sort, pymongo.ASCENDING)
                ]
            else:
                res = [
                    pop_id(x) for x in collection.find(select, f).limit(limit).sort(
                        sort, pymongo.DESCENDING)
                ]
    return res


def pop_id(data):
    # Drop Mongo's ObjectId so the document is picklable/printable.
    data.pop('_id')
    return data


# Script: fetch one user's history, append it to demo.pkl and dump it.
res = read_content_in_mongo(
    'latest_history', {'user_id': '1681029540'},
    ['dealed_text.left_content', 'created_at', 'user_name'], -1, 'id', 'down')
data = FI.load_pickle('demo.pkl')
data = data + res
FI.save_pickle(data, 'demo.pkl')
for line in res:
    print(line)
print(res.__len__())
# ---- tail of a top-N statistics helper (its `def` line is outside this chunk;
# indentation of the fragment is reconstructed) ----
        return []  # empty-result guard from the enclosing branch
    if ret=='dict':
        ret_data = {}
        # materialize the (element, count) pairs into a mapping
        for ele,count in temp[:high]:
            ret_data[ele]=count
        return ret_data
    else:
        return temp[:high]


if __name__ == '__main__':
    # Demo driver: train a Word2Vec model on pickled sample sentences.
    text = FI.load_pickle('./static/demo.pkl')
    text =[ x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict,'./static/wv.pkl')
    # # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')
    # x = FI.load_pickle('./static/normal_wv.pkl')
    # def cal_simi(data,key1,key2):
    #     return data[key1].dot(data[key2].T)[0][0]
    # keys=list(x.keys())
    # for key in keys:
    #     print(key,'\t',cal_simi(x,'姚明',key))
def generate_node_and_edge():
    """Aggregate every pickled mail under .\\static into social-graph tables.

    node_list maps a lower-cased address to counts of how often it appears as
    sender / receiver / CC (plus the original-case address and, when present,
    a display name).  edge_list maps 'sender|----->recipient' to
    [sender, recipient, weight].  Both tables are pickled to .\\temp_res.
    """
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}

    def _touch_node(mail, ori_mail, role, name=None):
        # Increment `role` ('send'/'receive'/'cc') for `mail`, creating the
        # node on first sight; refresh the display name whenever one is given.
        node = node_list.get(mail)
        if node:
            node[role] += 1
        else:
            node_list[mail] = dict(mail=mail, send=0, receive=0, cc=0,
                                   ori_mail=ori_mail)
            node_list[mail][role] = 1
        if name:
            node_list[mail]['name'] = name

    def _touch_edge(sender_mail, mail):
        # Create the sender -> mail edge, or bump its weight if it exists.
        edge_key = sender_mail + '|----->' + mail
        edge = edge_list.get(edge_key)
        if edge:
            edge[2] += 1
        else:
            edge_list[edge_key] = [sender_mail, mail, 1]

    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # Handle the From field.
        try:
            ori_sender = info['From']['mail']
            sender_mail = ori_sender.lower()
        except (KeyError, TypeError, AttributeError):
            # was a bare `except:` — a mail without a usable From is discarded
            continue
        _touch_node(sender_mail, ori_sender, 'send', info.get('name'))
        # Handle the To field.
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            _touch_node(mail.lower(), mail, 'receive', receiver.get('name'))
            _touch_edge(sender_mail, mail.lower())
        # Handle the CC field.
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                _touch_node(mail.lower(), mail, 'cc', cc.get('name'))
                _touch_edge(sender_mail, mail.lower())
        print('{id} is dealed'.format(id=path))
    FI.save_pickle(node_list, '.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list, '.\\temp_res\\edge.pkl')
# ---- body of a batch-processing loop: the loop header plus t1, t2, count,
# wfts, data and stop_words are defined before this chunk (indentation and the
# loop boundary are reconstructed) ----
    count += 1
    text = [x['dealed_text']['left_content'][0] for x in data]
    date = [x['created_timestamp'] for x in data]
    cutted_text = []  # NOTE(review): assigned but never used below
    for i in range(text.__len__()):
        s = list(jieba.cut(text[i]))
        # walk backwards so pop() does not shift the part not yet visited
        for i in range(s.__len__())[::-1]:
            word = s[i]
            if word in stop_words:
                s.pop(i)
        # NOTE(review): the inner loop reuses `i`, so at this point `i` is the
        # inner index (0 for any non-empty sentence) and date[i] is NOT the
        # timestamp of the current sentence — looks like a shadowing bug;
        # confirm intent.
        wfts.Add_Sentence_With_Timestamp(s, date[i])
    t3 = time.time()
    print('{x} is completed\t{t1}\t{t2}'.format(x=count, t1=t2 - t1, t2=t3 - t2))
    # checkpoint the accumulator (placement relative to the outer loop reconstructed)
    FI.save_pickle(wfts, './static/wfts_continue.pkl')

wfts = FI.load_pickle('./static/wfts_continue.pkl')
# for item in word_item_list:
#     print('{a}\t{b}'.format(a=item.word,b=item.total_freq))
# plt.plot([math.log(x.total_freq) for x in word_item_list])
# plt.show()
top_asDay = wfts.top_asDay  # per-day frequency-statistics objects
for item in top_asDay:
    print('------------------------------')
    print(time.strftime("%Y-%m-%d", time.localtime(item['time'])))
    print(item['obj'].topN(20))
# wfts = FI.load_pickle('./static/wfts_1w.pkl')
# word_item_list = wfts.word_statistic.values()
# word_item_list = sorted(word_item_list, key=lambda x:x.total_freq,reverse=True)
def generate_node_and_edge():
    """Aggregate every pickled mail under .\\static into social-graph tables.

    node_list maps a lower-cased address to counts of how often it appears as
    sender / receiver / CC; edge_list maps 'sender|----->recipient' to
    [sender, recipient, weight].  Both tables are pickled to .\\temp_res.
    """
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}
    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # print(info)
        # Handle the From field.
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        except:
            continue  # a mail without a usable From header is discarded
        sender_node = node_list.get(sender_mail)
        if sender_node:
            sender_node['send'] += 1
        else:
            node_list[sender_mail] = dict(
                mail = sender_mail,
                send = 1,
                receive = 0,
                cc = 0,
                ori_mail = ori_mail
            )
        if info.get('name'):
            node_list[sender_mail]['name'] = info['name']
        # Handle the To field.
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            ori_mail = mail
            mail = mail.lower()
            node = node_list.get(mail)
            if node:
                node['receive'] += 1
            else:
                node_list[mail] = dict(
                    mail = mail,
                    send = 0 ,
                    receive = 1,
                    cc = 0,
                    ori_mail = ori_mail
                )
            if receiver.get('name'):
                node_list[mail]['name'] = receiver['name']
            edge_key = sender_mail + '|----->' + mail
            edge = edge_list.get(edge_key)
            if edge:
                edge[2] += 1  # the edge already exists: bump its weight
            else:
                edge_list[edge_key] = [sender_mail,mail,1]  # first sighting: create the edge
        # Handle the CC field.
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                ori_mail = mail
                mail = mail.lower()
                node = node_list.get(mail)
                if node:
                    node['cc'] += 1
                else:
                    node_list[mail] = dict(
                        mail = mail,
                        send = 0 ,
                        receive = 0 ,
                        cc = 1,
                        ori_mail = ori_mail
                    )
                if cc.get('name'):
                    node_list[mail]['name'] = cc['name']
                edge_key = sender_mail + '|----->' + mail
                edge = edge_list.get(edge_key)
                if edge:
                    edge[2] += 1  # the edge already exists: bump its weight
                else:
                    edge_list[edge_key] = [sender_mail,mail,1]  # first sighting: create the edge
        print('{id} is dealed'.format(id=path))
    FI.save_pickle(node_list,'.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list,'.\\temp_res\\edge.pkl')
# ---- body of a batch-processing loop: the loop header plus t1, count, wfts
# and stop_words are defined before this chunk (indentation and the loop
# boundary are reconstructed) ----
    data = [x for x in data]  # materialize the cursor/iterable into a list
    t2 = time.time()
    count += 1
    text = [x['dealed_text']['left_content'][0] for x in data]
    date = [x['created_timestamp'] for x in data]
    cutted_text = []  # NOTE(review): assigned but never used below
    for i in range(text.__len__()):
        s = list(jieba.cut(text[i]))
        # walk backwards so pop() does not shift the part not yet visited
        for i in range(s.__len__())[::-1]:
            word = s[i]
            if word in stop_words:
                s.pop(i)
        # NOTE(review): the inner loop reuses `i`, so at this point `i` is the
        # inner index (0 for any non-empty sentence) and date[i] is NOT the
        # timestamp of the current sentence — looks like a shadowing bug;
        # confirm intent.
        wfts.Add_Sentence_With_Timestamp(s,date[i])
    t3 = time.time()
    print('{x} is completed\t{t1}\t{t2}'.format(x = count,t1=t2-t1,t2=t3-t2))
    # checkpoint the accumulator (placement relative to the outer loop reconstructed)
    FI.save_pickle(wfts,'./static/wfts_continue.pkl')

wfts = FI.load_pickle('./static/wfts_continue.pkl')
# for item in word_item_list:
#     print('{a}\t{b}'.format(a=item.word,b=item.total_freq))
# plt.plot([math.log(x.total_freq) for x in word_item_list])
# plt.show()
top_asDay = wfts.top_asDay  # per-day frequency-statistics objects
for item in top_asDay:
    print('------------------------------')
    print(time.strftime("%Y-%m-%d",time.localtime(item['time'])))
    print(item['obj'].topN(20))
# wfts = FI.load_pickle('./static/wfts_1w.pkl')
# word_item_list = wfts.word_statistic.values()
# word_item_list = sorted(word_item_list, key=lambda x:x.total_freq,reverse=True)
# Download e-mail contents from wikileaks and build structured data.
# =============== ATTENTION =============== :
# The mail formats vary a lot, so roughly 5% of them cannot be parsed.
base_url = 'https://wikileaks.org/dnc-emails/get/{page}'
base_path = '.\static\{page}.pkl'
# ids already on disk, parsed from the pickle file names
gotten_id = [int(name.split('.')[0]) for name in os.listdir('.\static')]
task_pool = list(range(1, 2000))  # mail ids on wikileaks
while task_pool:
    task_id = task_pool.pop(0)
    if task_id in gotten_id:
        print('{id} skip'.format(id=task_id))
        continue
    url = base_url.format(page=task_id)
    path = base_path.format(page=task_id)
    try:
        FI.save_pickle(getStructedData(url), path)
        print('{t} succeed'.format(t=task_id))
    except Exception as err:
        # task_pool.append(task_id)
        print('{t} failed <--<--<--<--'.format(t=task_id))
        print(err)
print('文件已下载完毕')
# Build the social-network data (requires the networkx package).
generate_data_for_gelphi()
print('gexf文件已经生成,存放路径: {path}\\temp_res'.format(path=os.getcwd()))