def Export_Model(self, model_path):
    data = dict(word_dict=self.word_dict,
                huffman=self.huffman,
                vec_len=self.vec_len,
                learn_rate=self.learn_rate,
                win_len=self.win_len,
                model=self.model)
    FI.save_pickle(data, model_path)
def generate_data_for_gelphi():
    # build the node/edge pickles, then export a graph that Gephi can read
    generate_node_and_edge()
    nodes = FI.load_pickle('.\\temp_res\\node.pkl')
    edges = FI.load_pickle('.\\temp_res\\edge.pkl')
    nodes = list(nodes.values())
    edges = list(edges.values())
    nodes_mail = [x['mail'] for x in nodes]
    # edges = [tuple(x) for x in edges]
    G = nx.Graph()
    G.add_nodes_from(nodes_mail)
    G.add_weighted_edges_from(edges)
    nx.write_gexf(G, '.\\temp_res\\data.gexf')
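# A minimal sanity check for the export above -- a sketch, assuming
# generate_data_for_gelphi() has already written the .gexf file:
import networkx as nx

G = nx.read_gexf('.\\temp_res\\data.gexf')
print(G.number_of_nodes(), G.number_of_edges())
# heaviest edges first, i.e. the most frequent sender -> receiver pairs
top = sorted(G.edges(data='weight'), key=lambda e: e[2] or 0, reverse=True)[:10]
for u, v, w in top:
    print(u, '->', v, w)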
def Load_Word_Freq(self, word_freq_path):
    # load pre-computed word frequency info and build the word dict from it
    if self.word_dict is not None:
        raise RuntimeError('the word dict is not empty')
    word_freq = FI.load_pickle(word_freq_path)
    self.__Gnerate_Word_Dict(word_freq)
def Import_Model(self, model_path):
    # load a previously exported word2vec model from disk
    model = FI.load_pickle(model_path)  # stored as a dict: {'word_dict','huffman','vec_len',...}
    self.word_dict = model['word_dict']
    self.huffman = model['huffman']
    self.vec_len = model['vec_len']
    self.learn_rate = model['learn_rate']
    self.win_len = model['win_len']
    self.model = model['model']
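# A round-trip sketch for Export_Model / Import_Model above -- illustrative
# only; `wv` is assumed to be an already trained Word2Vec instance and the
# path is a placeholder:
wv.Export_Model('./model/wv_model.pkl')

wv2 = Word2Vec(vec_len=500)
wv2.Import_Model('./model/wv_model.pkl')
assert wv2.vec_len == wv.vec_len
assert set(wv2.word_dict.keys()) == set(wv.word_dict.keys())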
    return e

def __Sigmoid(self, value):
    return 1 / (1 + math.exp(-value))

if __name__ == '__main__':
    # text = FI.load_pickle('./static/demo.pkl')
    # text = [x['dealed_text']['left_content'][0] for x in text]
    # # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    # wv = Word2Vec(vec_len=500)
    # wv.Train_Model(text)
    # FI.save_pickle(wv.word_dict,'./static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    x = FI.load_pickle('./static/normal_wv.pkl')

    def cal_simi(data, key1, key2):
        # vectors are L2-normalized, so a dot product equals cosine similarity
        return data[key1].dot(data[key2].T)[0][0]

    keys = list(x.keys())
    for key in keys:
        print(key, '\t', cal_simi(x, '小说', key))
    else:
        return temp[:high]

if __name__ == '__main__':
    # text = FI.load_pickle('./static/demo.pkl')
    # text = [x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']

    # input text: a passage on the history of the Chinese novel, used as the training corpus
    data = [
        "小说”一词最早出现于《庄子·外物》:「饰小说以干县令,其于大达亦远矣。」庄子所谓的「小说」,是指琐碎的言论,与今日小说观念相差甚远。直至东汉桓谭《新论》:「小说家合残丛小语,近取譬喻,以作短书,治身理家,有可观之辞。」班固《汉书.艺文志》将「小说家」列为十家之后,其下的定义为:「小说家者流,盖出于稗官,街谈巷语,道听途说[4]之所造也。」才稍与今日小说的意义相近。而中国小说最大的特色,便自宋代开始具有文言小说与白话小说两种不同的小说系统。文言小说起源于先秦的街谈巷语,是一种小知小道的纪录。在历经魏晋南北朝及隋唐长期的发展,无论是题材或人物的描写,文言小说都有明显的进步,形成笔记与传奇两种小说类型。而白话小说则起源于唐宋时期说话人的话本,故事的取材来自民间,主要表现了百姓的生活及思想意识。但不管文言小说或白话小说都源远流长,呈现各自不同的艺术特色。"
    ]
    wv = Word2Vec(vec_len=500)  # vector length is 500
    wv.Train_Model(data)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')

    # normalize the saved vectors so that dot products become cosine similarities
    data = FI.load_pickle('./static/wv.pkl')
    x = {}
    for key in data:
        temp = data[key]['vector']
        temp = preprocessing.normalize(temp)
        x[key] = temp
    FI.save_pickle(x, './static/normal_wv.pkl')
    x = FI.load_pickle('./static/normal_wv.pkl')

    # compute the similarity between two words
    def cal_simi(data, key1, key2):
        return data[key1].dot(data[key2].T)[0][0]
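# A follow-up sketch using cal_simi above: rank the vocabulary by similarity
# to one query word (illustrative; the query must be in the trained dict):
query = '小说'
ranked = sorted(x.keys(), key=lambda k: cal_simi(x, query, k), reverse=True)
for key in ranked[:10]:
    print(key, '\t', cal_simi(x, query, key))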
    if ret == 'dict':
        ret_data = {}
        for ele, count in temp[:high]:
            ret_data[ele] = count
        return ret_data
    else:
        return temp[:high]

if __name__ == '__main__':
    """
    stop_words = open(u"./static/中文停用词表(比较全面,有1208个停用词).txt")
    list1 = []
    for line in stop_words:
        list1.append(line.strip())
    print(len(list1))
    pickle.dump(list1, open("./static/stop_words.pkl", "wb"), protocol=2)
    """
    text = FI.load_pickle("./static/stop_words.pkl")  # a list of stop words
    for x in text:
        print(x)  # originally `print x.decode("gbk")` under Python 2
        break
    data = ['Merge multiple sorted inputs into a single sorted output',
            'The API below differs from textbook heap algorithms in two aspects']
    wc = WordCounter(data)
    print(wc.count_res.larger_than(16))
    """
    c = MulCounter('abcdeabcdaffbcabag')
    print(sorted(c.items(), key=_itemgetter(1), reverse=True))  # operator.itemgetter(1)
    print(c.larger_than(3))
    """
    pass
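# For comparison -- a sketch using the standard-library Counter, which gives
# the same raw counts that MulCounter.larger_than filters on:
from collections import Counter

c = Counter('abcdeabcdaffbcabag')
print([ele for ele, cnt in c.most_common() if cnt >= 3])  # -> ['a', 'b', 'c']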
__author__ = 'multiangle'

import File_Interface as FI
import json
import os
import time

paths = os.listdir('.\static')
date_counter = {}
for path in paths:
    data = FI.load_pickle('.\static\{x}'.format(x=path))
    date_ori = data.get('Date')
    if date_ori:
        tag = time.strftime('%Y/%m/%d', date_ori)
        c = date_counter.get(tag)
        print(tag)
        if c:
            date_counter[tag]['count'] += 1
        else:
            date_counter[tag] = dict(date=tag, count=1)
# date_counter = list(date_counter.values())
# date_counter = sorted(date_counter, key=lambda x: x['date'])
# for date in date_counter:
#     print('{a}\t{b}'.format(a=date['date'], b=date['count']))
def __GoAlong_Huffman(self, word_huffman, input_vector, root):
    # walk down the Huffman tree along the word's code, doing one
    # logistic-regression step per inner node; e accumulates the error
    # that is later fed back into the input word vectors
    node = root
    e = np.zeros([1, self.vec_len])
    for level in range(word_huffman.__len__()):
        huffman_charat = word_huffman[level]
        q = self.__Sigmoid(input_vector.dot(node.value.T))
        grad = self.learn_rate * (1 - int(huffman_charat) - q)
        e += grad * node.value
        node.value += grad * input_vector
        node.value = preprocessing.normalize(node.value)
        if huffman_charat == '0':
            node = node.right
        else:
            node = node.left
    return e

def __Sigmoid(self, value):
    return 1 / (1 + math.exp(-value))

if __name__ == '__main__':
    text = FI.load_pickle('./static/stop_words.pkl')
    text = [x for x in text]
    data = [
        'Merge multiple sorted inputs into a single sorted output',
        'The API below differs from textbook heap algorithms in two aspects'
    ]
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')
    pass
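# A standalone numeric sketch of the per-node update above (illustrative, not
# part of the original file). Each step is a logistic-regression update with
# label (1 - huffman_charat) and prediction q:
import math
import numpy as np

def _sigmoid(value):
    return 1 / (1 + math.exp(-value))

learn_rate = 0.025               # assumed value, for illustration only
v = np.full((1, 4), 0.1)         # stand-in for input_vector
theta = np.full((1, 4), 0.2)     # stand-in for node.value
q = _sigmoid(float(v.dot(theta.T)))
grad = learn_rate * (1 - 0 - q)  # huffman_charat == '0'
theta = theta + grad * v         # same direction as the node.value update
print(q, grad)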
            break
        if list[mid]['time'] < value:
            low = mid + 1
        else:
            high = mid - 1
    if i < 0:
        i = low
    if i == list.__len__():
        list.append({'time': value, 'freq': 1})
    elif list[i]['time'] == value:
        list[i]['freq'] += 1
    else:
        list.insert(i, {'time': value, 'freq': 1})

if __name__ == '__main__':
    stop_words = FI.load_pickle('./static/stop_words.pkl')
    wfts = FI.load_pickle('./static/wfts_continue.pkl')
    client = MongoClient('localhost', 27017)
    db = client['microblog_spider']
    latest_history = db.latest_history
    count = 0
    data = []
    batch_size = 100
    gone_size = 40000
    while count < 100:
        t1 = time.time()
        data = latest_history.find().skip(gone_size + count * batch_size).limit(batch_size)
        data = [x for x in data]
        t2 = time.time()
        count += 1
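# An equivalent sketch of the insert-or-increment above using the standard
# bisect module on a parallel list of times (an alternative formulation, not
# the repo's code; `freq_list` is a stand-in for the sorted list argument):
import bisect

def add_time(freq_list, value):
    times = [item['time'] for item in freq_list]
    i = bisect.bisect_left(times, value)
    if i < len(freq_list) and freq_list[i]['time'] == value:
        freq_list[i]['freq'] += 1
    else:
        freq_list.insert(i, {'time': value, 'freq': 1})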
__author__ = 'multiangle'

import jieba
import File_Interface as FI

data = FI.load_pickle('demo.pkl')
user_list = [x['user_name'] for x in data]
text_list = [x['dealed_text']['left_content'] for x in data]
for line in text_list:
    print(line)
    res = jieba.cut(line[0], cut_all=False)  # precise-mode segmentation; returns a generator
    # print(list(seg_list))
    res = list(res)
    print(res)
            high = mid
    if temp[low][1] > maxvalue:
        if ret == 'dict':
            return {}
        else:
            return []
    if ret == 'dict':
        ret_data = {}
        for ele, count in temp[:high]:
            ret_data[ele] = count
        return ret_data
    else:
        return temp[:high]

if __name__ == '__main__':
    text = FI.load_pickle('./static/demo.pkl')
    text = [x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')

    # # normalize the vectors after saving
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x, './static/normal_wv.pkl')
    # x = FI.load_pickle('./static/normal_wv.pkl')
    for level in range(word_huffman.__len__()):
        huffman_charat = word_huffman[level]
        q = self.__Sigmoid(input_vector.dot(node.value.T))
        grad = self.learn_rate * (1 - int(huffman_charat) - q)
        e += grad * node.value
        node.value += grad * input_vector
        node.value = preprocessing.normalize(node.value)
        if huffman_charat == '0':
            node = node.right
        else:
            node = node.left
    return e

def __Sigmoid(self, value):
    return 1 / (1 + math.exp(-value))

if __name__ == '__main__':
    import WordCount
    # text = WordCount.readfile('./static/text8_mini')
    # text = WordCount.readfile('./static/text8')
    # mv = Word2Vec(vec_len=500)
    # mv.Train_Model(text)
    # FI.save_pickle(mv.word_dict, './model/model_h.pkl')
    model_h = FI.load_pickle('./model/model_h.pkl')
    keys = list(model_h.keys())
    print(keys.__len__())
    print(keys[0])
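# A quick inspection sketch for the loaded model (assumes each entry of
# model_h is a dict with a 'vector' field, as in the demos elsewhere in this
# repo):
vec = model_h[keys[0]]['vector']
print(vec.shape)  # expected shape: (1, vec_len)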
def Get_Stop_Words(self):
    # load the pre-pickled stop-word list
    return FI.load_pickle('./static/stop_words.pkl')
def generate_node_and_edge():
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}
    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # print(info)

        # handle the From field
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        except:
            continue  # discard the mail if it has no From field
        sender_node = node_list.get(sender_mail)
        if sender_node:
            sender_node['send'] += 1
        else:
            node_list[sender_mail] = dict(mail=sender_mail, send=1, receive=0,
                                          cc=0, ori_mail=ori_mail)
        if info.get('name'):
            node_list[sender_mail]['name'] = info['name']

        # handle the To field
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            ori_mail = mail
            mail = mail.lower()
            node = node_list.get(mail)
            if node:
                node['receive'] += 1
            else:
                node_list[mail] = dict(mail=mail, send=0, receive=1,
                                       cc=0, ori_mail=ori_mail)
            if receiver.get('name'):
                node_list[mail]['name'] = receiver['name']
            edge_key = sender_mail + '|----->' + mail
            edge = edge_list.get(edge_key)
            if edge:
                edge[2] += 1  # the edge already exists: bump its weight
            else:
                edge_list[edge_key] = [sender_mail, mail, 1]  # otherwise add a new edge

        # handle the CC field
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                ori_mail = mail
                mail = mail.lower()
                node = node_list.get(mail)
                if node:
                    node['cc'] += 1
                else:
                    node_list[mail] = dict(mail=mail, send=0, receive=0,
                                           cc=1, ori_mail=ori_mail)
                if cc.get('name'):
                    node_list[mail]['name'] = cc['name']
                edge_key = sender_mail + '|----->' + mail
                edge = edge_list.get(edge_key)
                if edge:
                    edge[2] += 1  # the edge already exists: bump its weight
                else:
                    edge_list[edge_key] = [sender_mail, mail, 1]  # otherwise add a new edge
        print('{id} is processed'.format(id=path))
    FI.save_pickle(node_list, '.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list, '.\\temp_res\\edge.pkl')
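# A quick inspection sketch for the pickles written above (illustrative):
# each node is {'mail', 'send', 'receive', 'cc', 'ori_mail'[, 'name']} and
# each edge is [sender_mail, receiver_mail, weight].
nodes = FI.load_pickle('.\\temp_res\\node.pkl')
edges = FI.load_pickle('.\\temp_res\\edge.pkl')
busiest = sorted(nodes.values(), key=lambda n: n['send'] + n['receive'], reverse=True)[:5]
for n in busiest:
    print(n['mail'], n['send'], n['receive'])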
#         item = info_dict[id]
#         if item.hasPre:
#             changed = True
#             if item.data['retweeted_status']['id'] in info_id_list:
#                 info_dict[item.data['retweeted_status']['id']].add_Next(item)
#                 head_id_list.remove(id)
#             else:
#                 pre_node = SNA_node.build_Head_Node(item.data['retweeted_status'])
#                 pre_node.add_Next(item)
#                 info_dict[pre_node.id] = pre_node
#                 info_id_list.append(pre_node.id)
#                 head_id_list.append(pre_node.id)
#                 head_id_list.remove(id)
#     print(ite_times)
#     ite_times += 1
# print(head_id_list.__len__())
# FI.save_pickle(info_dict, './static/dealed_info_list.pkl')

# # analysis
# data = FI.load_pickle('./static/dealed_info_list.pkl')
# id_list = list(data.keys())
# for id in id_list:
#     data[id].analyse()
# FI.save_pickle(data, './static/analyse_info_list.pkl')

data = FI.load_pickle('./static/analyse_info_list.pkl')
id_list = list(data.keys())
for id in id_list:
    if data[id].result['retweet_num'] > 1:
        print(data[id])
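# A follow-up sketch: rank the analysed nodes by retweet count instead of just
# filtering (assumes analyse() filled result['retweet_num'] as above):
ranked = sorted(data.values(), key=lambda n: n.result['retweet_num'], reverse=True)
for node in ranked[:10]:
    print(node)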
def post(self):
    try:
        user_basic_info = self.get_argument('user_basic_info')
        attends = self.get_argument('user_attends')
        # NOTE: eval on request arguments assumes a trusted client
        user_basic_info = eval(user_basic_info)
        attends = eval(attends)
        self.write('success to return user info')
        self.finish()
    except:
        self.write('fail to return user info')
        self.finish()
        return
    try:
        dbi = MySQL_Interface()
    except:
        print('unable to connect to MySql DB')
    try:
        if attends.__len__() > 0:  # store attends info
            table_name = 'cache_attends'
            attends_col_info = dbi.get_col_name(table_name)
            keys = attends[0].keys()
            attends = [[line[i] if i in keys else '' for i in attends_col_info]
                       for line in attends]
            fans_col_pos = attends_col_info.index('fans_num')
            insert_attends = []
            for line in attends:
                if line[fans_col_pos] > 1000:
                    insert_attends.append(line)
            dbi.insert_asList(table_name, insert_attends, unique=True)
            print('Success : attends of {uid} is stored in {tname}'
                  .format(uid=user_basic_info['uid'], tname=table_name))
        else:
            pass
    except Exception as e:
        print(e)
        path = "temp" + os.sep + "{uid}_attends.pkl".format(uid=user_basic_info['uid'])
        print('unable to store attends of {uid}, it will be stored '
              .format(uid=user_basic_info['uid']))
        FI.save_pickle(attends, path)
    try:
        atten_num_real = user_basic_info['attends_num']
        atten_num_get = attends.__len__()
        user_basic_info['accuracy'] = atten_num_get  # number of attends actually fetched
        col_info = dbi.get_col_name('cache_user_info')  # store user basic info
        keys = user_basic_info.keys()
        data = [user_basic_info[i] if i in keys else '' for i in col_info]
        dbi.insert_asList('cache_user_info', [data], unique=True)
        print('Success : basic info of {uid} is stored in cache_user_info'
              .format(uid=user_basic_info['uid']))
    except Exception as e:
        print(e)
        path = 'temp' + os.sep + '{uid}_basic_info.pkl'.format(uid=user_basic_info['uid'])
        print('unable to store basic info of {uid} , it will be stored'
              .format(uid=user_basic_info['uid']))
        FI.save_pickle(user_basic_info, path)
    try:
        if attends.__len__() > 0:  # store atten connection web
            from_uid = user_basic_info['uid']
            from_fans_num = user_basic_info['fans_num']
            from_blog_num = user_basic_info['blog_num']
            data = [[from_uid, from_fans_num, from_blog_num,
                     str(x[attends_col_info.index('uid')]),
                     str(x[attends_col_info.index('fans_num')]),
                     str(x[attends_col_info.index('blog_num')])]
                    for x in attends]
            dbi.insert_asList('cache_atten_web', data)
            print('Success : conn web of {uid} is stored in cache_atten_web'
                  .format(uid=user_basic_info['uid']))
        else:
            pass
    except Exception as e:
        print(e)
        path = '{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
        print('unable to store atten web of {uid} , it will be stored'
              .format(uid=user_basic_info['uid']))
        FI.save_pickle(data, path)
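# A client-side sketch for exercising the handler above, assuming it is mapped
# to /user_info on a local tornado server (URL, port and field values are all
# illustrative). The handler evals its arguments, so the payload is the repr
# of a dict / list:
import requests

payload = {
    'user_basic_info': repr({'uid': '123', 'attends_num': 1, 'fans_num': 10, 'blog_num': 5}),
    'user_attends': repr([{'uid': '456', 'fans_num': 2000, 'blog_num': 7}]),
}
r = requests.post('http://localhost:8888/user_info', data=payload)
print(r.text)  # expected: 'success to return user info'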
            res = [pop_id(x) for x in
                   collection.find(select).limit(limit).sort(sort, pymongo.ASCENDING)]
        else:
            res = [pop_id(x) for x in
                   collection.find(select).limit(limit).sort(sort, pymongo.DESCENDING)]
    else:
        # restrict the returned fields via a projection dict
        f = {}
        for item in field:
            f[item] = 1
        if sort == '':
            res = [pop_id(x) for x in collection.find(select, f).limit(limit)]
        else:
            if sort_type == 'up':
                res = [pop_id(x) for x in
                       collection.find(select, f).limit(limit).sort(sort, pymongo.ASCENDING)]
            else:
                res = [pop_id(x) for x in
                       collection.find(select, f).limit(limit).sort(sort, pymongo.DESCENDING)]
    return res

def pop_id(data):
    # drop mongo's internal _id so the record can be pickled and printed cleanly
    data.pop('_id')
    return data

res = read_content_in_mongo('latest_history', {'user_id': '1681029540'},
                            ['dealed_text.left_content', 'created_at', 'user_name'],
                            -1, 'id', 'down')
data = FI.load_pickle('demo.pkl')
data = data + res
FI.save_pickle(data, 'demo.pkl')
for line in res:
    print(line)
print(res.__len__())
# download mail bodies from wikileaks and turn them into structured data
# =============== ATTENTION =============== :
# mail formats vary a lot, so a small share (about 5%) cannot be parsed
base_url = 'https://wikileaks.org/dnc-emails/get/{page}'
base_path = '.\static\{page}.pkl'
gotten_id = os.listdir('.\static')
gotten_id = [int((x.split('.'))[0]) for x in gotten_id]
task_pool = list(range(1, 2000))  # mail ids on wikileaks
while True:
    if task_pool.__len__() == 0:
        break
    task_id = task_pool.pop(0)
    if task_id in gotten_id:
        print('{id} skip'.format(id=task_id))
        continue
    url = base_url.format(page=task_id)
    path = base_path.format(page=task_id)
    try:
        info = getStructedData(url)
        FI.save_pickle(info, path)
        print('{t} succeed'.format(t=task_id))
    except Exception as e:
        # task_pool.append(task_id)
        print('{t} failed <--<--<--<--'.format(t=task_id))
        print(e)
print('all mails have been downloaded')

# build the social-network data (requires the networkx package)
generate_data_for_gelphi()
print('gexf file generated, saved at: {path}\\temp_res'.format(path=os.getcwd()))