def generate_data_for_gelphi():
    """Build a GEXF graph file for visualisation in Gephi.

    Regenerates the node/edge pickles via generate_node_and_edge(),
    loads them back, and writes an undirected weighted graph to
    .\\temp_res\\data.gexf.
    """
    # Refresh the pickled node/edge tables before building the graph.
    generate_node_and_edge()
    nodes = FI.load_pickle('.\\temp_res\\node.pkl')
    edges = FI.load_pickle('.\\temp_res\\edge.pkl')
    # Both pickles are dicts; only their values are needed here.
    nodes = list(nodes.values())
    edges = list(edges.values())
    # Graph nodes are identified by the mail address stored on each record.
    nodes_mail = [x['mail'] for x in nodes]
    # edges = [tuple(x) for x in edges]
    G = nx.Graph()
    G.add_nodes_from(nodes_mail)
    # Each edge record is [sender_mail, receiver_mail, weight].
    G.add_weighted_edges_from(edges)
    nx.write_gexf(G,'.\\temp_res\\data.gexf')
def generate_data_for_gelphi():
    """Regenerate node/edge pickles and export them as a Gephi GEXF file."""
    generate_node_and_edge()
    node_map = FI.load_pickle('.\\temp_res\\node.pkl')
    edge_map = FI.load_pickle('.\\temp_res\\edge.pkl')
    # Graph nodes are identified purely by their mail address.
    mails = [record['mail'] for record in node_map.values()]
    graph = nx.Graph()
    graph.add_nodes_from(mails)
    # Edge records are [sender, receiver, weight] triples.
    graph.add_weighted_edges_from(list(edge_map.values()))
    nx.write_gexf(graph, '.\\temp_res\\data.gexf')
def Load_Word_Freq(self, word_freq_path):
    """Load pickled word-frequency info and generate the word dict.

    Raises:
        RuntimeError: if self.word_dict is already populated, so an
            existing vocabulary is never silently overwritten.
    """
    # load the info of word frequence; will generate a word dict
    if self.word_dict is not None:
        raise RuntimeError('the word dict is not empty')
    word_freq = FI.load_pickle(word_freq_path)
    # __Gnerate_Word_Dict is the project's (misspelled) vocabulary builder.
    self.__Gnerate_Word_Dict(word_freq)
def Load_Word_Freq(self, word_freq_path):
    """Build the model's word dict from a pickled word-frequency file."""
    # A non-None word dict means a vocabulary already exists; refuse to
    # overwrite it.
    if self.word_dict is not None:
        raise RuntimeError('the word dict is not empty')
    freq_info = FI.load_pickle(word_freq_path)
    self.__Gnerate_Word_Dict(freq_info)
def Load_Word_Freq(self, word_freq_path):
    """Load word-frequency data and build the word dictionary from it."""
    # Load the word-frequency data.
    # After loading, a word dictionary is generated from it.
    if self.word_dict is not None:
        raise RuntimeError('the word dict is not empty')
    word_freq = FI.load_pickle(word_freq_path)
    self.__Gnerate_Word_Dict(word_freq)
def Import_Model(self, model_path):
    """Load a previously exported model pickle and restore its state.

    The pickle is a dict with keys 'word_dict', 'huffman', 'vec_len',
    'learn_rate', 'win_len' and 'model'.
    """
    model = FI.load_pickle(model_path)  # a dict, {'word_dict','huffman','vec_len'}
    # BUG FIX: the pickle is a plain dict (see the comment above), so it
    # must be read by subscription — attribute access (model.word_dict)
    # would raise AttributeError on a dict.
    self.word_dict = model['word_dict']
    self.huffman = model['huffman']
    self.vec_len = model['vec_len']
    self.learn_rate = model['learn_rate']
    self.win_len = model['win_len']
    self.model = model['model']
def Import_Model(self, model_path):
    """Restore model state from a pickled export.

    The pickle is a dict with keys 'word_dict', 'huffman', 'vec_len',
    'learn_rate', 'win_len' and 'model'.
    """
    model = FI.load_pickle(model_path)  # a dict, {'word_dict','huffman','vec_len'}
    # BUG FIX: read the dict by key — the original attribute access
    # (model.word_dict etc.) contradicts the dict layout documented above
    # and would raise AttributeError.
    self.word_dict = model['word_dict']
    self.huffman = model['huffman']
    self.vec_len = model['vec_len']
    self.learn_rate = model['learn_rate']
    self.win_len = model['win_len']
    self.model = model['model']
def Import_Model(self,model_path):
    """Read a pickled word2vec model from disk and restore its state."""
    # Read the word2vec model directly.
    model = FI.load_pickle(model_path)  # stored as a dict: {'word_dict','huffman','vec_len'}
    self.word_dict = model['word_dict']
    self.huffman = model['huffman']
    self.vec_len = model['vec_len']
    self.learn_rate = model['learn_rate']
    self.win_len = model['win_len']
    self.model = model['model']
grad = self.learn_rate * (1 - int(huffman_charat) - q) e += grad * node.value node.value += grad * input_vector node.value = preprocessing.normalize(node.value) if huffman_charat == '0': node = node.right else: node = node.left return e def __Sigmoid(self, value): return 1 / (1 + math.exp(-value)) if __name__ == '__main__': text = FI.load_pickle('./static/demo.pkl') text = [x['dealed_text']['left_content'][0] for x in text] #data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects'] wv = Word2Vec(vec_len=500) wv.Train_Model(text) FI.save_pickle(wv.word_dict, './static/wv.pkl') data = FI.load_pickle('./static/wv.pkl') x = {} for key in data: temp = data[key]['vector'] temp = preprocessing.normalize(temp) x[key] = temp FI.save_pickle(x, './static/normal_wv.pkl') # x = FI.load_pickle('./static/normal_wv.pkl')
def __GoAlong_Huffman(self, word_huffman, input_vector, root):
    """Walk the Huffman tree along the word's code, updating node vectors.

    For every inner node on the path, the node vector is nudged by the
    hierarchical-softmax gradient; the accumulated error e (the
    correction to apply to the input word vector) is returned.
    """
    node = root
    e = np.zeros([1, self.vec_len])  # accumulated gradient for the input vector
    for level in range(word_huffman.__len__()):
        huffman_charat = word_huffman[level]  # code bit: '0' -> go right, else left
        q = self.__Sigmoid(input_vector.dot(node.value.T))
        # Hierarchical-softmax gradient for this inner node.
        grad = self.learn_rate * (1 - int(huffman_charat) - q)
        e += grad * node.value
        node.value += grad * input_vector
        # Re-normalise the node vector after every update.
        node.value = preprocessing.normalize(node.value)
        if huffman_charat == '0':
            node = node.right
        else:
            node = node.left
    return e

def __Sigmoid(self, value):
    # Logistic function. NOTE(review): math.exp overflows for large
    # negative value — assumes inputs stay bounded; TODO confirm.
    return 1 / (1 + math.exp(-value))

if __name__ == '__main__':
    # Demo driver: train a 500-dim model on the pickled stop-word list.
    text = FI.load_pickle('./static/stop_words.pkl')
    text = [x for x in text]
    data = [
        'Merge multiple sorted inputs into a single sorted output',
        'The API below differs from textbook heap algorithms in two aspects'
    ]
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')
    pass
res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.ASCENDING)] else: res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.DESCENDING)] else: f={} for item in field: f[item]=1 if sort=='': res=[pop_id(x) for x in collection.find(select,f).limit(limit)] else: if sort_type=='up': res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.ASCENDING)] else: res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.DESCENDING)] return res def pop_id(data): data.pop('_id') return data res=read_content_in_mongo('latest_history',{'user_id':'1681029540'},['dealed_text.left_content','created_at','user_name'],-1,'id','down') data = FI.load_pickle('demo.pkl') data = data + res FI.save_pickle(data,'demo.pkl') for line in res: print(line) print(res.__len__())
for level in range(word_huffman.__len__()): huffman_charat = word_huffman[level] q = self.__Sigmoid(input_vector.dot(node.value.T)) grad = self.learn_rate * (1 - int(huffman_charat) - q) e += grad * node.value node.value += grad * input_vector node.value = preprocessing.normalize(node.value) if huffman_charat == '0': node = node.right else: node = node.left return e def __Sigmoid(self, value): return 1 / (1 + math.exp(-value)) if __name__ == '__main__': import WordCount # text = WordCount.readfile('./static/text8_mini') # text = WordCount.readfile('./static/text8') # mv = Word2Vec(vec_len=500) # mv.Train_Model(text) # FI.save_picle(mv.word_dict, './model/model_h.pkl') model_h = FI.load_pickle('./model/model_h.pkl') keys = list(model_h.keys()) print(keys.__len__()) print(keys[0])
# pre_node = SNA_node.build_Head_Node(item.data['retweeted_status']) # pre_node.add_Next(item) # info_dict[pre_node.id] = pre_node # info_id_list.append(pre_node.id) # head_id_list.append(pre_node.id) # head_id_list.remove(id) # print(ite_times) # ite_times += 1 # print(head_id_list.__len__()) # FI.save_pickle(info_dict,'./static/dealed_info_list.pkl') # # 统计 # data = FI.load_pickle('./static/dealed_info_list.pkl') # id_list = list(data.keys()) # for id in id_list: # data[id].analyse() # FI.save_pickle(data,'./static/analyse_info_list.pkl') data = FI.load_pickle('./static/analyse_info_list.pkl') id_list = list(data.keys()) for id in id_list: if data[id].result['retweet_num']>1 : print(data[id])
if ret=='dict': ret_data = {} for ele,count in temp[:high]: ret_data[ele]=count return ret_data else: return temp[:high] if __name__ == '__main__': """ stop_words = open(u"./static/中文停用词表(比较全面,有1208个停用词).txt") list1 = [] for line in stop_words: list1.append(line.strip()) print len(list1) pickle.dump(list1,open("./static/stop_words.pkl","wb"),protocol = 2) """ text = FI.load_pickle("./static/stop_words.pkl") #list for x in text: print x.decode("gbk") break data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects'] wc = WordCounter(data) print(wc.count_res.larger_than(16)) """ c=MulCounter('abcdeabcdaffbcabag') print(sorted(c.items(),key=_itemgetter(1),reverse=True))#operator.itemgetter(1) print(c.larger_than(3)) """ pass
def generate_node_and_edge():
    """Scan every pickled mail under .\\static and build node/edge tables.

    Nodes are keyed by lower-cased mail address and count how often an
    address sent / received / was CC'd; edges are keyed by
    'sender|----->receiver' and carry a message count as weight.
    Results are pickled to .\\temp_res\\node.pkl and .\\temp_res\\edge.pkl.
    """
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}
    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # print(info)
        # Handle the 'From' field.
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        except:
            continue  # a mail without a usable 'From' field is discarded
        sender_node = node_list.get(sender_mail)
        if sender_node:
            sender_node['send'] += 1
        else:
            node_list[sender_mail] = dict(
                mail = sender_mail,
                send = 1,
                receive = 0,
                cc = 0,
                ori_mail = ori_mail
            )
        if info.get('name'):
            node_list[sender_mail]['name'] = info['name']
        # Handle the 'To' field.
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            ori_mail = mail
            mail = mail.lower()
            node = node_list.get(mail)
            if node:
                node['receive'] += 1
            else:
                node_list[mail] = dict(
                    mail = mail,
                    send = 0 ,
                    receive = 1,
                    cc = 0,
                    ori_mail = ori_mail
                )
            if receiver.get('name'):
                node_list[mail]['name'] = receiver['name']
            edge_key = sender_mail + '|----->' + mail
            edge = edge_list.get(edge_key)
            if edge:
                edge[2] += 1  # the edge already exists: bump its weight
            else:
                edge_list[edge_key] = [sender_mail,mail,1]  # new edge with weight 1
        # Handle the 'CC' field.
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                ori_mail = mail
                mail = mail.lower()
                node = node_list.get(mail)
                if node:
                    node['cc'] += 1
                else:
                    node_list[mail] = dict(
                        mail = mail,
                        send = 0 ,
                        receive = 0 ,
                        cc = 1,
                        ori_mail = ori_mail
                    )
                if cc.get('name'):
                    node_list[mail]['name'] = cc['name']
                edge_key = sender_mail + '|----->' + mail
                edge = edge_list.get(edge_key)
                if edge:
                    edge[2] += 1  # the edge already exists: bump its weight
                else:
                    edge_list[edge_key] = [sender_mail,mail,1]  # new edge with weight 1
        print('{id} is dealed'.format(id=path))
    FI.save_pickle(node_list,'.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list,'.\\temp_res\\edge.pkl')
if list[mid]['time'] < value: low = mid + 1 else: high = mid - 1 if i < 0: i = low if i == list.__len__(): list.append({'time': value, 'freq': 1}) elif list[i]['time'] == value: list[i]['freq'] += 1 else: list.insert(i, {'time': value, 'freq': 1}) if __name__ == '__main__': stop_words = FI.load_pickle('./static/stop_words.pkl') wfts = FI.load_pickle('./static/wfts_continue.pkl') client = MongoClient('localhost', 27017) db = client['microblog_spider'] latest_history = db.latest_history count = 0 data = [] batch_size = 100 gone_size = 40000 while count < 100: t1 = time.time() data = latest_history.find().skip(gone_size + count * batch_size).limit(batch_size) data = [x for x in data] t2 = time.time()
return e def __Sigmoid(self, value): return 1 / (1 + math.exp(-value)) if __name__ == '__main__': # text = FI.load_pickle('./static/demo.pkl') # text =[ x['dealed_text']['left_content'][0] for x in text] # # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects'] # wv = Word2Vec(vec_len=500) # wv.Train_Model(text) # FI.save_pickle(wv.word_dict,'./static/wv.pkl') # # data = FI.load_pickle('./static/wv.pkl') # x = {} # for key in data: # temp = data[key]['vector'] # temp = preprocessing.normalize(temp) # x[key] = temp # FI.save_pickle(x,'./static/normal_wv.pkl') x = FI.load_pickle('./static/normal_wv.pkl') def cal_simi(data, key1, key2): return data[key1].dot(data[key2].T)[0][0] keys = list(x.keys()) for key in keys: print(key, '\t', cal_simi(x, '小说', key))
# Count how many pickled mails under .\static fall on each calendar day.
__author__ = 'multiangle'

import File_Interface as FI
import json
import os
import time

paths = os.listdir('.\static')
date_counter = {}  # maps 'YYYY/MM/DD' tag -> {'date': tag, 'count': n}
for path in paths:
    data = FI.load_pickle('.\static\{x}'.format(x=path))
    date_ori = data.get('Date')  # presumably a time.struct_time — TODO confirm against the pickler
    if date_ori:
        tag = time.strftime('%Y/%m/%d', date_ori)
        c = date_counter.get(tag)
        print(tag)
        if c:
            date_counter[tag]['count'] += 1
        else:
            date_counter[tag] = dict(date=tag, count=1)

# date_counter = list(date_counter.values())
# date_counter = sorted(date_counter,key=lambda x:x['date'])
# for date in date_counter:
#     print('{a}\t{b}'.format(a=date['date'],b=date['count']))
def generate_node_and_edge():
    """Scan every pickled mail under .\\static and build node/edge tables.

    Nodes are keyed by lower-cased mail address and count how often an
    address sent / received / was CC'd; edges are keyed by
    'sender|----->receiver' and carry a message count as weight.
    Results are pickled to .\\temp_res\\node.pkl and .\\temp_res\\edge.pkl.
    """
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}

    def touch_node(mail, ori_mail, role):
        # Increment counter `role` ('send'/'receive'/'cc') for `mail`,
        # creating the node record on first sight.
        node = node_list.get(mail)
        if node:
            node[role] += 1
        else:
            record = dict(mail=mail, send=0, receive=0, cc=0, ori_mail=ori_mail)
            record[role] = 1
            node_list[mail] = record

    def touch_edge(sender_mail, mail):
        # Bump the weight of sender->mail, creating the edge if needed.
        edge_key = sender_mail + '|----->' + mail
        edge = edge_list.get(edge_key)
        if edge:
            edge[2] += 1  # the edge already exists: bump its weight
        else:
            edge_list[edge_key] = [sender_mail, mail, 1]  # new edge with weight 1

    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # Handle the 'From' field.
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        # FIX: was a bare `except:` — only a missing or malformed 'From'
        # should cause the mail to be skipped, not arbitrary errors.
        except (KeyError, TypeError):
            continue  # a mail without a usable 'From' field is discarded
        touch_node(sender_mail, ori_mail, 'send')
        if info.get('name'):
            node_list[sender_mail]['name'] = info['name']
        # Handle the 'To' field.
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail'].lower()
            touch_node(mail, receiver['mail'], 'receive')
            if receiver.get('name'):
                node_list[mail]['name'] = receiver['name']
            touch_edge(sender_mail, mail)
        # Handle the 'CC' field.
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail'].lower()
                touch_node(mail, cc['mail'], 'cc')
                if cc.get('name'):
                    node_list[mail]['name'] = cc['name']
                touch_edge(sender_mail, mail)
        print('{id} is dealed'.format(id=path))
    FI.save_pickle(node_list, '.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list, '.\\temp_res\\edge.pkl')
if temp[low][1] > maxvalue: if ret == 'dict': return {} else: return [] if ret == 'dict': ret_data = {} for ele, count in temp[:high]: ret_data[ele] = count return ret_data else: return temp[:high] if __name__ == '__main__': text = FI.load_pickle('./static/demo.pkl') text = [x['dealed_text']['left_content'][0] for x in text] # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects'] wv = Word2Vec(vec_len=500) wv.Train_Model(text) FI.save_pickle(wv.word_dict, './static/wv.pkl') # # data = FI.load_pickle('./static/wv.pkl') # x = {} # for key in data: # temp = data[key]['vector'] # temp = preprocessing.normalize(temp) # x[key] = temp # FI.save_pickle(x,'./static/normal_wv.pkl') # x = FI.load_pickle('./static/normal_wv.pkl')
if sort_type == 'up': res = [ pop_id(x) for x in collection.find(select, f).limit(limit).sort( sort, pymongo.ASCENDING) ] else: res = [ pop_id(x) for x in collection.find(select, f).limit(limit).sort( sort, pymongo.DESCENDING) ] return res def pop_id(data): data.pop('_id') return data res = read_content_in_mongo( 'latest_history', {'user_id': '1681029540'}, ['dealed_text.left_content', 'created_at', 'user_name'], -1, 'id', 'down') data = FI.load_pickle('demo.pkl') data = data + res FI.save_pickle(data, 'demo.pkl') for line in res: print(line) print(res.__len__())
else: node = node.left return e def __Sigmoid(self,value): return 1/(1+math.exp(-value)) if __name__ == '__main__': # text = FI.load_pickle('./static/demo.pkl') # text =[ x['dealed_text']['left_content'][0] for x in text] # # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects'] # wv = Word2Vec(vec_len=500) # wv.Train_Model(text) # FI.save_pickle(wv.word_dict,'./static/wv.pkl') # # data = FI.load_pickle('./static/wv.pkl') # x = {} # for key in data: # temp = data[key]['vector'] # temp = preprocessing.normalize(temp) # x[key] = temp # FI.save_pickle(x,'./static/normal_wv.pkl') x = FI.load_pickle('./static/normal_wv.pkl') def cal_simi(data,key1,key2): return data[key1].dot(data[key2].T)[0][0] keys=list(x.keys()) for key in keys: print(key,'\t',cal_simi(x,'姚明',key))
high = mid if temp[low][1]>maxvalue: if ret=='dict': return {} else: return [] if ret=='dict': ret_data = {} for ele,count in temp[:high]: ret_data[ele]=count return ret_data else: return temp[:high] if __name__ == '__main__': text = FI.load_pickle('./static/demo.pkl') text =[ x['dealed_text']['left_content'][0] for x in text] # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects'] wv = Word2Vec(vec_len=500) wv.Train_Model(text) FI.save_pickle(wv.word_dict,'./static/wv.pkl') # # data = FI.load_pickle('./static/wv.pkl') # x = {} # for key in data: # temp = data[key]['vector'] # temp = preprocessing.normalize(temp) # x[key] = temp # FI.save_pickle(x,'./static/normal_wv.pkl') # x = FI.load_pickle('./static/normal_wv.pkl')
def Get_Stop_Words(self):
    """Return the stop-word list loaded from ./static/stop_words.pkl."""
    # FIX: removed the dead `ret = []` initialiser that was immediately
    # overwritten by the pickle load.
    ret = FI.load_pickle('./static/stop_words.pkl')
    return ret
# item = info_dict[id] # if item.hasPre: # changed = True # if item.data['retweeted_status']['id'] in info_id_list: # info_dict[item.data['retweeted_status']['id']].add_Next(item) # head_id_list.remove(id) # else: # pre_node = SNA_node.build_Head_Node(item.data['retweeted_status']) # pre_node.add_Next(item) # info_dict[pre_node.id] = pre_node # info_id_list.append(pre_node.id) # head_id_list.append(pre_node.id) # head_id_list.remove(id) # print(ite_times) # ite_times += 1 # print(head_id_list.__len__()) # FI.save_pickle(info_dict,'./static/dealed_info_list.pkl') # # 统计 # data = FI.load_pickle('./static/dealed_info_list.pkl') # id_list = list(data.keys()) # for id in id_list: # data[id].analyse() # FI.save_pickle(data,'./static/analyse_info_list.pkl') data = FI.load_pickle('./static/analyse_info_list.pkl') id_list = list(data.keys()) for id in id_list: if data[id].result['retweet_num'] > 1: print(data[id])
def Get_Stop_Words(self):
    """Return the stop-word list loaded from ./static/stop_words.pkl."""
    # FIX: removed the dead `ret = []` initialiser that was immediately
    # overwritten by the pickle load.
    ret = FI.load_pickle('./static/stop_words.pkl')
    return ret
break if list[mid]['time'] < value : low = mid + 1 else: high = mid - 1 if i<0: i = low if i==list.__len__() : list.append({'time':value,'freq':1}) elif list[i]['time']==value : list[i]['freq'] += 1 else: list.insert(i,{'time':value,'freq':1}) if __name__=='__main__': stop_words = FI.load_pickle('./static/stop_words.pkl') wfts = FI.load_pickle('./static/wfts_continue.pkl') client = MongoClient('localhost',27017) db = client['microblog_spider'] latest_history = db.latest_history count = 0 data = [] batch_size = 100 gone_size = 40000 while count<100 : t1 = time.time() data = latest_history.find().skip(gone_size+count*batch_size).limit(batch_size) data = [x for x in data] t2 = time.time() count += 1