def Build_Relation_Content_Net(self, uid):
    '''
    given a user, build his followees network
    NOTE: this is NOT the content network;
    the content network can be calculated in write_c_adj_txt
    '''
    if 'g' not in dir(self):
        print 'loading %s' % (self.base_dir + self.expr_dir +
                              SQLDao.ce.properties['relation_full_graph_file_name'])
        self._load_full_graph()
    # self rather than gng
    uids = self.Load_Followees_From_DB(uid)
    # write the followees ids
    self.Save_Followees(uid, uids)
    g_sub = self.ConnectivityGraph(uids)
    self.g_sub = self.BuildContentInfo(g_sub)
    # set a flag on the followee vertices
    uids = [long(id[0]) for id in uids]
    for v in self.g_sub.vs:
        if v['user_id'] in uids:
            v['is_followee'] = 1
        else:
            v['is_followee'] = 0
    # serialize it
    FSDao.write_pickle(self.base_dir + self.expr_dir,
                       '%s_relation.pickle' % uid, self.g_sub)
def compute_laplacian_matrix(self):
    from numpy import array
    from scipy.sparse import coo_matrix
    # TODO: edge weights should be taken into account in the future;
    # i.degree() and -1 are too simple here
    vecCount = len(self.g.vs)
    row = []
    col = []
    data = []
    # diagonal: the degree of each vertex
    for i in self.g.vs:
        row.append(i.index)
        col.append(i.index)
        data.append(i.degree())
    # off-diagonal: -1 for each edge, in both directions (L is symmetric)
    for e in self.g.es:
        row.append(e.source)
        col.append(e.target)
        data.append(-1)
        row.append(e.target)
        col.append(e.source)
        data.append(-1)
    row = array(row)
    col = array(col)
    data = array(data)
    self.l_matrix = coo_matrix((data, (row, col)),
                               shape=(vecCount, vecCount),
                               dtype='float32').asformat('csr')
    print 'Finish building laplacian matrix'
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['laplacian_file_name'],
                       self.l_matrix)
    print 'Finish writing laplacian matrix'
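# The TODO above asks for edge weights. A minimal sketch of a weighted
# variant L = D - W, assuming edges carry a 'weight' attribute (as the
# content graphs built elsewhere in this module do); an illustration,
# not the method used above:
from numpy import array
from scipy.sparse import coo_matrix

def weighted_laplacian(g, weight_attr='weight'):
    # diagonal holds the weighted degree (strength);
    # off-diagonal entries are the negated edge weights
    n = len(g.vs)
    row, col, data = [], [], []
    for v in g.vs:
        row.append(v.index)
        col.append(v.index)
        data.append(g.strength(v.index, weights=weight_attr))
    for e in g.es:
        w = e[weight_attr]
        row.append(e.source); col.append(e.target); data.append(-w)
        row.append(e.target); col.append(e.source); data.append(-w)
    return coo_matrix((array(data), (array(row), array(col))),
                      shape=(n, n), dtype='float32').asformat('csr')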
def SoftIndicator_CommunityDiscovery(self, step=False):
    '''
    1. compute the eigenvector for the second-smallest eigenvalue
    2. serialize the whole eigen matrix
    '''
    if 'l_matrix' not in dir(self):
        f = open(self.expr_dir + SQLDao.ce.properties['laplacian_file_name'],
                 'rb')
        self.l_matrix = pickle.load(f)
        f.close()
        print 'finished loading laplacian matrix'
    from scipy.sparse import linalg
    # 'SA' asks for the smallest algebraic eigenvalues and their
    # eigenvectors; eigsh is specialized for symmetric matrices
    self.eigValue, self.eigMatrix = linalg.eigsh(A=self.l_matrix, k=10,
                                                 which='SA', maxiter=500)
    FSDao.write_pickle(self.expr_dir, SQLDao.ce.properties['eigen_value'],
                       self.eigValue)
    FSDao.write_pickle(self.expr_dir, SQLDao.ce.properties['eigen_matrix'],
                       self.eigMatrix)
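# For reference, a minimal sketch of turning the serialized eigen pair
# into a two-way split: the sign of the Fiedler vector (the eigenvector
# of the second-smallest eigenvalue) is the classic soft indicator. The
# file paths are illustrative, and the argsort is defensive since eigsh
# does not guarantee the ordering of the returned eigenvalues.
import pickle
import numpy

f = open('eigen_value.pickle', 'rb')      # illustrative path
eigValue = pickle.load(f)
f.close()
f = open('eigen_matrix.pickle', 'rb')     # illustrative path
eigMatrix = pickle.load(f)
f.close()

order = numpy.argsort(eigValue)           # ascending eigenvalues
fiedler = eigMatrix[:, order[1]]          # vector of the 2nd-smallest one
membership = [1 if x > 0 else 0 for x in fiedler]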
def build_vertex_clustering(self):
    '''
    build a vertex clustering from the MATLAB-generated group file
    '''
    import os
    import igraph
    group_file = (self.expr_dir + SQLDao.ce.properties['matlab_dir'] +
                  SQLDao.ce.properties['group_file_name'])
    if not os.path.exists(group_file):
        raise ValueError('group file does not exist, '
                         'please run eig_solve.m to generate one')
    f = open(group_file)
    com_list = f.readline().strip().split(' ')
    f.close()
    # MATLAB labels are 1-based; shift to 0-based community labels
    com_list = [int(a) - 1 for a in com_list]
    self.vertex_clustering = igraph.clustering.VertexClustering(self.g,
                                                                com_list)
    print self.vertex_clustering.modularity
    print 'writing vertex_clustering'
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['vertex_clustering_file_name'],
                       self.vertex_clustering)
    print 'finished writing vertex_clustering'
def Build_Content_Graph_From_Rg(self, uid):
    if 'rg' not in dir(self):
        self._load_user_relation_pickle(uid)
    # copy the relation graph's vertices, then drop all of its edges
    self.cg = self.rg.copy()
    self.cg.delete_edges(self.cg.es)
    from ExtractContentEdge import ContentEdgeExtractorBaseRoutine
    cebr = ContentEdgeExtractorBaseRoutine(self.base_dir, self.expr_dir)
    edges = []
    weights = []
    # pairwise similarity over all vertex pairs (O(n^2));
    # keep only the edges whose weighted similarity exceeds the criterion
    for i, vi in enumerate(self.cg.vs):
        for j in range(i + 1, len(self.cg.vs)):
            vj = self.cg.vs[j]
            h1, h2, h3 = cebr.compute_similarity(vi, vj)
            sim = self.w1 * h1 + self.w2 * h2 + self.w3 * h3
            if sim > self.criterion:
                edges.append((vi.index, vj.index))
                weights.append(sim)
    self.cg.add_edges(edges)
    self.cg.es['weight'] = weights
    FSDao.write_pickle(self.base_dir + self.expr_dir,
                       '%s_content.pickle' % uid, self.cg)
def build_prime_dict(self, n=1000):
    '''
    build the prime dict; this function should only be run once
    '''
    self.prime_dict = dict()
    self.prime_dict[-1] = 1
    for i in range(n):
        # prime_dict[i] holds the (i+1)-th prime
        self.prime_dict[i] = prime.prime(i + 1)
    print self.prime_dict
    FSDao.write_pickle(self.base_dir + self.expr_dir,
                       SQLDao.ce.properties['prime_dict'],
                       self.prime_dict)
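# A minimal stand-in for the prime helper used above, assuming
# prime.prime(k) returns the k-th prime (so prime.prime(1) == 2);
# an illustration, not the prime module's actual implementation.
def nth_prime(k):
    # trial division; fine for the n=1000 default above
    count = 0
    candidate = 1
    while count < k:
        candidate += 1
        if all(candidate % p for p in range(2, int(candidate ** 0.5) + 1)):
            count += 1
    return candidate

assert nth_prime(1) == 2 and nth_prime(5) == 11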
def SerializeDendrogram(directory, graph_filename, dendrogram_filename):
    '''
    In this function, we
    1. read the pickled graph from the file system
    2. crudely compute the community structure with the fast-greedy algorithm
    3. then dump the pickled dendrogram
    '''
    graph = d.trace(FSDao.read_pickle_graph)(graph_filename)
    # filter out rarely-occurring vertices first
    global CRITERION_VERTEX_OCCUR_COUNT
    graph = d.trace(graph.subgraph)(
        graph.vs.select(occur_count_ge=CRITERION_VERTEX_OCCUR_COUNT))
    vertexDendrogram = d.trace(graph.community_fastgreedy)()
    FSDao.write_pickle(directory, dendrogram_filename, vertexDendrogram)
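# A minimal sketch of consuming the serialized dendrogram (the path is
# illustrative): igraph's VertexDendrogram can be cut at the
# modularity-optimal level with as_clustering().
import pickle

f = open('dendrogram.pickle', 'rb')        # illustrative path
dendrogram = pickle.load(f)
f.close()
clustering = dendrogram.as_clustering()    # cut at the best modularity
print clustering.modularity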
def Gen_Filtering_Graph(relation_count=100, ori_tw_count=30):
    '''
    generate a subgraph filtered by minimum relation and
    original-tweet counts
    '''
    import FSDao
    import igraph
    g = FSDao.read_pickle_graph(properties['base_dir'] +
                                properties['expr_dir'] +
                                'relation.pickle.old')
    f = open(properties['base_dir'] + 'users_unfilter.csv')
    uid_list = []
    for line in f:
        t = line.strip().split(',')
        if int(t[1]) <= relation_count:
            continue
        if int(t[2]) <= ori_tw_count:
            continue
        uid_list.append(int(t[0]))
    f.close()
    # use a set for membership tests; uid_list can be large
    uid_set = set(uid_list)
    sub_vlist = [v.index for v in g.vs if v['user_id'] in uid_set]
    g_sub = g.subgraph(sub_vlist)
    #FSDao.write_pickle(properties['base_dir']+properties['expr_dir'],'relation.pickle',g_sub)
    return g_sub
def iGraph_CommunityDiscovery(self, step=False):
    '''
    discover communities in the relation graph (this takes time):
    1. compute the vertexClustering
    2. serialize the vertexClustering
    step indicates whether this function starts from itself
    '''
    # why the vertexClustering is serialized: even when the eigenvector is
    # computed by hand, a vertexClustering still has to be constructed anyway
    self.vertex_clustering = d.trace(self.g.community_leading_eigenvector)()
    print 'modularity is %s' % self.vertex_clustering.modularity
    print 'finish finding community_leading_eigenvector'
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['vertex_clustering_file_name'],
                       self.vertex_clustering)
def Load_Partition_Result(self):
    '''
    load the partition results, i.e. community_relation.list and
    community_content.list
    '''
    if 'rg' not in dir(self):
        self._load_relation_graph(self.expr_dir, self.r_graph)
    if 'cg' not in dir(self):
        self._load_content_graph(self.expr_dir, self.c_graph)
    # first load community_relation.list
    f = open(self.expr_dir + SQLDao.ce.properties['matlab_dir'] +
             'community_relation.list')
    title = f.readline().strip()  # skip the title line
    com_list = f.readline().strip().split(' ')
    f.close()
    # MATLAB labels are 1-based; shift to 0-based community labels
    com_list = [int(a) - 1 for a in com_list]
    self.vc_relation = igraph.clustering.VertexClustering(self.rg, com_list)
    # then load community_content.list
    f = open(self.expr_dir + SQLDao.ce.properties['matlab_dir'] +
             'community_content.list')
    title = f.readline().strip()  # skip the title line
    com_list = f.readline().strip().split(' ')
    f.close()
    com_list = [int(a) - 1 for a in com_list]
    self.vc_content = igraph.clustering.VertexClustering(
        graph=self.cg, membership=com_list,
        modularity_params={'weights': 'similarity'})
    import FSDao
    FSDao.write_pickle(self.expr_dir, 'vc_relation.pickle', self.vc_relation)
    FSDao.write_pickle(self.expr_dir, 'vc_content.pickle', self.vc_content)
    print 'finish writing vc_relation.pickle and vc_content.pickle'
def Load_Community_List(self):
    '''
    load the community lists generated by MATLAB and write the pickle
    '''
    import os
    self.communities = []
    for r, d, files in os.walk(self.expr_dir +
                               SQLDao.ce.properties['matlab_dir']):
        for f in files:
            if f.endswith('.list'):
                cl = self.build_comlist_obj(f)
                self.communities.append(cl)
                print 'handled %s' % f
    # serialize the communities
    print 'writing communities'
    FSDao.write_pickle(self.expr_dir, 'communities.pickle', self.communities)
    print 'finished writing communities'
def KMeansClustering(self, step=False):
    '''
    Integrate Orange here. It is actually a little subtle:
    1. I don't think k-means is a good way to decide which community a node
       (user) belongs to; however, it is the most general one
    2. TODO maybe fixing the number of clusters (rather than choosing it
       automatically) is better, but check the result first
    '''
    data = self.build_orange_data_from_eig_vector()
    # clustering
    self.km = Orange.clustering.kmeans.Clustering(data=data,
                                                  distance=EigDistance)
    # construct a vertex_clustering so modularity can be computed; it is
    # no longer reasonable to use Orange to insert into the database
    clusters = self.km.clusters
    import igraph
    self.vertex_clustering = igraph.clustering.VertexClustering(self.g,
                                                                clusters)
    print 'writing vertex_clustering'
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['vertex_clustering_file_name'],
                       self.vertex_clustering)
    print 'finished writing vertex_clustering'
def CommunityDiscovery(expr_dir, pickle_filename, dendrogram_file_name):
    '''
    discover communities in the relation graph (this takes time):
    1. read the pickled graph from the file system
    2. compute the clustering via the leading-eigenvector method
    3. serialize the clustering
    '''
    print expr_dir + pickle_filename
    f = open(expr_dir + pickle_filename, 'rb')
    g = d.trace(pickle.load)(f)
    f.close()
    vertexClustering = d.trace(g.community_leading_eigenvector)()
    FSDao.write_pickle(expr_dir, 'dendrogram.eigen', vertexClustering)
    # edge betweenness
    # vertexDendrogram = d.trace(g.community_edge_betweenness)(directed=True)
    # FSDao.write_pickle(expr_dir, 'dendrogram.betweeness', vertexDendrogram)
    # walktrap
    # vertexDendrogram = d.trace(g.community_walktrap)()
    # FSDao.write_pickle(expr_dir, 'dendrogram.walkstrap', vertexDendrogram)
def SerializeBirelationGraph():
    '''
    construct the bi-relational (reciprocated) graph and write it to the
    file system as a pickled graph
    '''
    import igraph
    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    # add users to the graph and build a dict mapping user_id -> vertex index
    g = igraph.Graph(n=0, directed=False)
    uid_to_gidx_dict = {}
    for idx, uid in enumerate(uids):
        # make sure the attribute name is user_id
        g.add_vertex(user_id=uid[0])
        uid_to_gidx_dict[uid[0]] = idx
    print 'Finish adding vertices %s' % len(uids)
    h, ur = d.trace(sql.getOURelations)(reciprocated=True)
    # build the list of tuples representing the relations between users
    edge_list = []
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]))
    # reciprocated relations arrive in both directions; dedupe so each
    # undirected edge appears only once
    edge_list = list(set(tuple(sorted(e)) for e in edge_list))
    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is <bold>very very</bold> slow to add edges one at a time
    g.add_edges(edge_list)
    print 'finish building a graph based on social relation'
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] +
                       SQLDao.ce.properties['expr_dir'],
                       SQLDao.ce.properties['relation_reciprocated_graph_file_name'],
                       g)
def SerializeRelationshipGraph():
    '''
    construct the directed relation graph and write it to the file system
    as a pickled graph
    '''
    import igraph
    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    sql = SQLDao.SQLDao.getInstance()
    h, ur = d.trace(sql.getOURelations)()
    g = igraph.Graph(n=0, directed=True)
    # add users to the graph and build a dict mapping user_id -> vertex index
    uid_to_gidx_dict = {}
    assert SQLDao.LABEL_USER_GROUP_INFO_USERID == 'user_id'
    for idx, user_id in enumerate(uids):
        g.add_vertex(user_id=user_id[0])
        uid_to_gidx_dict[user_id[0]] = idx
    print 'Finish adding vertices'
    # build the list of tuples representing the relations between users
    edge_list = []
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]))
    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is <bold>very very</bold> slow to add edges one at a time
    g.add_edges(edge_list)
    print 'finish building a graph based on social relation'
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] +
                       SQLDao.ce.properties['expr_dir'],
                       SQLDao.ce.properties['relation_graph_file_name'], g)
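# The "very very slow" note above is worth illustrating. A minimal sketch
# of the two approaches (the graph and edge list are synthetic; the point
# is that one batched add_edges call avoids the per-call overhead that
# adding edges one at a time incurs in python-igraph):
import igraph

n = 10000
edges = [(i, (i + 1) % n) for i in range(n)]

g1 = igraph.Graph(n=n)
g1.add_edges(edges)          # one batched call: fast

g2 = igraph.Graph(n=n)
for e in edges:
    g2.add_edge(*e)          # one call per edge: much slower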
def Insert_Word_Group_Info_To_DB(source_type, source_directory, src_file_name,
                                 sql_table_name, sheet_name=''):
    '''
    insert the word group info into the DB for data mining use,
    because we need to know how a user's words are distributed
    '''
    if source_type == 'excel':
        groupinfo = FSDao.read_excel(source_directory + src_file_name,
                                     sheet_name)
    elif source_type == 'db':
        sqlite = SQLDao.SQLiteDao(source_directory, src_file_name)
        headings, groupinfo = sqlite.get_word_group_info()
    else:
        raise ValueError('source type error')
    sql = SQLDao.SQLDao.getInstance()
    sql.saveGroupInfo(sql_table_name, groupinfo)
def leading_eigen_vector(self):
    '''
    implement the leading-eigenvector methodology by hand; the critical
    challenge is that the result should be exactly the same as igraph's
    community_leading_eigenvector
    '''
    import Queue
    import igraph
    membership = [0] * len(self.g.vs)
    vc = igraph.clustering.VertexClustering(self.g, membership)
    max_com_label = 0
    q = Queue.Queue()
    q.put(0)
    max_modularity = vc.modularity
    # split iteratively until no split can increase the modularity
    while q.qsize() > 0:
        cur_label = q.get()
        print 'cur label:%s, qsize: %s' % (cur_label, q.qsize())
        vertex_list = [idx for idx, com_label in enumerate(vc.membership)
                       if com_label == cur_label]
        graph_to_split = self.g.subgraph(vertex_list)
        mod_matrix = self.compute_modularity_matrix(graph_to_split)
        gc.collect()
        eig = self.compute_largest_eigenvector(mod_matrix)
        # split by sign, normalized so the first entry's side keeps cur_label
        if eig[0] > 0:
            positive_index = [vertex_list[idx] for idx, v in enumerate(eig)
                              if v > 0]
            negative_index = [vertex_list[idx] for idx, v in enumerate(eig)
                              if v <= 0]
        else:
            positive_index = [vertex_list[idx] for idx, v in enumerate(eig)
                              if v <= 0]
            negative_index = [vertex_list[idx] for idx, v in enumerate(eig)
                              if v > 0]
        # note that membership has no setter, so build a new list
        new_membership = vc.membership
        # positive-side elements keep cur_label;
        # negative-side elements take max_com_label + 1
        for idx in negative_index:
            new_membership[idx] = max_com_label + 1
        vc_temp = igraph.clustering.VertexClustering(self.g, new_membership)
        print 'old modularity: %s, new modularity: %s\n' % (
            vc.modularity, vc_temp.modularity)
        if vc_temp.modularity > vc.modularity:
            vc = vc_temp
            q.put(cur_label)
            q.put(max_com_label + 1)
            max_com_label = max_com_label + 1
    self.vertex_clustering = vc
    print 'writing vertex_clustering'
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['vertex_clustering_file_name'],
                       self.vertex_clustering)
    print 'finished writing vertex_clustering'
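# compute_modularity_matrix and compute_largest_eigenvector are not shown
# in this section. A minimal dense sketch of what they might look like,
# following Newman's definition B = A - k k^T / (2m); this is a
# simplification (the full method uses the generalized modularity matrix
# for subgraphs), not necessarily the class's actual implementation.
import numpy

def compute_modularity_matrix(graph):
    # dense modularity matrix; fine for the moderately sized subgraphs
    # produced by the splits above
    A = numpy.array(graph.get_adjacency().data, dtype='float64')
    k = A.sum(axis=1)          # degree vector
    two_m = k.sum()            # 2m = total degree
    return A - numpy.outer(k, k) / two_m

def compute_largest_eigenvector(B):
    # B is symmetric: eigh returns eigenvalues in ascending order,
    # so the last column is the leading eigenvector
    values, vectors = numpy.linalg.eigh(B)
    return vectors[:, -1]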
def __init__(self, base_dir, expr_dir):
    self.base_dir = base_dir
    self.expr_dir = expr_dir
    self.g = FSDao.read_pickle_graph(
        self.base_dir + self.expr_dir +
        SQLDao.ce.properties['relation_graph_file_name'])
def KMeansClustering_Iterative(self, step=False):
    '''
    Integrate Orange here. It is actually a little subtle:
    1. I don't think k-means is a good way to decide which community a node
       (user) belongs to; however, it is the most general one
    2. TODO maybe fixing the number of clusters (rather than choosing it
       automatically) is better, but check the result first
    '''
    eig_data = self.build_orange_data_from_eig_vector()
    # clustering
    self.km = Orange.clustering.kmeans.Clustering(data=eig_data, centroids=5,
                                                  distance=EigDistance)
    # construct a vertex_clustering so modularity can be computed; it is
    # no longer reasonable to use Orange to insert into the database
    clusters = self.km.clusters
    # group row indices by cluster label
    d = {}
    for idx, c in enumerate(clusters):
        if not d.has_key(c):
            d[c] = [idx]
        else:
            d[c].append(idx)
    import Queue
    q = Queue.Queue()
    for v in d.values():
        q.put(v)
    res_list = []
    import CommunityExtraction as ce
    # re-cluster clusters that are too large; accept the rest as they are
    while q.qsize() > 0:
        v = q.get()
        print 'qsize:%s cluster size: %s res list size: %s' % (
            q.qsize(), len(v), len(res_list))
        if len(v) < ce.CRITERION_CLUSTER_NODES_LOWER_BOUND:
            res_list.append(v)
        elif len(v) > ce.CRITERION_CLUSTER_NODES_UPPER_BOUND:
            # this step can be iterative
            sub_data = eig_data.get_items(v)
            sub_km = Orange.clustering.kmeans.Clustering(data=sub_data,
                                                         centroids=5,
                                                         distance=EigDistance)
            sub_clusters = sub_km.clusters
            temp_d = dict()
            for idx, c in enumerate(sub_clusters):
                if not temp_d.has_key(c):
                    temp_d[c] = [v[idx]]
                else:
                    temp_d[c].append(v[idx])
            for sub_v in temp_d.values():
                q.put(sub_v)
        else:
            res_list.append(v)
    # flatten res_list back into a membership vector
    clusters = [0] * len(eig_data)
    for idx, res in enumerate(res_list):
        for r in res:
            clusters[r] = idx
    import igraph
    self.vertex_clustering = igraph.clustering.VertexClustering(self.g,
                                                                clusters)
    print 'writing vertex_clustering'
    FSDao.write_pickle(self.expr_dir,
                       SQLDao.ce.properties['vertex_clustering_file_name'],
                       self.vertex_clustering)
    print 'finished writing vertex_clustering'
    d[p.main_category] += 1
    l = d.values()
    total = sum(l)
    a = [float(i) / float(total) for i in l]
    return max(a)

import pickle
import FSDao
import SQLDao

if __name__ == '__main__':
    init_category_list()
    poster_list = init_poster_and_tweets()
    tester_list = init_tester(poster_list)
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] +
                       SQLDao.ce.properties['expr_dir'],
                       'tester_list.pickle', tester_list)

def simulate(filename):
    print 'simulating...'
    eva_f = open(SQLDao.ce.properties['base_dir'] +
                 SQLDao.ce.properties['expr_dir'] + filename, 'w')
    for t in tester_list:
        while True:
            # rejection-sample a purity from N(0.8, 0.1), clipped to (0, 1)
            purity_rc = random.gauss(0.8, 0.1)
            if purity_rc < 1.0 and purity_rc > 0:
                break
    g.add_edges(edges_t)
    g.es['similarity'] = [e[2] for e in edges]
    l_c = get_components_by_similarity(g, sim_criterion)
    print l_c
    try:
        # only serialize when the graph is a single connected component
        assert l_c == 1
    except AssertionError, e:
        return g
    assert isinstance(g, igraph.Graph)
    import FSDao
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] +
                       SQLDao.ce.properties['expr_dir'],
                       'content_with_similarity.pickle', g)
    return g

def build_full_adjacency_matrix(layer):
    f = open(SQLDao.ce.properties['base_dir'] +
             SQLDao.ce.properties['expr_dir'] +
             SQLDao.ce.properties['target_user_without_edge_graph_file'],
             'rb')
    g = pickle.load(f)
    f.close()
    print 'finished loading graph'
    sql = SQLDao.SQLDao.getInstance()
    assert isinstance(sql, SQLDao.SQLDao)
    f = open(SQLDao.ce.properties['base_dir'] +
             SQLDao.ce.properties['expr_dir'] + 'matlab/' +
             'adj_content_full', 'a+')
    for v in g.vs:
def Serialize_ContentGraph(self):
    import gc
    gc.collect()
    FSDao.write_pickle(self.base_dir + self.expr_dir, 'content.pickle',
                       self.g)