    def Build_Relation_Content_Net(self, uid):
        '''
        Given a user, build his followee network.
        NOTE: this is NOT the content network;
        the content network is computed in write_c_adj_txt.
        '''
        if 'g' not in dir(self):
            print 'loading %s' % (
                self.base_dir + self.expr_dir +
                SQLDao.ce.properties['relation_full_graph_file_name'])
            self._load_full_graph()

        # self rather than gng
        uids = self.Load_Followees_From_DB(uid)
        # write the followees ids
        self.Save_Followees(uid, uids)

        #g_sub=gng.ConnectivityGraph(uids)
        g_sub = self.ConnectivityGraph(uids)

        #self.g_sub=gng.BuildContentInfo(g_sub)
        self.g_sub = self.BuildContentInfo(g_sub)

        # set a flag for uids
        uids = [long(id[0]) for id in uids]
        for v in self.g_sub.vs:
            if v['user_id'] in uids:
                v['is_followee'] = 1
            else:
                v['is_followee'] = 0

        # serialize it
        FSDao.write_pickle(self.base_dir + self.expr_dir,
                           '%s_relation.pickle' % uid, self.g_sub)
    def compute_laplacian_matrix(self):

        from numpy import array
        from scipy.sparse import coo_matrix
        # TODO: edge weights need to be taken into account in the future;
        # the plain degree and -1 used here are too simple.
        vecCount = len(self.g.vs)
        row = []
        col = []
        data = []
        for i in self.g.vs:
            row.append(i.index)
            col.append(i.index)
            data.append(i.degree())
            pass
        for e in self.g.es:
            row.append(e.source)
            col.append(e.target)
            data.append(-1)
            row.append(e.target)
            col.append(e.source)
            data.append(-1)
        row = array(row)
        col = array(col)
        data = array(data)
        self.l_matrix = coo_matrix((data, (row, col)),
                                   shape=(vecCount, vecCount),
                                   dtype='float32').asformat('csr')

        print 'Finished building the Laplacian matrix'
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['laplacian_file_name'],
                           self.l_matrix)
        print 'Finished writing the Laplacian matrix'
        pass
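    # A minimal sketch (not part of the original code) of the weighted variant the
    # TODO above points at: replace the plain degree with the weighted degree
    # (strength) and -1 with -w. It assumes every edge carries a 'weight'
    # attribute; the method name and attribute name are illustrative only.
    def compute_weighted_laplacian_matrix(self):
        from numpy import array
        from scipy.sparse import coo_matrix
        vecCount = len(self.g.vs)
        row, col, data = [], [], []
        # diagonal: weighted degree (strength) of each vertex
        strengths = self.g.strength(weights='weight')
        for i in self.g.vs:
            row.append(i.index)
            col.append(i.index)
            data.append(strengths[i.index])
        # off-diagonal: -w for every edge, symmetrically
        for e in self.g.es:
            w = e['weight']
            row.append(e.source)
            col.append(e.target)
            data.append(-w)
            row.append(e.target)
            col.append(e.source)
            data.append(-w)
        return coo_matrix((array(data), (array(row), array(col))),
                          shape=(vecCount, vecCount),
                          dtype='float32').asformat('csr')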
    def SoftIndicator_CommunityDiscovery(self, step=False):
        '''
        1.  compute the eigen vector for the second smallest eigen value
        2.  serialize the whole eigen matrix
        '''
        if 'l_matrix' not in dir(self):
            f = open(
                self.expr_dir + SQLDao.ce.properties['laplacian_file_name'],
                'rb')
            self.l_matrix = pickle.load(f)
            f.close()
            print 'finished loading laplacian matrix'
            pass
        from scipy.sparse import linalg
        # which='SA' selects the smallest algebraic eigenvalues and their eigenvectors
        # eigsh is specialized for symmetric matrices
        self.eigValue, self.eigMatrix = linalg.eigsh(A=self.l_matrix,
                                                     k=10,
                                                     which='SA',
                                                     maxiter=500)

        # print self.d[:,self.get_second_smallest_value_index(self.v)]
        FSDao.write_pickle(self.expr_dir, SQLDao.ce.properties['eigen_value'],
                           self.eigValue)
        FSDao.write_pickle(self.expr_dir, SQLDao.ce.properties['eigen_matrix'],
                           self.eigMatrix)
        pass
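    # A minimal sketch (not part of the original class) of how the serialized eigen
    # pair could be used: the eigenvector of the second smallest eigenvalue (the
    # Fiedler vector) splits the graph into two groups by sign. The method name
    # and the sign convention are assumptions.
    def fiedler_bipartition(self):
        import numpy
        import igraph
        order = numpy.argsort(self.eigValue)       # eigenvalues in ascending order
        fiedler = self.eigMatrix[:, order[1]]      # eigenvector of the second smallest
        membership = [0 if x >= 0 else 1 for x in fiedler]
        return igraph.clustering.VertexClustering(self.g, membership)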
 def build_vertex_clustering(self):
     '''
     Build a vertex clustering from the MATLAB-generated group file.
     '''
     import os
     import igraph
     if not os.path.exists(self.expr_dir +
                           SQLDao.ce.properties['matlab_dir'] +
                           SQLDao.ce.properties['group_file_name']):
         raise ValueError(
              'group file does not exist; please run eig_solve.m to generate one'
         )
     else:
         f = open(self.expr_dir + SQLDao.ce.properties['matlab_dir'] +
                  SQLDao.ce.properties['group_file_name'])
         com_list = f.readline().strip().split(' ')
         f.close()
         com_list = [int(a) - 1 for a in com_list]
         self.vertex_clustering = igraph.clustering.VertexClustering(
             self.g, com_list)
         print self.vertex_clustering.modularity
         print 'writing vertex_clustering'
         FSDao.write_pickle(
             self.expr_dir,
             SQLDao.ce.properties['vertex_clustering_file_name'],
             self.vertex_clustering)
         print 'finished writing vertex_clustering'
     pass
    def Build_Content_Graph_From_Rg(self, uid):
        if 'rg' not in dir(self):
            self._load_user_relation_pickle(uid)

        self.cg = self.rg.copy()
        self.cg.delete_edges(self.cg.es)

        from ExtractContentEdge import ContentEdgeExtractorBaseRoutine
        cebr = ContentEdgeExtractorBaseRoutine(self.base_dir, self.expr_dir)

        edges = []
        weights = []

        for i, vi in enumerate(self.cg.vs):
            for j in range(i + 1, len(self.cg.vs)):
                vj = self.cg.vs[j]
                h1, h2, h3 = cebr.compute_similarity(vi, vj)
                sim = self.w1 * h1 + self.w2 * h2 + self.w3 * h3
                if sim > self.criterion:
                    edges.append((vi.index, vj.index))
                    weights.append(sim)
        self.cg.add_edges(edges)
        self.cg.es['weight'] = weights

        FSDao.write_pickle(self.base_dir + self.expr_dir,
                           '%s_content.pickle' % uid, self.cg)
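# A self-contained sketch (not from the original code) of the same
# threshold-into-edge-list pattern used in Build_Content_Graph_From_Rg, with a
# plain similarity matrix standing in for cebr.compute_similarity. It shows why
# the inner loop starts at j = i + 1 (each unordered pair is visited once) and
# how the edges and weights lists stay aligned.
def edges_above_threshold(sim_matrix, criterion):
    '''sim_matrix: square matrix (list of lists) of pairwise similarities.'''
    edges = []
    weights = []
    n = len(sim_matrix)
    for i in range(n):
        for j in range(i + 1, n):            # upper triangle only
            if sim_matrix[i][j] > criterion:
                edges.append((i, j))
                weights.append(sim_matrix[i][j])
    return edges, weights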
    def build_prime_dict(self,n=1000):
        '''
        Build the prime dict; this only needs to run once.
        '''
        self.prime_dict=dict()
        self.prime_dict[-1]=1

        for i in range(n):
            self.prime_dict[i]=prime.prime(i+1)
        print self.prime_dict
        FSDao.write_pickle(self.base_dir+self.expr_dir,SQLDao.ce.properties['prime_dict'],self.prime_dict)
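# build_prime_dict depends on an external prime.prime(n) helper that is not
# shown in this snippet. A minimal stand-in, under the assumption that
# prime.prime(n) returns the n-th prime with prime(1) == 2, could look like:
def nth_prime(n):
    '''Return the n-th prime, counting from nth_prime(1) == 2 (assumed semantics).'''
    count = 0
    candidate = 1
    while count < n:
        candidate += 1
        if all(candidate % p for p in range(2, int(candidate ** 0.5) + 1)):
            count += 1
    return candidate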
def SerializeDendrogram(directory, graph_filename, dendrogram_filename):
    """
    In this function, we
    1.  read the pickled graph from the file system
    2.  crudely compute the communities with the fast greedy algorithm
    3.  then dump the pickled dendrogram object
    """
    graph = d.trace(FSDao.read_pickle_graph)(graph_filename)
    # we need some filtering here
    global CRITERION_VERTEX_OCCUR_COUNT
    graph = d.trace(graph.subgraph)(graph.vs.select(occur_count_ge=CRITERION_VERTEX_OCCUR_COUNT))
    vertexDendrogram = d.trace(graph.community_fastgreedy)()
    FSDao.write_pickle(directory, dendrogram_filename, vertexDendrogram)
    pass
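# A short follow-up sketch (not from the original code) showing how the
# serialized dendrogram can later be turned into an actual partition with
# igraph's VertexDendrogram.as_clustering(); the function name is illustrative.
def LoadDendrogramAsClustering(directory, dendrogram_filename):
    import pickle
    f = open(directory + dendrogram_filename, 'rb')
    vertexDendrogram = pickle.load(f)
    f.close()
    # cut the dendrogram at the level igraph considers optimal (max modularity)
    return vertexDendrogram.as_clustering()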
def Gen_Filtering_Graph(relation_count=100, ori_tw_count=30):
    '''
    Generate a graph filtered by the given criteria (minimum relation count and original tweet count).
    '''
    import FSDao
    import igraph
    g = FSDao.read_pickle_graph(properties['base_dir'] +
                                properties['expr_dir'] + 'relation.pickle.old')

    f = open(properties['base_dir'] + 'users_unfilter.csv')
    uid_list = []
    for line in f:
        t = line.strip().split(',')
        if int(t[1]) <= relation_count:
            continue
        if int(t[2]) <= ori_tw_count:
            continue
        uid_list.append(int(t[0]))
    f.close()

    sub_vlist = [v.index for v in g.vs if v['user_id'] in uid_list]
    g_sub = g.subgraph(sub_vlist)

    #FSDao.write_pickle(properties['base_dir']+properties['expr_dir'],'relation.pickle',g_sub)
    return g_sub
    pass
    def iGraph_CommunityDiscovery(self, step=False):
        '''
        Discover communities in the relation graph (this takes time):
        1. compute the vertexClustering
        2. serialize the vertexClustering

        step indicates whether the run starts from this function
        '''
        # why I serialize the vertexClustering:
        # even when I compute the eigenvectors myself, I still have to construct a vertexClustering myself
        self.vertex_clustering = d.trace(
            self.g.community_leading_eigenvector)()
        print 'modularity is %s' % self.vertex_clustering.modularity
        # print self.vertex_clustering.membership
        print 'finish find community_leading_eigenvector'
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['vertex_clustering_file_name'],
                           self.vertex_clustering)
        pass
    def Load_Partition_Result(self):
        '''
        load the partition result including community_relation.list and community_content.list
        '''
        import igraph
        if 'rg' not in dir(self):
            self._load_relation_graph(self.expr_dir, self.r_graph)
            pass
        if 'cg' not in dir(self):
            self._load_content_graph(self.expr_dir, self.c_graph)
            pass

        # first load the community_relation.list
        f = open(self.expr_dir + SQLDao.ce.properties['matlab_dir'] +
                 'community_relation.list')
        title = f.readline().strip()
        #title=simplejson.loads(title)
        com_list = f.readline().strip().split(' ')
        f.close()
        com_list = [int(a) - 1 for a in com_list]
        self.vc_relation = igraph.clustering.VertexClustering(
            self.rg, com_list)

        # then load the community_content.list
        f = open(self.expr_dir + SQLDao.ce.properties['matlab_dir'] +
                 'community_content.list')
        title = f.readline().strip()
        #title=simplejson.loads(title)
        com_list = f.readline().strip().split(' ')
        f.close()
        com_list = [int(a) - 1 for a in com_list]
        self.vc_content = igraph.clustering.VertexClustering(
            graph=self.cg,
            membership=com_list,
            modularity_params={'weights': 'similarity'})

        import FSDao
        FSDao.write_pickle(self.expr_dir, 'vc_relation.pickle',
                           self.vc_relation)
        FSDao.write_pickle(self.expr_dir, 'vc_content.pickle', self.vc_content)
        print 'finish writing vc_relation.pickle and vc_content.pickle'
        pass
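    # A minimal sketch (not part of the original class) of one way the two
    # partitions loaded above could be compared; the method name and the choice
    # of normalized mutual information are assumptions.
    def Compare_Partitions(self):
        import igraph
        # NMI between the relation-based and content-based communities
        nmi = igraph.compare_communities(self.vc_relation, self.vc_content,
                                         method='nmi')
        print 'NMI between relation and content partitions: %s' % nmi
        return nmi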
    def Load_Community_List(self):
        '''
        Load the community lists generated by MATLAB and write them as a pickle.
        '''
        import os
        self.communities = []
        for r, d, files in os.walk(self.expr_dir +
                                   SQLDao.ce.properties['matlab_dir']):
            for f in files:
                if f.endswith('.list'):
                    cl = self.build_comlist_obj(f)
                    self.communities.append(cl)
                    print 'handled %s' % f
                    pass

        # serialize the communities
        print 'writing communities'
        FSDao.write_pickle(self.expr_dir, 'communities.pickle',
                           self.communities)
        print 'finished writing communities'
        pass
    def KMeansClustering(self, step=False):
        '''
        Integrate Orange here.
        Actually it is a little subtle:
        1.  I don't think k-means is a good way to decide which community a node (user) belongs to,
        but it is the most general one.

        2.  TODO: fixing the number of clusters (rather than choosing it automatically) may be better, but check the results first.
        '''
        data = self.build_orange_data_from_eig_vector()
        # clustering
        self.km = Orange.clustering.kmeans.Clustering(data=data,
                                                      distance=EigDistance)
        # better to wrap the result in a vertex_clustering so modularity can be computed; using Orange to insert into the database is no longer reasonable
        clusters = self.km.clusters

        import igraph
        self.vertex_clustering = igraph.clustering.VertexClustering(
            self.g, clusters)
        print 'writing vertex_clustering'
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['vertex_clustering_file_name'],
                           self.vertex_clustering)
        print 'finished writing vertex_clustering'
def CommunityDiscovery(expr_dir, pickle_filename, dendrogram_file_name):
    '''
    Discover communities in the relation graph (this takes time):
    1. read the pickled graph from the file system
    2. compute the clustering with the leading-eigenvector method
    3. serialize the result
    '''
    print expr_dir + pickle_filename
    #g=FSDao.read_pickle_graph(expr_dir+pickle_filename)
    f = open(expr_dir + pickle_filename, 'rb')
    g = d.trace(pickle.load)(f)
    f.close()

    vertexClustering = d.trace(g.community_leading_eigenvector)()
    FSDao.write_pickle(expr_dir, 'dendrogram.eigen', vertexClustering)

    # edge betweenness
    # vertexDendrogram=d.trace(g.community_edge_betweenness)(directed=True)
    # FSDao.write_pickle(expr_dir,'dendrogram.betweeness',vertexDendrogram)

    # walktrap
    # vertexDendrogram=d.trace(g.community_walktrap)()
    # FSDao.write_pickle(expr_dir,'dendrogram.walkstrap',vertexDendrogram)
    pass
def SerializeBirelationGraph():
    '''
    construct the bi-directional (reciprocated) relation graph and write it to the file system as a pickled graph
    '''
    import igraph

    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    # print len(uids)
    # add users to the graph and construct a dict for index
    g = igraph.Graph(n=0, directed=False)
    uid_to_gidx_dict = {}
    for idx, uid in enumerate(uids):
        # make sure the attribute name is user_id
        g.add_vertex(**{SQLDao.LABEL_USER_GROUP_INFO_USERID: uid[0]})
        uid_to_gidx_dict[uid[0]] = idx
        pass
    print 'Finished adding %s vertices' % len(uids)

    h, ur = d.trace(sql.getOURelations)(reciprocated=True)
    # construct the list of tuples representing the relations between users
    edge_list = []
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]))

    # deduplicate: a reciprocated relation appears in both directions in the result set
    edge_set = set(tuple(sorted(e)) for e in edge_list)
    edge_list = list(edge_set)
    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is very, very slow to add edges one at a time
    g.add_edges(edge_list)
    print 'finish building a graph based on social relation'
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],
                       SQLDao.ce.properties['relation_reciprocated_graph_file_name'], g)
    pass
def SerializeRelationshipGraph():
    '''
    construct the relation graph and write it to the file system as a pickled graph
    '''
    import igraph

    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    sql = SQLDao.SQLDao.getInstance()
    h, ur = d.trace(sql.getOURelations)()
    g = igraph.Graph(n=0, directed=True)
    # add users to the graph and construct a dict for index
    uid_to_gidx_dict={}
    assert SQLDao.LABEL_USER_GROUP_INFO_USERID=='user_id'
    for idx, user_id in enumerate(uids):
        g.add_vertex(user_id=user_id[0])
        uid_to_gidx_dict[user_id[0]] = idx
        pass
    print 'Finished adding vertices'
    # construct the list contain tuples represent the relations between users
    edge_list = []
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]))

    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is very, very slow to add edges one at a time
    g.add_edges(edge_list)
    print 'finish building a graph based on social relation'

    # FSDao.write_graph(g, SQLDao.ce.properties['base_dir']+SQLDao.ce.properties['expr_dir']+'relation.pickle')
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],SQLDao.ce.properties['relation_graph_file_name'], g)
    pass
def Insert_Word_Group_Info_To_DB(source_type,
                                 source_directory,
                                 src_file_name,
                                 sql_table_name,
                                 sheet_name=''):
    '''
    insert the word group info into the DB for data mining,
    since we need to know how a user's words are distributed
    '''
    if source_type == 'excel':
        groupinfo = FSDao.read_excel(source_directory + src_file_name,
                                     sheet_name)
    elif source_type == 'db':
        sqlite = SQLDao.SQLiteDao(source_directory, src_file_name)
        headings, groupinfo = sqlite.get_word_group_info()
        pass
    else:
        raise ValueError('source type error')

    sql = SQLDao.SQLDao.getInstance()
    sql.saveGroupInfo(sql_table_name, groupinfo)
    def leading_eigen_vector(self):
        '''
        I would like to implement the leading-eigenvector method myself; a critical challenge
        is that the result should be exactly the same as igraph's community_leading_eigenvector.
        '''
        import Queue
        import igraph
        import gc
        membership = [0] * len(self.g.vs)
        vc = igraph.clustering.VertexClustering(self.g, membership)
        max_com_label = 0
        q = Queue.Queue()
        q.put(0)
        max_modularity = vc.modularity

        # begin split iteratively until no split can increase the modularity
        while q.qsize() > 0:
            cur_label = q.get()
            print 'cur label:%s, qsize: %s' % (cur_label, q.qsize())
            vertex_list = [
                idx for idx, com_label in enumerate(vc.membership)
                if com_label == cur_label
            ]
            graph_to_split = self.g.subgraph(vertex_list)
            mod_matrix = self.compute_modularity_matrix(graph_to_split)
            gc.collect()
            eig = self.compute_largest_eigenvector(mod_matrix)
            # positive index and negative index
            if eig[0] > 0:
                positive_index = [
                    vertex_list[idx] for idx, v in enumerate(eig) if v > 0
                ]
                negative_index = [
                    vertex_list[idx] for idx, v in enumerate(eig) if v <= 0
                ]
            else:
                positive_index = [
                    vertex_list[idx] for idx, v in enumerate(eig) if v <= 0
                ]
                negative_index = [
                    vertex_list[idx] for idx, v in enumerate(eig) if v > 0
                ]

            # note that there is no setter for membership
            new_membership = vc.membership
            # vertices in positive_index keep cur_label;
            # vertices in negative_index get max_com_label + 1
            for idx in negative_index:
                new_membership[idx] = max_com_label + 1
            vc_temp = igraph.clustering.VertexClustering(
                self.g, new_membership)
            # print 'new_membership:%s'%vc_temp.membership
            print 'old modularity: %s, new modularity: %s\n' % (
                vc.modularity, vc_temp.modularity)
            if vc_temp.modularity > vc.modularity:
                vc = vc_temp
                q.put(cur_label)
                q.put(max_com_label + 1)
                max_com_label = max_com_label + 1
            else:
                pass
            pass

        self.vertex_clustering = vc
        # print self.vertex_clustering.membership
        print 'writing vertex_clustering'
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['vertex_clustering_file_name'],
                           self.vertex_clustering)
        print 'finished writing vertex_clustering'
        pass
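    # The helpers compute_modularity_matrix and compute_largest_eigenvector are
    # referenced above but not shown in this snippet. Minimal sketches, assuming
    # the standard Newman definitions (B = A - k k^T / 2m, and the eigenvector
    # belonging to B's largest eigenvalue); the original implementations may differ.
    def compute_modularity_matrix(self, graph):
        import numpy
        adj = numpy.array(graph.get_adjacency().data, dtype='float64')
        degrees = numpy.array(graph.degree(), dtype='float64')
        two_m = degrees.sum()                      # 2m, the sum of all degrees
        return adj - numpy.outer(degrees, degrees) / two_m

    def compute_largest_eigenvector(self, mod_matrix):
        import numpy
        values, vectors = numpy.linalg.eigh(mod_matrix)   # symmetric, so eigh
        return vectors[:, numpy.argmax(values)]           # eigenvector of the largest eigenvalue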
 def __init__(self, base_dir, expr_dir):
     self.base_dir = base_dir
     self.expr_dir = expr_dir
     self.g = FSDao.read_pickle_graph(
         self.base_dir + self.expr_dir +
         SQLDao.ce.properties['relation_graph_file_name'])
     pass
    def KMeansClustering_Iterative(self, step=False):
        '''
        Integrate Orange here.
        Actually it is a little subtle:
        1.  I don't think k-means is a good way to decide which community a node (user) belongs to,
        but it is the most general one.

        2.  TODO: fixing the number of clusters (rather than choosing it automatically) may be better, but check the results first.
        '''
        eig_data = self.build_orange_data_from_eig_vector()
        # clustering
        self.km = Orange.clustering.kmeans.Clustering(data=eig_data,
                                                      centroids=5,
                                                      distance=EigDistance)
        # better to wrap the result in a vertex_clustering so modularity can be computed; using Orange to insert into the database is no longer reasonable
        clusters = self.km.clusters

        d = {}
        for idx, c in enumerate(clusters):
            if not d.has_key(c):
                d[c] = [idx]
            else:
                d[c].append(idx)

        import Queue
        q = Queue.Queue()

        for v in d.values():
            q.put(v)

        res_list = []

        import CommunityExtraction as ce
        while q.qsize() > 0:
            v = q.get()
            print 'qsize:%s cluster size: %s res list size: %s' % (
                q.qsize(), len(v), len(res_list))
            if len(v) < ce.CRITERION_CLUSTER_NODES_LOWER_BOUND:
                res_list.append(v)
                pass
            elif len(v) > ce.CRITERION_CLUSTER_NODES_UPPER_BOUND:
                # may be it can be iterative
                sub_data = eig_data.get_items(v)
                sub_km = Orange.clustering.kmeans.Clustering(
                    data=sub_data, centroids=5, distance=EigDistance)
                sub_clusters = sub_km.clusters
                temp_d = dict()
                for idx, c in enumerate(sub_clusters):
                    if not temp_d.has_key(c):
                        temp_d[c] = [v[idx]]
                    else:
                        temp_d[c].append(v[idx])

                for sub_v in temp_d.values():
                    q.put(sub_v)
                pass
            else:
                res_list.append(v)
                pass
            pass

        clusters = [0] * len(eig_data)
        for idx, res in enumerate(res_list):
            for r in res:
                clusters[r] = idx
            pass

        import igraph
        self.vertex_clustering = igraph.clustering.VertexClustering(
            self.g, clusters)
        print 'writing vertex_clustering'
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['vertex_clustering_file_name'],
                           self.vertex_clustering)
        print 'finished writing vertex_clustering'
            d[p.main_category] += 1

        l = d.values()

        total = sum(l)
        a = [float(i) / float(total) for i in l]
        return max(a)

import pickle
import FSDao
import SQLDao
if __name__ == '__main__':
    init_category_list()
    poster_list=init_poster_and_tweets()
    tester_list=init_tester(poster_list)
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],
                       'tester_list.pickle', tester_list)


def simulate(filename):
    print 'simulating...'
    eva_f = open(SQLDao.ce.properties['base_dir'] +
                 SQLDao.ce.properties['expr_dir'] + filename, 'w')
    for t in tester_list:
        while True:
            purity_rc = random.gauss(0.8, 0.1)
            if 0 < purity_rc < 1.0:
                break
    g.add_edges(edges_t)

    g.es['similarity'] = [e[2] for e in edges]

    l_c = get_components_by_similarity(g, sim_criterion)
    print l_c

    try:
        assert l_c == 1
    except AssertionError, e:
        return g

    assert isinstance(g, igraph.Graph)

    import FSDao
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],
                       'content_with_similarity.pickle', g)

    return g

def build_full_adjacency_matrix(layer):
    f = open(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'] +
             SQLDao.ce.properties['target_user_without_edge_graph_file'], 'rb')
    g = pickle.load(f)
    f.close()
    print 'finished loading graph'

    sql = SQLDao.SQLDao.getInstance()

    assert isinstance(sql, SQLDao.SQLDao)
    f = open(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'] +
             'matlab/' + 'adj_content_full', 'a+')
    for v in g.vs:
 def Serialize_ContentGraph(self):
     import gc
     gc.collect()
     FSDao.write_pickle(self.base_dir + self.expr_dir, 'content.pickle', self.g)
     pass