def Summarize_User_Group_Specify(groupinfo_tablename, db_dir, db_file_name):
    """
    1.  Summarize the user group info specification; note that some data purging problems still have to be handled here
    2.  Write the user group specification to the database
    """
    sql = SQLDao.SQLDao.getInstance()
    (headings, user_group_info) = d.trace(sql.getUserGroupSpecify)(groupinfo_tablename)

    # print debug message
    print "user_group_info:%s" % len(user_group_info)
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    d.trace(sqlite.save_user_group_info)(user_group_info)
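# A hedged usage sketch: the table name, directory and database file name below are
# hypothetical placeholders, not names taken from the original experiment setup.
def _example_summarize_user_group():
    Summarize_User_Group_Specify('user_group_specify_table',  # hypothetical source table
                                 '/data/expr/',                # hypothetical SQLite directory
                                 'community.db')               # hypothetical SQLite file name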
def Dendrogram(directory, vertexDendrogramFile, write_type, filename="", comment=None):
    """
    In this function, we will
    1.  calculate the group info iteratively, so that each group is neither too big nor too small
    2.  write the word network to an Excel file or to community.db
    """
    # read dendrogram from file system
    f = open(directory + vertexDendrogramFile, "rb")
    vertexDendrogram = d.trace(pickle.load)(f)
    f.close()

    vertexClustering = d.trace(vertexDendrogram.as_clustering)()
    subgraphs = d.trace(vertexClustering.subgraphs)()
    subgraphs_accordance = []
    # keep only the subgraphs whose size satisfies
    # CRITERION_CLUSTER_NODES_LOWER_BOUND <= size(subgraph) <= CRITERION_CLUSTER_NODES_UPPER_BOUND
    while len(subgraphs) > 0:
        print "subgraphs size: %s" % len(subgraphs)
        g = subgraphs.pop()
        nodes = g.vs
        print "nodes size: %s" % len(nodes)
        if len(nodes) > CRITERION_CLUSTER_NODES_UPPER_BOUND:
            # too large: find communities inside this subgraph and push them back onto the stack
            vd = d.trace(g.community_fastgreedy)()
            vc = d.trace(vd.as_clustering)()
            sgs = d.trace(vc.subgraphs)()
            print "new subgraphs count(and all of them will be pushed) %s" % len(sgs)
            for sg in sgs:
                subgraphs.append(sg)

        elif len(nodes) < CRITERION_CLUSTER_NODES_LOWER_BOUND:
            # omit this community here
            pass
        else:
            # write this community to the file system here
            subgraphs_accordance.append(g)
            pass

    # (subgraphs with fewer than CRITERION_CLUSTER_NODES_LOWER_BOUND nodes were omitted above)
    # assign a group id to every node of each retained subgraph
    groupinfo = []
    gid = 0
    for g in subgraphs_accordance:
        nodes = g.vs
        gid += 1
        for node in nodes:
            # groupinfo.append([node['dbIndex'],node['name'],gid])
            groupinfo.append({LABEL_VERTEX_ID: node["dbIndex"], LABEL_NOUN: node["name"], LABEL_GROUP_ID: gid})

    if write_type == "excel":
        d.trace(FSDao.write_excel)(
            directory, filename, "group", [LABEL_VERTEX_ID, LABEL_NOUN, LABEL_GROUP_ID], groupinfo, comment
        )
    elif write_type == "db":
        # write them to the community db
        sqlite = SQLDao.SQLiteDao(directory, filename)
        sqlite.save_word_group_info(groupinfo)
        pass
    else:
        raise ValueError("write type error")
    return groupinfo
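# A hedged usage sketch for Dendrogram; the directory and file names are hypothetical
# placeholders, shown only to illustrate the two supported write_type values.
def _example_dendrogram():
    # write the bounded word groups to an Excel sheet
    Dendrogram('/data/expr/', 'dendrogram.pickle', 'excel',
               filename='word_groups.xls', comment='bounded communities')
    # or write the same groups into the community SQLite database
    Dendrogram('/data/expr/', 'dendrogram.pickle', 'db', filename='community.db')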
def SerializeDendrogram(directory, graph_filename, dendrogram_filename):
    """
    In this function, we
    1.  read the pickled graph from the file system
    2.  roughly compute the community dendrogram with the fast greedy algorithm
    3.  then dump the resulting dendrogram as a pickled object
    """
    graph = d.trace(FSDao.read_pickle_graph)(graph_filename)
    # we need some filtering here
    global CRITERION_VERTEX_OCCUR_COUNT
    graph = d.trace(graph.subgraph)(graph.vs.select(occur_count_ge=CRITERION_VERTEX_OCCUR_COUNT))
    vertexDendrogram = d.trace(graph.community_fastgreedy)()
    FSDao.write_pickle(directory, dendrogram_filename, vertexDendrogram)
    pass
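# FSDao.read_pickle_graph and FSDao.write_pickle are project helpers that are not shown
# in this listing; a minimal sketch of what they are assumed to do (thin wrappers around
# the standard pickle module) could look like the following. Names are illustrative only.
def _read_pickle_graph_sketch(path):
    import pickle
    # load a previously pickled igraph.Graph from the given path
    f = open(path, 'rb')
    try:
        return pickle.load(f)
    finally:
        f.close()

def _write_pickle_sketch(directory, filename, obj):
    import pickle
    # dump any picklable object (graph, dendrogram, clustering) under directory
    f = open(directory + filename, 'wb')
    try:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
    finally:
        f.close()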
def CommunityDiscovery(expr_dir, pickle_filename, dendrogram_file_name):
    '''
    discover communities in the relation graph (this takes time)
    1. read the pickled graph from the file system
    2. compute the clustering (leading eigenvector)
    3. serialize the clustering
    '''
    print expr_dir + pickle_filename
    #g=FSDao.read_pickle_graph(expr_dir+pickle_filename)
    f = open(expr_dir + pickle_filename, 'rb')
    g = d.trace(pickle.load)(f)
    f.close()

    vertexClustering = d.trace(g.community_leading_eigenvector)()
    FSDao.write_pickle(expr_dir, 'dendrogram.eigen', vertexClustering)

    # edge betweenness
    # vertexDendrogram=d.trace(g.community_edge_betweenness)(directed=True)
    # FSDao.write_pickle(expr_dir,'dendrogram.betweeness',vertexDendrogram)

    # walktrap
    # vertexDendrogram=d.trace(g.community_walktrap)()
    # FSDao.write_pickle(expr_dir,'dendrogram.walkstrap',vertexDendrogram)
    pass
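# A hedged sketch of how the serialized clustering could be inspected afterwards;
# it reuses the 'dendrogram.eigen' file name written above, while the helper name
# and the expr_dir argument are illustrative only.
def _example_inspect_clustering(expr_dir):
    import pickle
    f = open(expr_dir + 'dendrogram.eigen', 'rb')
    vc = pickle.load(f)
    f.close()
    print 'modularity: %s' % vc.modularity
    print 'community sizes: %s' % sorted(vc.sizes(), reverse=True)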
def SerializeBirelationGraph():
    '''
    construct the bi-relational graph and write it to the file system as a pickled graph
    '''
    import igraph

    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    # print len(uids)
    # add users to the graph and construct a dict for index
    g = igraph.Graph(n=0, directed=False)
    uid_to_gidx_dict = {}
    for idx, uid in enumerate(uids):
        # make sure the name is user_id
        g.add_vertex(**{SQLDao.LABEL_USER_GROUP_INFO_USERID: uid[0]})
        uid_to_gidx_dict[uid[0]] = idx
        pass
    print 'Finished adding vertices: %s' % len(uids)

    h, ur = d.trace(sql.getOURelations)(reciprocated=True)
    # construct a list of tuples representing the relations between users
    edge_list = []
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]))

    # deduplicate: with reciprocated relations each undirected edge appears twice in the result
    edge_list = list(set(tuple(sorted(e)) for e in edge_list))
    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is very slow to add edges one at a time, so they are added in a single batch
    g.add_edges(edge_list)
    print 'finished building the graph from social relations'
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],SQLDao.ce.properties['relation_reciprocated_graph_file_name'], g)
    pass
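# A hedged sanity-check sketch for the pickled bi-relation graph; the helper and its
# path argument are illustrative and not part of the original code.
def _example_check_birelation_graph(path):
    import pickle
    f = open(path, 'rb')
    g = pickle.load(f)
    f.close()
    print 'vertices: %s, edges: %s' % (g.vcount(), g.ecount())
    # after deduplicating the reciprocated pairs the graph should be simple
    print 'simple graph (no loops / multi-edges): %s' % g.is_simple()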
def SerializeRelationshipGraph():
    '''
    construct the graph and write it to the file system as a pickled graph
    '''
    import igraph

    sql = SQLDao.SQLDao.getInstance()
    h1, uids = d.trace(sql.getAllOU)()
    sql = SQLDao.SQLDao.getInstance()
    h, ur = d.trace(sql.getOURelations)()
    g = igraph.Graph(n=0, directed=True)
    # add users to the graph and construct a dict for index
    uid_to_gidx_dict={}
    assert SQLDao.LABEL_USER_GROUP_INFO_USERID=='user_id'
    for idx, user_id in enumerate(uids):
        g.add_vertex(user_id=user_id[0])
        uid_to_gidx_dict[user_id[0]] = idx
        pass
    print 'Finished adding vertices'
    # construct a list of tuples representing the relations between users
    edge_list = []
    for idx, rec in enumerate(ur):
        if idx % 1000 == 0:
            print 'edge %s' % idx
        sid = rec[SQLDao.LABEL_SRC_USERID]
        tid = rec[SQLDao.LABEL_TAR_USERID]
        edge_list.append((uid_to_gidx_dict[sid], uid_to_gidx_dict[tid]))

    print 'Finish constructing edge list %s' % len(edge_list)
    # Note: it is very slow to add edges one at a time, so they are added in a single batch
    g.add_edges(edge_list)
    print 'finished building the graph from social relations'

    # FSDao.write_graph(g, SQLDao.ce.properties['base_dir']+SQLDao.ce.properties['expr_dir']+'relation.pickle')
    FSDao.write_pickle(SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'],SQLDao.ce.properties['relation_graph_file_name'], g)
    pass
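# A hedged sketch of basic statistics on the serialized directed relation graph; it
# reads the same config properties used above, while the helper name is illustrative only.
def _example_relation_graph_stats():
    import pickle
    path = (SQLDao.ce.properties['base_dir'] + SQLDao.ce.properties['expr_dir'] +
            SQLDao.ce.properties['relation_graph_file_name'])
    f = open(path, 'rb')
    g = pickle.load(f)
    f.close()
    print 'vertices: %s, edges: %s' % (g.vcount(), g.ecount())
    print 'max out-degree: %s, max in-degree: %s' % (max(g.outdegree()), max(g.indegree()))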
    def iGraph_CommunityDiscovery(self, step=False):
        '''
        discover communities in the relation graph (this takes time)
        1. compute the vertexClustering
        2. serialize the vertexClustering

        step indicates whether the pipeline starts from this function
        '''
        # why the vertexClustering is serialized:
        # even if the eigenvectors were computed manually, a VertexClustering would still have to be constructed by hand
        self.vertex_clustering = d.trace(
            self.g.community_leading_eigenvector)()
        print 'modularity is %s' % self.vertex_clustering.modularity
        # print self.vertex_clustering.membership
        print 'finished finding communities with community_leading_eigenvector'
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['vertex_clustering_file_name'],
                           self.vertex_clustering)
        pass
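    # A hedged sketch of the point made in the comment above: given a membership list
    # (one community index per vertex of self.g), a VertexClustering can be constructed
    # by hand and its modularity inspected. The method name is illustrative only.
    def _example_manual_vertex_clustering(self, membership):
        import igraph
        vc = igraph.clustering.VertexClustering(self.g, membership)
        print 'modularity of the manually built clustering: %s' % vc.modularity
        return vc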
    def KMeansClustering_Iterative(self, step=False):
        '''
        Integrate Orange here; this is actually a little subtle:
        1.  k-means is probably not a good way to decide which community a node (user)
            should belong to, but it is the most general one
        2.  TODO: tuning the number of clusters (rather than choosing it automatically)
            may work better, but check the results first
        '''
        eig_data = self.build_orange_data_from_eig_vector()
        # clustering
        self.km = Orange.clustering.kmeans.Clustering(data=eig_data,
                                                      centroids=5,
                                                      distance=EigDistance)
        # construct the result into a vertex_clustering so that modularity can be computed;
        # it is no longer reasonable to use Orange to insert into the database directly
        clusters = self.km.clusters

        d = {}
        for idx, c in enumerate(clusters):
            if not d.has_key(c):
                d[c] = [idx]
            else:
                d[c].append(idx)

        import Queue
        q = Queue.Queue()

        for v in d.values():
            q.put(v)

        res_list = []

        import CommunityExtraction as ce
        while q.qsize() > 0:
            v = q.get()
            print 'qsize:%s cluster size: %s res list size: %s' % (
                q.qsize(), len(v), len(res_list))
            if len(v) < ce.CRITERION_CLUSTER_NODES_LOWER_BOUND:
                res_list.append(v)
                pass
            elif len(v) > ce.CRITERION_CLUSTER_NODES_UPPER_BOUND:
                # still too large: split it again (the pieces go back onto the queue)
                sub_data = eig_data.get_items(v)
                sub_km = Orange.clustering.kmeans.Clustering(
                    data=sub_data, centroids=5, distance=EigDistance)
                sub_clusters = sub_km.clusters
                temp_d = dict()
                for idx, c in enumerate(sub_clusters):
                    if not temp_d.has_key(c):
                        temp_d[c] = [v[idx]]
                    else:
                        temp_d[c].append(v[idx])

                for sub_v in temp_d.values():
                    q.put(sub_v)
                pass
            else:
                res_list.append(v)
                pass
            pass

        clusters = [0] * len(eig_data)
        for idx, res in enumerate(res_list):
            for r in res:
                clusters[r] = idx
            pass

        import igraph
        self.vertex_clustering = igraph.clustering.VertexClustering(
            self.g, clusters)
        print 'writing vertex_clustering'
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['vertex_clustering_file_name'],
                           self.vertex_clustering)
        print 'finished writing vertex_clustering'
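# The queue-based splitting above can be summarized by this hedged, Orange-free sketch
# (not part of the original code): clusters larger than the upper bound are re-split by a
# caller-supplied function until every piece fits; smaller clusters are kept as-is,
# mirroring the branches above.
def _split_until_bounded(clusters, split_fn, upper_bound):
    # clusters: list of lists of vertex indices
    # split_fn: callable taking one oversized list and returning a list of sub-lists;
    #           it is assumed to actually split its input, otherwise this loops forever
    import Queue
    q = Queue.Queue()
    for c in clusters:
        q.put(c)
    result = []
    while q.qsize() > 0:
        c = q.get()
        if len(c) > upper_bound:
            for sub in split_fn(c):
                q.put(sub)
        else:
            result.append(c)
    return result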
def Guess_User_Group_by_KMeans(db_dir, db_file_name):
    """
    1.  get the distinct user ids
    2.  for each user id, compute which group it should belong to
        2.1 convert the data to an Orange data table
        2.2 run k-means
    3.  save the results into the database
    """
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    h1, uids = sqlite.get_distinct_user_id()

    user_group_dict = {}

    for uid in uids:
        # retrieve the user group info of a specific user
        h2, uid_group_info = sqlite.get_group_info_by_user_id(uid[SQLDao.LABEL_USER_GROUP_INFO_USERID])
        # convert the uid group info into the orange data table
        features = []
        features.append(Orange.feature.Continuous(SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT))
        domain = Orange.data.Domain(features, False)
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(), Orange.feature.Continuous(SQLDao.LABEL_USER_GROUP_INFO_USERID)
        )
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(), Orange.feature.Continuous(SQLDao.LABEL_USER_GROUP_INFO_GROUPID)
        )
        datas = []
        for i in uid_group_info:
            data = Orange.data.Instance(domain, [i[SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT]])
            data[SQLDao.LABEL_USER_GROUP_INFO_USERID] = i[SQLDao.LABEL_USER_GROUP_INFO_USERID]
            data[SQLDao.LABEL_USER_GROUP_INFO_GROUPID] = i[SQLDao.LABEL_USER_GROUP_INFO_GROUPID]
            datas.append(data)

        table = Orange.data.Table(domain, datas)
        target_instances = []
        if len(table) > 3:
            km = Orange.clustering.kmeans.Clustering(data=table, distance=GroupCountDistance)
            clusters = km.clusters
            d = {}
            for idx, c_label in enumerate(clusters):
                if d.has_key(c_label):
                    d[c_label].append(table[idx])
                else:
                    d[c_label] = [table[idx]]

            if len(d) == 3:
                # figure out which cluster contains the largest group_count values
                max_label = None
                max_value = -1
                for label, instances in d.items():
                    temp_list = [i[SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT].value for i in instances]
                    if max(temp_list) > max_value:
                        max_value = max(temp_list)
                        max_label = label
                        pass
                for instance in d[max_label]:
                    target_instances.append(instance)
        else:
            # otherwise just pick the group with the largest group_count, provided it is large enough
            table.sort([SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT])
            if table[-1][SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT].value > 20:
                target_instances.append(table[-1])

        # print 'processing %s'%uid[SQLDao.LABEL_USER_GROUP_INFO_USERID]
        user_group_dict[uid[SQLDao.LABEL_USER_GROUP_INFO_USERID]] = target_instances
        pass

    print "finish cluster"
    sqlite.save_user_group_clustered(user_group_dict)
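# A hedged, Orange-free distillation of the selection rule used above when k-means yields
# three clusters: keep the cluster whose members reach the largest group_count. The helper
# and its argument format are illustrative only, not part of the original code.
def _pick_dominant_cluster(clustered):
    # clustered: dict mapping cluster label -> list of group_count values
    max_label, max_value = None, -1
    for label, counts in clustered.items():
        if max(counts) > max_value:
            max_value = max(counts)
            max_label = label
    return clustered[max_label] if max_label is not None else []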