コード例 #1
0
def _build_group_count_table(uid_group_info):
    """Convert one user's group rows into an Orange data table.

    Each row is indexed by the ``SQLDao.LABEL_USER_GROUP_INFO_*`` labels.
    The group count is the only regular feature; the user id and group id
    travel along as meta attributes so they survive clustering.

    Returns ``None`` when ``uid_group_info`` yields no rows, so callers never
    have to index into an empty table.
    """
    count_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT
    userid_label = SQLDao.LABEL_USER_GROUP_INFO_USERID
    groupid_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPID

    domain = Orange.data.Domain([Orange.feature.Continuous(count_label)], False)
    for meta_label in (userid_label, groupid_label):
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(), Orange.feature.Continuous(meta_label)
        )

    instances = []
    for row in uid_group_info:
        instance = Orange.data.Instance(domain, [row[count_label]])
        instance[userid_label] = row[userid_label]
        instance[groupid_label] = row[groupid_label]
        instances.append(instance)

    if not instances:
        return None
    return Orange.data.Table(domain, instances)


def _select_target_instances(table):
    """Pick the instances of a non-empty table that look like the user's groups.

    With more than three rows the table is clustered with k-means and, if
    exactly three clusters come back, every member of the cluster holding the
    single largest group count is returned.  With three rows or fewer, the
    row with the largest group count is returned, but only if that count
    exceeds 20.  In every other case the result is an empty list.
    """
    count_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT

    if len(table) > 3:
        # NOTE(review): presumably relies on the k-means default of three
        # centroids, which is why exactly three clusters are expected - confirm.
        km = Orange.clustering.kmeans.Clustering(data=table, distance=GroupCountDistance)

        # km.clusters holds one cluster label per table row, in row order.
        members_by_cluster = {}
        for idx, c_label in enumerate(km.clusters):
            members_by_cluster.setdefault(c_label, []).append(table[idx])

        if len(members_by_cluster) != 3:
            return []

        def peak_count(label):
            return max(member[count_label].value for member in members_by_cluster[label])

        # The cluster containing the largest group count wins; on a tie the
        # first cluster in dict iteration order is kept.
        best_label = max(members_by_cluster, key=peak_count)
        return list(members_by_cluster[best_label])

    # Too few rows to cluster: just pick the group which has the largest
    # group_count, if it is large enough.  The sort puts it last.
    table.sort([count_label])
    largest = table[-1]
    if largest[count_label].value > 20:
        return [largest]
    return []


def Guess_User_Group_by_KMeans(db_dir, db_file_name):
    """Guess, for every user, which group(s) the user belongs to and save them.

    1.  get distinct user ids
    2.  foreach user id, compute which group should it be
        2.1 convert the data to orange data table
        2.2 kmeans (or a simple largest-count pick when there are few rows)
    3.  save them into database

    Args:
        db_dir: passed through to ``SQLDao.SQLiteDao``.
        db_file_name: passed through to ``SQLDao.SQLiteDao``.

    Users without any group rows are stored with an empty list instead of
    aborting the whole run.
    """
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    _, uids = sqlite.get_distinct_user_id()

    user_group_dict = {}

    for uid in uids:
        user_id = uid[SQLDao.LABEL_USER_GROUP_INFO_USERID]
        # retrieve the user group info of a specific user
        _, uid_group_info = sqlite.get_group_info_by_user_id(user_id)

        table = _build_group_count_table(uid_group_info)
        if table is None:
            user_group_dict[user_id] = []
        else:
            user_group_dict[user_id] = _select_target_instances(table)

    # Parenthesised single argument prints identically on Python 2 and 3.
    print("finish cluster")
    sqlite.save_user_group_clustered(user_group_dict)
コード例 #2
0
def Guess_User_Group_by_KMeans(db_dir, db_file_name):
    """Guess, for every user, which group(s) the user belongs to and save them.

    1.  get distinct user ids
    2.  foreach user id, compute which group should it be
        2.1 convert the data to orange data table
        2.2 kmeans
    3.  save them into database

    Args:
        db_dir: passed through to ``SQLDao.SQLiteDao``.
        db_file_name: passed through to ``SQLDao.SQLiteDao``.

    Every user id ends up in the saved mapping; the value is the (possibly
    empty) list of Orange instances selected for that user.  A user with no
    group rows gets an empty list rather than raising on an empty table.
    """
    count_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT
    userid_label = SQLDao.LABEL_USER_GROUP_INFO_USERID
    groupid_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPID

    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    _, uids = sqlite.get_distinct_user_id()

    user_group_dict = {}

    for uid in uids:
        user_id = uid[userid_label]
        # retrieve the user group info of a specific user
        _, uid_group_info = sqlite.get_group_info_by_user_id(user_id)

        # convert the uid group info into the orange data table: the group
        # count is the only feature, user id and group id are meta attributes
        domain = Orange.data.Domain(
            [Orange.feature.Continuous(count_label)], False)
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(),
            Orange.feature.Continuous(userid_label))
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(),
            Orange.feature.Continuous(groupid_label))

        datas = []
        for row in uid_group_info:
            data = Orange.data.Instance(domain, [row[count_label]])
            data[userid_label] = row[userid_label]
            data[groupid_label] = row[groupid_label]
            datas.append(data)

        target_instances = []
        # Skip table handling entirely when the user has no rows; indexing
        # table[-1] on an empty table would otherwise abort the whole run.
        if datas:
            table = Orange.data.Table(domain, datas)
            if len(table) > 3:
                # NOTE(review): presumably relies on the k-means default of
                # three centroids, hence the "exactly three clusters" check
                # below - confirm.
                km = Orange.clustering.kmeans.Clustering(
                    data=table, distance=GroupCountDistance)

                # km.clusters holds one cluster label per row, in row order
                d = {}
                for idx, c_label in enumerate(km.clusters):
                    d.setdefault(c_label, []).append(table[idx])

                if len(d) == 3:
                    # figure out which cluster contains the largest group
                    # count; on a tie the first cluster iterated is kept
                    max_label = None
                    max_value = None
                    for label, instances in d.items():
                        cluster_max = max(
                            inst[count_label].value for inst in instances)
                        if max_value is None or cluster_max > max_value:
                            max_value = cluster_max
                            max_label = label
                    target_instances.extend(d[max_label])
            else:
                # just pick the group which has the largest group_count if
                # it is large enough; the sort puts that row last
                table.sort([count_label])
                if table[-1][count_label].value > 20:
                    target_instances.append(table[-1])

        # print 'processing %s'%uid[SQLDao.LABEL_USER_GROUP_INFO_USERID]
        user_group_dict[user_id] = target_instances

    # Parenthesised single argument prints identically on Python 2 and 3.
    print('finish cluster')
    sqlite.save_user_group_clustered(user_group_dict)