Example 1
def test():
    kind_list = [
        'C3-Art', 'C19-Computer', 'C7-History', 'C32-Agriculture',
        'C31-Enviornment'
    ]
    stopwords = tl.read_stopwords()
    doc = []
    for i in kind_list:
        ans = tl.read_kind(i, 50)
        for j in ans:
            doc.append(tl.cut_without_stopwords(j, stopwords))
    print('Tokenization finished')
    pdoc = []
    for i in doc:
        con = ' '.join(i)
        pdoc.append(con)
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(pdoc))
    word = vectorizer.get_feature_names()  # vocabulary terms across all documents (get_feature_names_out in newer scikit-learn)
    weight = tfidf.toarray()  # corresponding TF-IDF weight matrix
    '''
    ans, bag = tV.create_VSM(doc)
    print('Bag-of-words model built')
    TF_IDF = tT.cal_TFIDF(ans)
    print('TF-IDF weight matrix computed')
    # TF_IDF = tP.create_PCA(TF_IDF, int(0.1 * TF_IDF.shape[1]))
    '''
    cluster = tc.KMeans(weight, len(kind_list))
    print('Clustering result obtained')
    return cluster, kind_list
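The CountVectorizer + TfidfTransformer pair above can be collapsed into a single TfidfVectorizer. A minimal sketch on toy documents (the tl.* helpers are project-specific and not reproduced here; get_feature_names_out is the newer scikit-learn name for the call used above):

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy whitespace-tokenized documents standing in for pdoc above.
docs = ['machine learning text', 'text clustering kmeans', 'machine clustering']

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)       # sparse TF-IDF matrix, one row per document
terms = vectorizer.get_feature_names_out()   # vocabulary, one entry per column
weight = tfidf.toarray()                     # dense matrix that would feed the clustering step
print(terms)
print(weight.shape)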
def runClustering(weights, k, featureFunctionMapping, businesses, truthPairs,
                  truthIds, cache):
    cacheId = paramId(weights, k)
    if cacheId in cache:
        return cache[cacheId]

    featureDistMap = featureDistanceMap.FeatureDistanceMap(
        featureFunctionMapping, weights)
    kMeans = clustering.KMeans(k, featureDistMap)

    id = "\t".join([str(weight) for weight in weights])
    randIndex = -1

    try:
        clusters = kMeans.cluster(businesses)
        randIndex = metrics.randIndex(clusters, businesses, truthPairs,
                                      truthIds)
        print("%s\t%f" % (id, randIndex), file=sys.stderr)
        '''
        for i in range(len(clusters)):
            print("Cluster: %02d, Size: %02d" % (i, len(clusters[i])))
            print("         %s" % (", ".join([str(x) for x in sorted([businesses[index].otherInfo['yelpId'] for index in clusters[i]])])))
        '''
    except Exception as ex:
        print(ex)
        print("%s\tERROR" % (id), file=sys.stderr)

    cache[cacheId] = randIndex

    return randIndex
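metrics.randIndex above is project code; for reference, a minimal self-contained sketch of the Rand index (the fraction of item pairs on which two labelings agree) could look like the following. It takes flat lists of cluster ids rather than the project's cluster/truthPairs structures, so the names here are illustrative only:

from itertools import combinations

def rand_index(labels_a, labels_b):
    # Fraction of item pairs that both labelings place together or apart alike.
    assert len(labels_a) == len(labels_b)
    agree = total = 0
    for i, j in combinations(range(len(labels_a)), 2):
        same_a = labels_a[i] == labels_a[j]
        same_b = labels_b[i] == labels_b[j]
        agree += (same_a == same_b)
        total += 1
    return agree / total if total else 1.0

# Perfect agreement up to label renaming yields 1.0.
print(rand_index([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0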
Example 3
def run():
    businesses = features.getBusinesses(data.DATA_SOURCE_HUMAN_EVAL)

    featureDistMap = featureDistanceMap.FeatureDistanceMap(learnWeights.getFeatureMapping(), WEIGHTS)
    kMeans = clustering.KMeans(K, featureDistMap)

    # kMeans = clustering.KMeans(K, featureDistanceMap.FeatureDistanceMap())

    clusters = kMeans.cluster(businesses)

    for i in range(len(clusters)):
        print("Cluster: %02d, Size: %02d" % (i, len(clusters[i])))
        print("         %s" % (", ".join([str(x) for x in sorted([businesses[index].otherInfo['name'] for index in clusters[i]])])))

    # Metrics
    truthPairs, truthIds = metrics.getHumanTruthPairs()
    print("Rand Index: %f" % (metrics.randIndex(clusters, businesses, truthPairs, truthIds)))
Example 4
def run(businessType):
    businesses = features.getBusinesses(businessType)

    # Arbitrary K
    kMeans = clustering.KMeans(K, featureDistanceMap.FeatureDistanceMap())
    clusters = kMeans.cluster(businesses)

    for i in range(len(clusters)):
        print("Cluster: %02d, Size: %02d" % (i, len(clusters[i])))
        print("         %s" % (", ".join([
            str(x) for x in sorted(
                [businesses[index].otherInfo['name'] for index in clusters[i]])
        ])))

    # Metrics
    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    b_cluster = metrics.getClusterBusinessID(businesses, clusters)
    randIndex = metrics.oldRandIndex(b_cluster, goldLabel)
    print("Old Rand Index: " + str(randIndex))

    print("New Rand Index: %f" % (metrics.randIndex(clusters, businesses)))
def run(weights, k, scalarNorm, setDistance):
    businesses = features.getBusinesses(data.DATA_SOURCE_GROUNDTRUTH_ALL)

    featureDistMap = featureDistanceMap.FeatureDistanceMap(
        buildFeatureMapping(scalarNorm, setDistance), weights)
    kMeans = clustering.KMeans(k, featureDistMap)
    clusters = kMeans.cluster(businesses)

    for i in range(len(clusters)):
        print("Cluster: %02d, Size: %02d" % (i, len(clusters[i])))
        print("         %s" % (", ".join([
            str(x) for x in sorted([
                businesses[index].otherInfo['yelpId'] for index in clusters[i]
            ])
        ])))

    goldLabel = metrics.readGoldLabel("../data/groundtruth")
    b_cluster = metrics.getClusterBusinessID(businesses, clusters)
    randIndex = metrics.oldRandIndex(b_cluster, goldLabel)

    return randIndex
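buildFeatureMapping(scalarNorm, setDistance) is not shown in this excerpt. A plausible sketch of such a mapping, assuming scalar features are compared by (optionally scaled) absolute difference and set-valued features by Jaccard distance; the names below are illustrative, not the project's API:

def scalar_distance(a, b, scale=1.0):
    # Absolute difference, optionally scaled into a comparable range.
    return abs(a - b) / scale

def jaccard_distance(a, b):
    # 1 - |intersection| / |union| for two set-valued features.
    a, b = set(a), set(b)
    if not a and not b:
        return 0.0
    return 1.0 - len(a & b) / len(a | b)

# One distance function per feature position, analogous to the list
# that FeatureDistanceMap appears to accept in the examples above.
feature_mapping = [scalar_distance, jaccard_distance, scalar_distance]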
    def test_kmeansBase(self):
        data = [
            business.Business(10, [0, 0, 0]),
            business.Business(20, [1, 1, 1]),
            business.Business(30, [2, 2, 2]),

            business.Business(411, [10, 10, 10]),
            business.Business(511, [11, 11, 11]),
            business.Business(611, [12, 12, 12]),

            business.Business(7123, [110, 110, 110]),
            business.Business(8123, [111, 111, 111]),
            business.Business(9123, [112, 112, 112])
        ]

        expected = [
            [0, 1, 2],
            [3, 4, 5],
            [6, 7, 8]
        ]

        manhattan = lambda a, b: distance.manhattan([a], [b])
        kMeans = clustering.KMeans(3, featureDistanceMap.FeatureDistanceMap([manhattan, manhattan, manhattan]))
        self.assertEqual(sorted(kMeans.cluster(data)), sorted(expected))
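The unit test above pins down the expected interface: clustering.KMeans takes k plus a per-feature distance map and returns clusters as lists of input indices. A minimal sketch consistent with that interface, using medoid-style centers so only the per-feature distance functions are needed (illustrative, not the project's implementation):

import random

def feature_distance(dist_fns, x, y):
    # Sum of per-feature distances between two feature vectors.
    return sum(fn(a, b) for fn, a, b in zip(dist_fns, x, y))

def kmeans_indices(points, k, dist_fns, iterations=20, seed=0):
    # Cluster feature vectors into k groups; returns sorted lists of point indices.
    rng = random.Random(seed)
    centers = [points[i] for i in rng.sample(range(len(points)), k)]
    assignments = [0] * len(points)
    for _ in range(iterations):
        # Assign each point to the nearest center.
        for i, p in enumerate(points):
            assignments[i] = min(range(k), key=lambda c: feature_distance(dist_fns, p, centers[c]))
        # Recompute each center as the medoid of its cluster.
        for c in range(k):
            members = [i for i, a in enumerate(assignments) if a == c]
            if not members:
                continue
            centers[c] = points[min(
                members,
                key=lambda m: sum(feature_distance(dist_fns, points[m], points[o]) for o in members))]
    return sorted(sorted(i for i, a in enumerate(assignments) if a == c) for c in range(k))

# Usage mirroring the shape of the test data (plain feature vectors).
data = [[0, 0, 0], [1, 1, 1], [2, 2, 2], [10, 10, 10], [11, 11, 11], [12, 12, 12]]
dist_fns = [lambda a, b: abs(a - b)] * 3
print(kmeans_indices(data, 2, dist_fns))  # [[0, 1, 2], [3, 4, 5]]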