Ejemplo n.º 1
0
def doTC_array(txt_dict, minTC=0, topN=None):
    """Reduce feature dimensionality via Term Contribution (TC), array variant.

    Returns the compressed count matrix and the list of retained words.
    """
    # Raw count matrix plus the itc-weighted tfidf matrix over the same corpus.
    count_matrix, _, vocabulary = dict2Array(txt_dict, dtype=int)
    weights = dict2Array(myTFIDF(txt_dict, itc=True))[0]
    # One TC weight per word.
    contributions = myTC_array(weights)
    # Keep only the words passing the TC threshold / top-N cut.
    kept_words = selectFeature(contributions, vocabulary, minTC=minTC, topN=topN)
    # Project the count matrix onto the retained feature set.
    compressed = selectData(count_matrix,
                            kept_words,
                            oldWordName=vocabulary,
                            orderchange=False)
    return compressed, kept_words
Ejemplo n.º 2
0
def test_selectData():
    """Compare the dict- and array-based selectData implementations."""
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess')  # ~0.6s
    txt_array, txtName, wordName = dict2Array(txt_dict)

    # Draw 300 random word indices, kept in ascending order.
    chosen = sorted(random.sample(range(0, len(wordName)), 300))
    subset_words = [wordName[pos] for pos in chosen]
    picked_dict = selectData_dict(txt_dict, subset_words)
    picked_arr = selectData_array(txt_array, subset_words, oldWordName=wordName)
    picked_arr_keep = selectData_array(txt_array,
                                       subset_words,
                                       oldWordName=wordName,
                                       orderchange=False)
    # When the word order changes (orderchange=True),
    # selectData_dict beats selectFeature_array on speed.
    diff = dict2Array(picked_dict)[0] - picked_arr
    print(sum(sum(diff)))
Ejemplo n.º 3
0
def test_myTC():
    """Exercise both TC implementations and inspect the weight distribution."""
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                          'afterProccess')  # ~0.6s
    tfidf_mapping = myTFIDF(corpus, itc=True)
    matrix, doc_names, vocab = dict2Array(tfidf_mapping)
    tc_from_dict = myTC_dict(tfidf_mapping, rtype=list)
    tc_from_array = myTC_array(matrix)
    # myTC_array is faster than myTC_dict.
    # print(sum(tc_from_dict - tc_from_array))
    # Use the dataInfo helpers to look at the TC distribution.
    print(fiveNumber(tc_from_array))
    showDistplot(tc_from_array)
Ejemplo n.º 4
0
def doTC_dict(txt_dict, minTC=0, topN=None):  # faster variant
    """Reduce feature dimensionality via Term Contribution (TC), dict variant."""
    # itc-weighted tfidf matrix over every text/word pair.
    weights, _, vocabulary = dict2Array(myTFIDF(txt_dict, itc=True))
    # One TC weight per word.
    contributions = myTC_array(weights)
    # Choose the surviving words by threshold / top-N.
    survivors = selectFeature(contributions, vocabulary, minTC=minTC, topN=topN)
    # Shrink the original dict down to the surviving feature set.
    return selectData(txt_dict, survivors)
Ejemplo n.º 5
0
def test_pca_sklearn():
    """Run the rank-reduced PCA helper on a slice of the count matrix."""
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                          'afterProccess_test')  # ~0.6s
    counts, doc_names, vocab = dict2Array(corpus, dtype=int)
    sample = counts[:, 1:500]

    # newData_dict = doTC_dict(corpus, topN=1000)
    # sample, doc_names, vocab = dict2Array(newData_dict, dtype=int)
    sample_mat = numpy.mat(sample)

    print(sample_mat.shape)
    kept_features = 50
    lowDDataMat, redEigVects = myPCA_R(sample_mat, topN=kept_features)
Ejemplo n.º 6
0
def test_doTC():
    """Check that doTC_array and doTC_dict agree on both words and data."""
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                          'afterProccess')
    minTC, topN = 0, 10000
    arr_data, arr_words = doTC_array(corpus, minTC, topN)
    dict_data = doTC_dict(corpus, minTC, topN)
    dict_data_arr, doc_names, dict_words = dict2Array(dict_data, dtype=int)
    # doTC_dict is faster than doTC_array.
    for pos in range(len(arr_words)):
        if arr_words[pos] != dict_words[pos]:
            print("%d tcWordName[i]!=tcWordName_dict_array" % pos)
            break
    print(sum(sum(dict_data_arr - arr_data)))
Ejemplo n.º 7
0
def test_pca():
    """Compare three PCA implementations via their reconstruction error."""
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                          'afterProccess_test')  # ~0.6s
    counts, doc_names, vocab = dict2Array(corpus, dtype=int)
    data_mat = numpy.mat(counts[:, :1000], dtype=numpy.float64)
    k = 100
    result_own = myPCA(data_mat, topN=k, onlyNewData=False)
    result_r = myPCA_R(data_mat, topN=k, onlyNewData=False)
    result_sk = pca_sklearn(data_mat, topN=k, onlyNewData=False)
    # Each result is (newData, U, reconstructed); compare the reconstructions
    # against the raw matrix.
    print(getDiff(data_mat, result_own[2]))
    print(getDiff(data_mat, result_r[2]))
    print(getDiff(data_mat, result_sk[2]))
Ejemplo n.º 8
0
def test_selectFeature():
    """Verify that two ways of picking the top-N words by TC weight agree.

    Way 1 sorts (word, tc) pairs; way 2 uses numpy argsort. Both keep the
    topN highest-TC words and sort them; any position where they differ is
    reported.
    """
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess_test')
    tfidf_dict = myTFIDF(txt_dict, itc=True)
    tfidf_array, txtName, wordName = dict2Array(tfidf_dict)
    tc_array = myTC_array(tfidf_array)
    minTC, topN = 0, 100
    tc = tc_array
    # Way 1: sort (word, tc) pairs by tc descending, keep the first topN words.
    wordAndIdx = list(zip(wordName, tc))
    wordAndIdx.sort(key=lambda x: x[1], reverse=True)  # sort by tc
    newWordName = [wordAndIdx[i][0] for i in range(topN)]
    newWordName.sort()

    # Way 2: argsort ascending, take the topN largest indices, restore order.
    idx = tc.argsort()
    idx = idx[:-topN - 1:-1]
    idx.sort()
    newWordName2 = [wordName[i] for i in idx]
    for j in range(len(newWordName2)):
        if newWordName2[j] != newWordName[j]:
            # Fixed message: it previously referred to tcWordName /
            # tcWordName_dict_array, copy-pasted from test_doTC.
            print("%d newWordName2[j]!=newWordName[j]" % j)
            break
Ejemplo n.º 9
0
    # NOTE(review): this indented block appears to be the body of a test
    # function whose `def` line was lost in extraction — it duplicates
    # test_doTC above. Left byte-identical.
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess')
    minTC, topN = 0, 10000
    tcData_array, tcWordName = doTC_array(txt_dict, minTC, topN)
    tcData_dict = doTC_dict(txt_dict, minTC, topN)
    tcData_dict_array, txtName, tcWordName_dict_array = dict2Array(tcData_dict,
                                                                   dtype=int)
    # doTC_dict is faster than doTC_array
    # Report the first position where the two retained-word lists disagree.
    for j in range(len(tcWordName)):
        if tcWordName[j] != tcWordName_dict_array[j]:
            print("%d tcWordName[i]!=tcWordName_dict_array" % j)
            break
    # Total element-wise difference between the two compressed matrices.
    print(sum(sum(tcData_dict_array - tcData_array)))


if __name__ == '__main__':
    # Build the itc-weighted tfidf matrix for the txt2 corpus.
    corpus = getWordCount(
        '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2')
    weights, doc_names, vocab = dict2Array(myTFIDF(corpus, itc=True))
    # Per-word TC weight and its distribution.
    contributions = myTC_array(weights)
    showDistplot(contributions)
    # Plot the weights largest-first.
    contributions.sort()
    contributions = contributions[::-1]
    from matplotlib import pyplot as plt

    plt.plot(range(len(contributions)), contributions)
    plt.ylim(0, 200)
    plt.show()
Ejemplo n.º 10
0
                # NOTE(review): fragment — the enclosing loop and the
                # definitions of data/k/Cent/clusterLabel/iter are not visible
                # in this view; looks like the tail of a k-means-style routine.
                # Append text i to its nearest cluster's member list,
                # creating the list on first use.
                clusterLabel_map[minIndex].append(i)
            else:
                clusterLabel_map[minIndex] = [i]
        # Update centers: recompute each centroid as the mean of its members.
        for i in range(k):
            Cent[i, :] = numpy.mean(data[clusterLabel_map[i], :], axis=0)
    print(iter)
    return clusterLabel


if __name__ == '__main__':

    outDir = '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2'
    txt_dict = getWordCount(outDir)
    tfidf_dict = myTFIDF(txt_dict, itc=False)
    data, textNames, wordName = dict2Array(tfidf_dict)
    # Dimensionality reduction.
    topN = 1200
    data, textNames = PCA(txt_dict, topN=topN, itc=False)[:2]
    # Determine the number of feature dimensions: sweep the variance ratio
    # 0.1..0.9 and print the resulting shape each time.
    for x in [i * 0.1 for i in range(1, 10)]:
        data, textNames = PCA(txt_dict, topN=x, itc=False)[:2]
        print(x, data.shape)
    # Results: 0.1 74 0.2 204 0.3 357 0.4 519 0.5 684 0.6 851 0.7 1022 0.8 1198 0.9 1387
    # [74, 204, 357, 519, 684, 851, 1022, 1198, 1387]
    #
    #
    # # Elbow method to choose k (commented out; continues past this view).
    # kList = range(5, 40, 1)
    # d = []
    # for k in kList:
Ejemplo n.º 11
0
def TC_PCA(txt_dict, minTC=0, topN=None, itc=False):  # ~45s
    """TC feature selection followed by PCA; returns (matrix, text names)."""
    # First prune words by TC, then weight and project the survivors.
    pruned = doTC_dict(txt_dict, minTC=minTC)
    weights, doc_names, vocab = dict2Array(myTFIDF(pruned, itc=itc))
    return pca_sklearn(weights, topN=topN), doc_names
Ejemplo n.º 12
0
def PCA(txt_dict, topN=None, itc=False):  # ~137s
    """PCA over the tfidf matrix; returns (matrix, text names)."""
    weights, doc_names, vocab = dict2Array(myTFIDF(txt_dict, itc=itc))
    return pca_sklearn(weights, topN=topN), doc_names
Ejemplo n.º 13
0
def TC(txt_dict, topN):  # ~7.6s
    """TC feature selection; returns (tfidf matrix, text names, word names)."""
    pruned = doTC_dict(txt_dict, topN=topN)
    weights, doc_names, vocab = dict2Array(myTFIDF(pruned, itc=False))
    return numpy.mat(weights), doc_names, vocab
Ejemplo n.º 14
0
def show_wbpj(y_pre, textNames):
    """Show external evaluation: per-cluster tag composition as bar charts.

    y_pre: predicted cluster label for each text, aligned with textNames.
    textNames: text identifiers used to look up ground-truth tags.
    Draws two matplotlib figures: overlaid bars, then stacked (cumulative) bars.
    """
    # Ground-truth tags per text name.
    y_true_dict = getTags(textNames)
    # Non-category tags — stripped from every text's tag list in place.
    deleteTag = [
        'Other', 'Platform', 'Cryptocurrency', 'Business services',
        'Investment', 'Smart Contract', 'Software', 'Internet',
        'Infrastructure', 'Entertainment'
    ]
    for v in y_true_dict.values():
        for i in deleteTag:
            if i in v:
                v.remove(i)
    # Group the true tag lists by predicted cluster label.
    y_pre2true = {}
    for i in range(len(textNames)):
        if y_pre[i] not in y_pre2true:
            y_pre2true[y_pre[i]] = []
        y_pre2true[y_pre[i]].append(y_true_dict[textNames[i]])
    # Per cluster, weight each tag by 1/len(tags-of-text)/(# non-empty tag
    # lists) so every tagged text contributes equally to its cluster's total.
    y_pre2true_dict = {}
    for k, v in y_pre2true.items():
        y_pre2true_dict[k] = {}
        vv = [i for i in v if i != []]
        for i in v:
            for j in i:
                y_pre2true_dict[k][j] = y_pre2true_dict[k].get(
                    j, 0.0) + 1 / len(i) / len(vv)
    y_pre2true_array, cid, tagList = dict2Array(y_pre2true_dict)
    # Pool of named matplotlib colors to assign one color per tag column.
    colorListAll = [
        'yellowgreen', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige',
        'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown',
        'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral',
        'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue',
        'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkkhaki',
        'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred',
        'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray',
        'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray',
        'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia',
        'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green',
        'greenyellow', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory',
        'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon',
        'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow',
        'lightgreen', 'lightgray', 'lightpink', 'lightsalmon', 'lightseagreen',
        'lightskyblue', 'lightslategray', 'lightsteelblue', 'lightyellow',
        'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine',
        'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen',
        'mediumslateblue', 'mediumspringgreen', 'mediumturquoise',
        'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose',
        'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab',
        'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen',
        'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru',
        'pink', 'plum', 'powderblue', 'purple', 'red', 'rosybrown',
        'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen',
        'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray',
        'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato',
        'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow',
        'aliceblue'
    ]
    # colorList = random.sample(colorListAll, y_pre2true_array.shape[1])
    # Every 4th color, so adjacent tags stay visually distinct.
    colorList = [colorListAll[i * 4] for i in range(y_pre2true_array.shape[1])]
    # Figure 1: overlaid bars, drawn back-to-front so earlier tags end on top.
    for i in range(y_pre2true_array.shape[1] - 1, -1, -1):
        plt.bar(cid,
                +y_pre2true_array[:, i],
                color=colorList[i],
                edgecolor='black',
                alpha=0.9)
    plt.legend(tagList[::-1],
               bbox_to_anchor=(0, 1),
               loc=3,
               ncol=3,
               borderaxespad=0)
    plt.show()
    # Figure 2: cumulative column sums turn the same data into stacked bars.
    y_pre2true_array2 = y_pre2true_array.copy()
    for i in range(1, y_pre2true_array2.shape[1]):
        y_pre2true_array2[:, i] += y_pre2true_array2[:, i - 1]
    for i in range(y_pre2true_array.shape[1] - 1, -1, -1):
        plt.bar(cid,
                +y_pre2true_array2[:, i],
                color=colorList[i],
                edgecolor='black',
                alpha=1)
    plt.legend(tagList[::-1],
               bbox_to_anchor=(0, 1),
               loc=3,
               ncol=3,
               borderaxespad=0)
    plt.show()