Example #1
import numpy as np
from sklearn.neighbors import kde


def condition_HXD(d, a, fenmu):
    '''
    Compute the conditional entropy H(D|X).
    :param d: list / 1-D array -- class labels (same length as a)
    :param a: list / 1-D array -- continuous attribute values
    :param fenmu: 1-D ndarray -- denominator terms (density * n) for each sample
    :return: conditional entropy H(D|X)
    '''
    d1 = np.unique(d)  # unique class labels in d
    # dictionaries keyed by class label
    dD = {}
    lD = dict.fromkeys(d1, 0)  # number of samples in each class
    dexD = {}  # indices of the samples assigned to each class
    # group the values of a by class label
    for i in range(0, len(a)):
        dD.setdefault(d[i], []).append(a[i])  # append the attribute value to its class bucket
        dexD.setdefault(d[i], []).append(i)  # record the sample index
        lD[d[i]] += 1
    atr = np.array(a)  # ndarray
    aa2 = atr.reshape(len(a), 1)  # 2-D ndarray
    # per-class conditional densities, i.e. the numerators of p(d|x), collected in a list
    l = list()
    print("lenlD: ", len(lD))
    for j in lD:
        at = dD[j]  # attribute values belonging to class j
        aa = np.array(at)  # ndarray
        aa2t = aa.reshape(len(aa), 1)  # 2-D ndarray
        h = h1decision(np.array(at))  # bandwidth h for this class (helper defined elsewhere)
        print("at: ", at)
        print("h: ", h)
        # log p(x | d=j) evaluated at every sample in a
        pad = kde.KernelDensity(kernel='gaussian', bandwidth=h).fit(aa2t).score_samples(aa2)
        pad = np.exp(pad)
        tmp = len(at) * pad  # numerator of p(d|x) for class j
        l.append(tmp)  # one numerator array per class label
        print(j, " done")
    print("sum tmp: ", sum(tmp))  # debug: numerators of the last class
    pdx = p_d_a(l, fenmu)  # p(d|x) for every class and sample (helper defined elsewhere)
    # accumulate the conditional entropy
    ree = 0
    for k in range(0, len(pad)):
        nlog_pad = np.array(pad[k])
        npda = np.array(pdx[k])
        ree += npda.dot(nlog_pad)
    # print("reee",ree,type(ree))
    hxd = (-1/len(a)) * ree
    return hxd
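The helpers h1decision and p_d_a are defined elsewhere in that project; for reference, here is a minimal, self-contained sketch of the same idea (H(D|X) estimated with one Gaussian KDE per class). The function name, the fixed bandwidth and the toy data below are illustrative assumptions, not part of the original code.

import numpy as np
from sklearn.neighbors import KernelDensity

def conditional_entropy_kde(a, d, bandwidth=0.5):
    # Estimate H(D|X) for a continuous attribute a and discrete labels d.
    a = np.asarray(a, dtype=float).reshape(-1, 1)
    d = np.asarray(d)
    n = len(a)
    labels = np.unique(d)
    joint = np.empty((len(labels), n))
    for i, lab in enumerate(labels):
        a_lab = a[d == lab]
        kd = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(a_lab)
        # unnormalized joint density p(x, d) = (n_d / n) * p(x | d)
        joint[i] = (len(a_lab) / n) * np.exp(kd.score_samples(a))
    # posterior p(d | x) at every training point, then average the pointwise entropies
    post = joint / joint.sum(axis=0, keepdims=True)
    return float(-(post * np.log(post + 1e-12)).sum(axis=0).mean())

# Toy usage (assumed data): two overlapping Gaussian classes.
rng = np.random.RandomState(0)
x = np.concatenate([rng.normal(0, 1, 50), rng.normal(2, 1, 50)])
y = np.array([0] * 50 + [1] * 50)
print(conditional_entropy_kde(x, y))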
def kde2D(x, y, bandwidth, xbins=100j, ybins=100j, **kwargs):
    # Build a 2-D Gaussian KDE over (x, y) and evaluate it on a regular grid.
    xx, yy = np.mgrid[x.min():x.max():xbins, 
                      y.min():y.max():ybins]

    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train  = np.vstack([y, x]).T

    kde_skl = kde.KernelDensity(bandwidth=bandwidth, kernel="gaussian")
    kde_skl.fit(xy_train)

    z = np.exp(kde_skl.score_samples(xy_sample))
    return xx, yy, np.reshape(z, xx.shape)
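A possible way to call kde2D (the random data and bandwidth here are assumptions): evaluate the 2-D density on the returned grid and plot it as a heat map.

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
x = rng.normal(0, 1, 200)   # assumed sample data
y = rng.normal(0, 2, 200)

xx, yy, zz = kde2D(x, y, bandwidth=0.5)        # 100 x 100 density grid
plt.pcolormesh(xx, yy, zz, shading='auto')     # density as a heat map
plt.scatter(x, y, s=2, c='white')              # overlay the raw samples
plt.colorbar(label='estimated density')
plt.show()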
def kernel_gaussian(x_train, y_train):
    # split the training data into the three classes
    index1 = []
    index2 = []
    index3 = []
    for each in np.arange(0, len(y_train)):
        if y_train[each] == 1:
            index1.append(each)
        if y_train[each] == 2:
            index2.append(each)
        if y_train[each] == 3:
            index3.append(each)
    data1_train = x_train[index1, :]
    data2_train = x_train[index2, :]
    data3_train = x_train[index3, :]
    # fit one Gaussian KDE per class
    clf1 = kde.KernelDensity(kernel='gaussian',
                             bandwidth=0.4).fit(data1_train)
    clf2 = kde.KernelDensity(kernel='gaussian',
                             bandwidth=0.4).fit(data2_train)
    clf3 = kde.KernelDensity(kernel='gaussian',
                             bandwidth=0.4).fit(data3_train)
    return clf1, clf2, clf3
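One way to use the three fitted models for classification (a sketch; predict_classes and the test split are assumptions): score each test sample under every class density and pick the class with the highest log density.

import numpy as np

def predict_classes(clf1, clf2, clf3, x_test):
    # rows: log density under class 1, 2 and 3; arg-max row index + 1 gives the label
    log_dens = np.vstack([clf1.score_samples(x_test),
                          clf2.score_samples(x_test),
                          clf3.score_samples(x_test)])
    return np.argmax(log_dens, axis=0) + 1

# assumed usage:
# clf1, clf2, clf3 = kernel_gaussian(x_train, y_train)
# y_pred = predict_classes(clf1, clf2, clf3, x_test)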
Example #4
def ClusterOne(path_In_Source, path_In_Text, path_In_Product,
               path_In_AllPicture, path_Out_Product, path_Out_Text):
    TotalData = ReadData(path_In_Source, path_In_Text, path_In_Product,
                         path_In_AllPicture)
    data = TotalData[0]
    output = []
    outputP = []
    outputT = []

    path = path_Out_Product

    # Product position clustering
    ProductPosition = []
    for ccData in data:
        ZPData = []
        for each in ccData:
            ZPData.append([(each[2][0] + each[2][1]) / 2,
                           (each[2][2] + each[2][3]) / 2])
        pp = kde.KernelDensity(kernel='gaussian', bandwidth=0.3).fit(ZPData)
        inputData = []
        iData = []
        for a1 in np.linspace(0, 1, 50):
            temp = []
            for a2 in np.linspace(0, 1, 50):
                inputData.append([a1, a2])
                temp.append(a2)
            iData.append(temp)
        iData = np.array(iData)
        outputData = pp.score_samples(inputData)
        count = 0
        tenData = []
        TTData = []
        for each in outputData:
            tenData.append(pow(np.e, each))
            count += 1
            if count == 50:
                TTData.append(tenData)
                tenData = []
                count = 0
        TTData = np.array(TTData)
        yData = iData.T
        xData = iData
        # decode the flat arg-max index of the 50x50 grid back into grid coordinates
        position = np.argmax(TTData)
        y = position % 50 - 1
        x = int((position - (y + 1)) / 50)
        ProductPosition.append([yData[x][y], xData[x][y]])

    # Product width/height clustering
    ProductAll = []
    f2 = open(path, 'w')
    kP = 0
    for ccData in data:
        ppData = []
        for each in ccData:
            ppData.append([each[2][1] - each[2][0], each[2][3] - each[2][2]])
        pp = kde.KernelDensity(kernel='gaussian', bandwidth=0.5).fit(ppData)
        inputData = []
        for a1 in np.linspace(0, 1, 100):
            for a2 in np.linspace(0, 1, 100):
                inputData.append([a1, a2])
        outputData = pp.score_samples(inputData)
        density = []
        for each in outputData:
            density.append(pow(np.e, each))
        index = density.index(max(density))
        pw = np.linspace(0, 1, 100)[index % 100]   # column value of the densest grid cell
        pl = np.linspace(0, 1, 100)[index // 100]  # row value (equivalent to the original arithmetic)
        ProductAll.append(
            [ProductPosition[kP][0], ProductPosition[kP][1], pl, pw])
        f2.write('\n' + str(ProductPosition[kP][0]) + ' ' +
                 str(ProductPosition[kP][1]) + ' ' + str(pl) + ' ' + str(pw))
        outputP.append(
            [ProductPosition[kP][0], ProductPosition[kP][1], pl, pw])
        kP += 1
    f2.close()

    # Image filtering, pass 1
    NewData = []
    th = 0.8
    cNum = 0
    for ccData in data:
        cSave = ccData[0]
        yS1 = ProductAll[cNum][0] - ProductAll[cNum][2] * 0.5
        yS2 = ProductAll[cNum][0] + ProductAll[cNum][2] * 0.5
        xS1 = ProductAll[cNum][1] - ProductAll[cNum][3] * 0.5
        xS2 = ProductAll[cNum][1] + ProductAll[cNum][3] * 0.5
        NewCCData = []
        for each in ccData:
            pData = each[2]
            pY = (pData[0] + pData[1]) / 2
            pX = (pData[2] + pData[3]) / 2
            OArea = OverlapArea([yS1, yS2, xS1, xS2], pData)
            SArea = (yS2 - yS1) * (xS2 - xS1)
            PArea = (pData[1] - pData[0]) * (pData[3] - pData[2])
            if OArea / SArea >= th and OArea / PArea >= th:
                NewCCData.append(each)
            pData1 = cSave[2]
            OArea1 = OverlapArea([yS1, yS2, xS1, xS2], pData1)
            SArea1 = (yS2 - yS1) * (xS2 - xS1)
            PArea1 = (pData1[1] - pData1[0]) * (pData1[3] - pData1[2])
            if OArea / SArea + OArea / PArea > OArea1 / SArea1 + OArea1 / PArea1:
                cSave = each
        if len(NewCCData) == 0:
            NewCCData.append(cSave)
        NewData.append(NewCCData)
        cNum += 1

    # Text position clustering
    TextPosition = []
    k = 0
    for ttData in NewData:
        TRPData = []
        for each in ttData:
            TRPData.append([
                (each[1][0] + each[1][1]) / 2 - ProductPosition[k][0],
                (each[1][2] + each[1][3]) / 2 - ProductPosition[k][1]
            ])
        k += 1
        pp = kde.KernelDensity(kernel='gaussian', bandwidth=0.5).fit(TRPData)
        inputData = []
        for a1 in np.linspace(-1, 1, 100):
            for a2 in np.linspace(-1, 1, 100):
                inputData.append([a1, a2])
        outputData = pp.score_samples(inputData)
        density = []
        for each in outputData:
            density.append(pow(np.e, each))
        index = density.index(max(density))
        xR = np.linspace(-1, 1, 100)[index % 100]   # column value of the densest grid cell
        yR = np.linspace(-1, 1, 100)[index // 100]  # row value (equivalent to the original arithmetic)
        TextPosition.append([yR, xR])

    # Text size clustering
    path = path_Out_Text
    f3 = open(path, 'w')
    kT = 0
    for ttData in NewData:
        tpData = []
        for each in ttData:
            tpData.append([each[1][1] - each[1][0], each[1][3] - each[1][2]])
        pp = kde.KernelDensity(kernel='gaussian', bandwidth=0.5).fit(tpData)
        inputData = []
        for a1 in np.linspace(0, 1, 100):
            for a2 in np.linspace(0, 1, 100):
                inputData.append([a1, a2])
        outputData = pp.score_samples(inputData)
        density = []
        for each in outputData:
            density.append(pow(np.e, each))
        index = density.index(max(density))
        tw = np.linspace(0, 1, 100)[index % 100]   # column value of the densest grid cell
        tl = np.linspace(0, 1, 100)[index // 100]  # row value (equivalent to the original arithmetic)
        f3.write('\n' + str(TextPosition[kT][0]) + ' ' +
                 str(TextPosition[kT][1]) + ' ' + str(tl) + ' ' + str(tw))
        outputT.append([TextPosition[kT][0], TextPosition[kT][1], tl, tw])
        kT += 1
    f3.close()
    output.append(outputP)
    output.append(outputT)
    return output
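Each clustering step above repeats the same pattern: fit a 2-D Gaussian KDE to a list of [y, x] pairs, score it on a regular grid, and take the densest grid cell as the cluster centre. A stand-alone sketch of that step (the function name, grid range and bandwidth are assumptions):

import numpy as np
from sklearn.neighbors import KernelDensity

def kde_mode_2d(points, bandwidth=0.3, grid_size=100, lo=0.0, hi=1.0):
    # points: iterable of [y, x] pairs; returns the (y, x) grid cell with the highest density
    axis = np.linspace(lo, hi, grid_size)
    gy, gx = np.meshgrid(axis, axis, indexing='ij')
    grid = np.column_stack([gy.ravel(), gx.ravel()])
    # arg-max of the log density equals arg-max of the density, so no exp() is needed
    dens = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(points).score_samples(grid)
    row, col = np.unravel_index(np.argmax(dens), (grid_size, grid_size))
    return axis[row], axis[col]

# e.g. kde_mode_2d(ZPData, bandwidth=0.3, grid_size=50) mirrors the product-position step.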
def estimate_pdf_kde(samples):
    # Fit a Gaussian KDE with a fixed bandwidth and return the fitted estimator.
    kde_estimator = kde.KernelDensity(kernel='gaussian', bandwidth=0.2)
    kde_estimator.fit(samples)
    return kde_estimator
def cal_prob_smooth(vec, sample):
    # Smoothed probability density of a single vector under a KDE fitted to sample.
    model = kde.KernelDensity(kernel='gaussian', bandwidth=0.2).fit(sample)
    prob = np.exp(model.score_samples(vec.reshape(1, -1)))
    return prob
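A possible usage sketch for the two helpers above (the sample matrix is an assumption): fit the estimator once with estimate_pdf_kde, or query a single vector through cal_prob_smooth.

import numpy as np

rng = np.random.RandomState(0)
sample = rng.normal(0, 1, size=(200, 3))    # assumed 3-dimensional training sample

estimator = estimate_pdf_kde(sample)
print(np.exp(estimator.score_samples(np.zeros((1, 3)))))   # density near the origin

print(cal_prob_smooth(np.zeros(3), sample))                 # same query via cal_prob_smooth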
Example #8
    X1 = np.delete(X1, [
        10 * t, 10 * t + 1, 10 * t + 2, 10 * t + 3, 10 * t + 4, 10 * t + 5,
        10 * t + 6, 10 * t + 7, 10 * t + 8, 10 * t + 9
    ], 0)
    X2 = np.delete(X2, [
        10 * t, 10 * t + 1, 10 * t + 2, 10 * t + 3, 10 * t + 4, 10 * t + 5,
        10 * t + 6, 10 * t + 7, 10 * t + 8, 10 * t + 9
    ], 0)
    X3 = np.delete(X3, [
        10 * t, 10 * t + 1, 10 * t + 2, 10 * t + 3, 10 * t + 4, 10 * t + 5,
        10 * t + 6, 10 * t + 7, 10 * t + 8, 10 * t + 9
    ], 0)

    test1 = A[t * 10:t * 10 + 10, 0:4].copy()
    test2 = A[50 + t * 10:60 + t * 10, 0:4].copy()
    test3 = A[100 + t * 10:110 + t * 10, 0:4].copy()

    pattern1 = kde.KernelDensity(kernel="gaussian", bandwidth=h).fit(X1)
    pattern2 = kde.KernelDensity(kernel="gaussian", bandwidth=h).fit(X2)
    pattern3 = kde.KernelDensity(kernel="gaussian", bandwidth=h).fit(X3)

    log_dens11 = pattern1.score_samples(test1)
    log_dens12 = pattern2.score_samples(test1)
    log_dens13 = pattern3.score_samples(test1)

    log_dens21 = pattern1.score_samples(test2)
    log_dens22 = pattern2.score_samples(test2)
    log_dens23 = pattern3.score_samples(test2)

    log_dens31 = pattern1.score_samples(test3)
    log_dens32 = pattern2.score_samples(test3)
    log_dens33 = pattern3.score_samples(test3)
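Assuming the rows of A are grouped by class as the slicing above suggests, the nine log-density arrays can be turned into predictions for the held-out blocks by taking, for each test row, the class whose model assigns the highest log density. A sketch for the first block:

import numpy as np

# columns: log density of each test1 row under pattern1, pattern2, pattern3
scores1 = np.column_stack([log_dens11, log_dens12, log_dens13])
pred1 = np.argmax(scores1, axis=1) + 1      # predicted labels (1-3)
accuracy1 = np.mean(pred1 == 1)             # test1 is assumed to hold class-1 rows
print(pred1, accuracy1)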
Example #9
from sklearn.neighbors import kde
import numpy as np
import matplotlib.pyplot as plt

# Each assignment below overwrites the previous X; only the last (bimodal) sample is used.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
X = np.random.normal(0, 3, 30)[:, np.newaxis]
X = np.concatenate((np.random.normal(0, 1, 30),
                    np.random.normal(5, 1, 70)))[:, np.newaxis]

kde = kde.KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)  # rebinds the name kde from the module to the fitted model

log_dens = kde.score_samples(X)
print(log_dens)
print(np.exp(kde.score_samples(X)))


score = kde.score([[5]])  # total log-likelihood of the point x = 5 under the model
print(score)
print(np.exp(score))



fig, ax = plt.subplots()
ax.plot(X[:, 0], np.exp(log_dens), '*', label="kernel = '{0}'".format('gaussian'))


ax.set_xlim(-4, 10)
ax.set_ylim(-0.02, 1)

plt.show()
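For a smoother curve than plotting the density only at the training points, the fitted model can be evaluated on a dense grid (a small sketch reusing the kde object fitted above):

x_grid = np.linspace(-4, 10, 500)[:, np.newaxis]
dens_grid = np.exp(kde.score_samples(x_grid))   # density on an evenly spaced grid

fig, ax = plt.subplots()
ax.plot(x_grid[:, 0], dens_grid, '-', label='KDE on a dense grid')
ax.plot(X[:, 0], np.exp(log_dens), '*', label='density at the samples')
ax.legend()
ax.set_xlim(-4, 10)
plt.show()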