Example no. 1
    def tu_Gussian(self, dataname="None", X=None, TrainData=None, choice=12):

        # test dataset
        if X is None:
            X, y = make_blobs(n_samples=100, n_features=3,
                              centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                              cluster_std=[0.2, 0.1, 0.2, 0.2],
                              random_state=9)
        n = X.shape[1]
        m = X.shape[0]
        if TrainData is None:
            # synthetic validation labels: 98 normal points (0) followed by 2 anomalies (1)
            col = np.zeros((98, 1))
            col2 = np.ones((2, 1))
            TrainData = np.column_stack((X, np.vstack((col, col2))))

        # Gaussian model
        mu1, sigma1 = self.estimateGaussion(X)
        if choice == 1:
            # independent per-feature Gaussians: px_one has shape (m, n)
            px_one = gaussian(sigma1, X, mu1)
            scatter = Scatter("feature")
            if n <= 10:
                for j in range(0, n):
                    scatter.add(str(j), X[:, j], px_one[:, j])
        else:
            # multivariate Gaussian: px_one has shape (m,)
            px_one = self.multivariateGaussian(X, mu1, sigma1)
            scatter = Scatter("feature")

            def f(x):
                y = 1
                for i in range(n):
                    y = y * x[i]
                return y

            scatter.add("joint distribution", list(map(f, X)), px_one)

        # cross-validate to pick the best epsilon
        Xval = TrainData[:, 0:-1]
        Yval = TrainData[:, -1]

        pvals = []  # product of the per-feature probabilities
        if choice == 1:
            pval = gaussian(sigma1, Xval, mu1)
            for i in range(0, Xval.shape[0]):
                pvals.append(reduce(mul, pval[i, :]))
        else:
            pvals = self.multivariateGaussian(Xval, mu1, sigma1)

        epsilon, F1 = self.selectThreshold(Yval, pvals)

        # keep the points whose probability clears the threshold; anomalies fall below it
        p_sample = np.asarray(pvals)
        newdata = X[p_sample >= epsilon]
        save_helper.save_txt_helper(newdata, dataname)
        outliers = np.where(p_sample < epsilon)
        scatter2 = ksh.ksh_scatter("anomaly scatter plot", "normal points", X,
                                   "FG", "anomalies", X[outliers])
        self.page.add(scatter)
        self.page.add(scatter2)

        save_helper.save_tu_helper(self.page, dataname)
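A minimal driver sketch, assuming the method lives on some wrapper class (here the hypothetical `Tu`) and that the snippet's dependencies are imported at module level:

    # hypothetical usage; `Tu` is an assumed host class, not from the source
    from functools import reduce   # reduce() must be imported in Python 3
    from operator import mul
    import numpy as np
    from sklearn.datasets import make_blobs

    t = Tu()
    t.tu_Gussian(dataname="gauss_demo", choice=1)   # per-feature Gaussian branch
    t.tu_Gussian(dataname="gauss_demo2")            # any other choice: multivariate branch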
Example no. 2
    def mst(self, data=None, dataN=None, dataname="None", choice="prim"):
        page = Page()
        max_value = 9999999  # sentinel meaning "no edge" (defined outside the
                             # default branch so caller-supplied data works too)
        if data is None:
            row0 = [0, 7, max_value, max_value, max_value, 5]
            row1 = [7, 0, 9, max_value, 3, max_value]
            row2 = [max_value, 9, 0, 6, max_value, max_value]
            row3 = [max_value, max_value, 6, 0, 8, 10]
            row4 = [max_value, 3, max_value, 8, 0, 4]
            row5 = [5, max_value, max_value, 10, 4, 0]
            data = [row0, row1, row2, row3, row4, row5]
            dataN = ["node 1", "node 2", "node 3", "node 4", "node 5", "node 6"]
        # visualize the raw graph
        link = []
        node = []
        n = len(data)
        m = len(data[0])
        for i in range(n):
            for j in range(m):
                if i == j or data[i][j] == max_value:
                    continue
                link.append({"source": dataN[i], "target": dataN[j]})
            # node size = sum of the weights of its incident edges
            big = sum(x for x in data[i] if x != max_value)
            node.append({"name": dataN[i], "symbolSize": big})

        tu_graph = Tu_Graph("full graph")
        tu_graph.add("", node, link)
        page.add(tu_graph)

        graph = tree.Graph(data)

        if choice == "prim":
            res = graph.prim()
        else:
            res = graph.kruskal()
        print(res)

        # total weight of the spanning tree: each edge is (source, target, weight)
        big = sum(edge[2] for edge in res)

        link2 = []
        for edge in res:
            link2.append({"source": edge[0], "target": edge[1]})
        tu_graph2 = Tu_Graph("minimum spanning tree")
        tu_graph2.add("total weight: " + str(big), node, link2)
        print(link2)
        page.add(tu_graph2)
        sh.save_tu_helper(page, dataname)
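A short usage sketch with a custom adjacency matrix, again assuming the hypothetical `Tu` host class; 9999999 must be used as the no-edge sentinel to match the method:

    # hypothetical usage; `Tu` is an assumed host class
    INF = 9999999  # the sentinel the method treats as "no edge"
    adj = [[0, 2, INF],
           [2, 0, 3],
           [INF, 3, 0]]
    t = Tu()
    t.mst(data=adj, dataN=["A", "B", "C"], dataname="mst_demo", choice="kruskal")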
Example no. 3
    def GuanJianCi(self, data_name="None", num=20, text=None):
        page = Page()
        if text is None:
            # default corpus: a Chinese paragraph about the SimHash algorithm
            # (kept in Chinese because jieba is a Chinese tokenizer)
            text = "SimHash是一种局部敏感hash,它也是Google公司进行海量网页去重使用的主要算法。传统的Hash算法只负责将原始内容尽量均匀随机地映射为一个签名值,原理上仅相当于伪随机数产生算法。传统的hash算法产生的两个签名,如果原始内容在一定概率下是相等的;如果不相等,除了说明原始内容不相等外,不再提供任何信息,因为即使原始内容只相差一个字节,所产生的签名也很可能差别很大。所以传统的Hash是无法在签名的维度上来衡量原内容的相似度,而SimHash本身属于一种局部敏感哈希算法,它产生的hash签名在一定程度上可以表征原内容的相似度。我们主要解决的是文本相似度计算,要比较的是两个文章是否相似,当然我们降维生成了hash签名也用于这个目的。看到这里估计大家就明白了,我们使用的simhash就算把文章中的字符串变成 01 串也还是可以用于计算相似度的,而传统的hash却不行。"

        # (word, weight) pairs of the top keywords
        tags = jieba.analyse.extract_tags(text,
                                          topK=num,
                                          withWeight=True,
                                          withFlag=True)

        name = []
        value = []

        for tag in tags:
            name.append(tag[0])
            value.append(tag[1])
        wordCloud = WordCloud(data_name)
        wordCloud.add("", name, value)

        pie = Pie("top-ten keyword weights", "", title_pos='center')
        style = Style()
        pie_style = style.add(label_pos="center",
                              is_label_show=True,
                              label_text_color=None)

        # the small pies are laid out on a grid: cx is the horizontal center,
        # cy the vertical one, both in percent of the canvas
        cx = 10
        cy = 30
        sum_weight = sum(value)
        for index, (n, v) in enumerate(zip(name, value)):
            if index == 5:
                cx = 10        # start a second row of pies
                cy = cy + 40
            if index < 10:
                pie.add("", [n, ""], [v / sum_weight, 1 - v / sum_weight],
                        center=[cx, cy],
                        radius=[18, 24],
                        **pie_style)
                cx = cx + 20

        page.add(pie)
        page.add(wordCloud)
        save_helper.save_tu_helper(page, data_name)
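Called with no text it falls back to the built-in SimHash paragraph; to analyze your own document, pass any Chinese string. A sketch, with the `Tu` host class assumed:

    # hypothetical usage; `Tu` is an assumed host class
    t = Tu()
    t.GuanJianCi(data_name="keywords_demo", num=20)  # built-in Chinese sample
    t.GuanJianCi(data_name="my_doc", num=30, text=u"这里放待分析的中文文本")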
Example no. 4
    def tu_spca(self, dataname="kong", components_n=1, data=None):

        # test data, generated only when the caller passes nothing
        if data is None:
            data, y = make_blobs(n_samples=10000,
                                 n_features=3,
                                 centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                                 cluster_std=[0.2, 0.1, 0.2, 0.2],
                                 random_state=9)

        message = []
        # fit the sparse PCA model on the actual input data
        spca = SparsePCA(n_components=components_n,
                         normalize_components=True,
                         random_state=0)
        spca.fit(data)
        # save the transformed data
        value = spca.transform(data)
        save_helper.save_txt_helper(value, dataname)

        components = spca.components_
        error = spca.error_
        page2 = Page()
        # plotting
        for j in range(0, components.shape[0]):
            bar1 = Bar("sparse component " + str(j))
            bar1.add("", [
                "components_" + str(i) for i in range(0, components.shape[1])
            ], components[j])
            page2.add(bar1)
        message.append("Only the sparse components and the reconstruction error are provided for analysis.")

        print(error)
        bar2 = Bar("error analysis")
        bar2.add("", ["error" + str(i) for i in range(0, len(error))], error)
        page2.add(bar2)
        save_helper.save_tu_helper(page2, dataname)

        return message
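A usage sketch on caller-supplied data. Note that SparsePCA's `normalize_components` flag only exists in older scikit-learn releases (deprecated in 0.22 and removed around 0.24), so pin accordingly; `Tu` remains an assumed host class:

    # hypothetical usage; requires an older scikit-learn for normalize_components
    import numpy as np
    rng = np.random.RandomState(0)
    t = Tu()
    msgs = t.tu_spca(dataname="spca_demo", components_n=2, data=rng.randn(200, 5))
    print(msgs)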
Example no. 5
    def corr_m(self, dataname, data_place):

        if data_place is not None and os.path.exists(data_place):
            df = pd.read_excel(data_place, header=0, index_col=None)
        else:
            message = "Data not found, or the data format is wrong."
            return message

        col_name = df.columns.values
        corr = df.corr()
        n_cols = corr.shape[1]
        array = np.array(corr)
        # for each column, the index of its most correlated partner; the largest
        # entry is the column itself (1.0), so take the second largest
        rank = array.argsort()
        first_rank = rank[:, -2]

        first_name = []
        for j in range(n_cols):
            first_name.append(col_name[first_rank[j]])

        page = Page()
        bar = Bar("correlation analysis")

        for j in range(n_cols):
            bar.add(col_name[j],
                    col_name,
                    array[j],
                    is_more_utils=True,
                    is_datazoom_show=True,
                    datazoom_type="both",
                    is_datazoom_extra_show=True,
                    datazoom_extra_type="slider")
        page.add(bar)

        save_helper.save_tu_helper(page, dataname)
        message = []
        message.append("If NaN values appear, that column is not numeric; please remove it.")
        return message
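Usage sketch; the Excel path below is a placeholder, and the method returns a message string on failure or a message list on success:

    # hypothetical usage; `Tu` is an assumed host class, the path a placeholder
    t = Tu()
    msgs = t.corr_m("corr_demo", "data/my_table.xlsx")
    print(msgs)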
Example no. 6
    def tu_pca(self,
               dataname="None",
               components_ratio=None,
               components_n=None,
               data=None):
        if components_n is None:
            components_n = 1

        # messages returned to the caller
        message = []
        # 3D chart parameters
        page = Page()
        range_color = [
            '#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
            '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026'
        ]

        # test data, generated only when the caller passes nothing
        if data is None:
            data, y = make_blobs(n_samples=10000,
                                 n_features=3,
                                 centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                                 cluster_std=[0.2, 0.1, 0.2, 0.2],
                                 random_state=9)

        # if the data is 3-dimensional, draw a 3D scatter of it
        if data.shape[1] == 3:
            scatter3D = Scatter3D(dataname)
            scatter3D.add("",
                          data,
                          is_visualmap=True,
                          visual_range_color=range_color)
            page.add(scatter3D)

        # full PCA to obtain every component's variance
        pca_ = PCA(n_components=data.shape[1])
        pca_.fit(data)
        ratio = pca_.explained_variance_ratio_
        variance = pca_.explained_variance_
        attr = ["component " + str(i) for i in range(0, data.shape[1])]
        pie = Pie("PCA components", width=1000, height=600)
        pie.add(
            "ratio",
            attr,
            ratio,
            radius=[50, 55],
            center=[25, 50],
            is_random=True,
        )
        pie.add(
            "explained variance",
            attr,
            variance,
            radius=[0, 45],
            center=[25, 50],
            rosetype="radius",
        )
        page.add(pie)

        if components_ratio is not None:
            # variance-target mode: keep enough components to reach the ratio
            pca = PCA(n_components=components_ratio)
            pca.fit(data)

            value = pca.transform(data)
            save_helper.save_txt_helper(value, dataname)

            # advisory message
            ratio_ = np.sum(pca.explained_variance_ratio_)
            if ratio_ > components_ratio:
                message.append("The chosen ratio may be too small; to make full use of the information, consider adjusting it slightly upward.")
            # plot the ratios of the components that were dropped
            n_selected = pca.n_components_
            bar_data = list(ratio[n_selected:])
            bar = Bar("remaining component ratios")
            bar.add("", [
                "remaining component " + str(i)
                for i in range(n_selected, data.shape[1])
            ], bar_data)
            page.add(bar)

        else:
            # fixed-count mode: keep exactly components_n components
            pca2 = PCA(n_components=components_n)
            pca2.fit(data)
            value = pca2.transform(data)
            save_helper.save_txt_helper(value, dataname + "2")
            ratio_ = np.sum(pca2.explained_variance_ratio_)
            pie_data = [ratio_] + list(ratio[components_n:])
            attr = ["selected"] + [
                "component " + str(i) for i in range(components_n, data.shape[1])
            ]
            pie2 = Pie("selected component ratio")
            pie2.add("",
                     attr,
                     pie_data,
                     radius=[40, 75],
                     label_text_color=None,
                     is_label_show=True,
                     legend_orient="vertical",
                     legend_pos="right")
            page.add(pie2)

        # render
        save_helper.save_tu_helper(page, dataname)
        return message
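The two branches can be exercised like this (hypothetical `Tu` host class): passing `components_ratio` lets scikit-learn pick however many components reach that variance share, while `components_n` fixes the count.

    # hypothetical usage
    t = Tu()
    t.tu_pca(dataname="pca_ratio_demo", components_ratio=0.95)  # variance-target mode
    t.tu_pca(dataname="pca_n_demo", components_n=2)             # fixed-count mode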
Example no. 7
    def china_city(self, dataname, data=None):
        # default sample: (Chinese city name, PM2.5 value) pairs; the names stay
        # in Chinese so pyecharts' Geo can resolve them to coordinates
        if data is None:
            data = [(u"海门", 9), (u"鄂尔多斯", 12), (u"招远", 12), (u"舟山", 12),
                    (u"齐齐哈尔", 14), (u"盐城", 15), (u"赤峰", 16), (u"青岛", 18),
                    (u"乳山", 18), (u"金昌", 19), (u"泉州", 21), (u"莱西", 21),
                    (u"日照", 21), (u"胶南", 22), (u"南通", 23), (u"拉萨", 24),
                    (u"云浮", 24), (u"梅州", 25), (u"文登", 25), (u"上海", 25),
                    (u"攀枝花", 25), (u"威海", 25), (u"承德", 25), (u"厦门", 26),
                    (u"汕尾", 26), (u"潮州", 26), (u"丹东", 27), (u"太仓", 27),
                    (u"曲靖", 27), (u"烟台", 28), (u"福州", 29), (u"瓦房店", 30),
                    (u"即墨", 30), (u"抚顺", 31), (u"玉溪", 31), (u"张家口", 31),
                    (u"阳泉", 31), (u"莱州", 32), (u"湖州", 32), (u"汕头", 32),
                    (u"昆山", 33), (u"宁波", 33), (u"湛江", 33), (u"揭阳", 34),
                    (u"荣成", 34), (u"连云港", 35), (u"葫芦岛", 35), (u"常熟", 36),
                    (u"东莞", 36), (u"河源", 36), (u"淮安", 36), (u"泰州", 36),
                    (u"南宁", 37), (u"营口", 37), (u"惠州", 37), (u"江阴", 37),
                    (u"蓬莱", 37), (u"韶关", 38), (u"嘉峪关", 38), (u"广州", 38),
                    (u"延安", 38), (u"太原", 39), (u"清远", 39), (u"中山", 39),
                    (u"昆明", 39), (u"寿光", 40), (u"盘锦", 40), (u"长治", 41),
                    (u"深圳", 41), (u"珠海", 42), (u"宿迁", 43), (u"咸阳", 43),
                    (u"铜川", 44), (u"平度", 44), (u"佛山", 44), (u"海口", 44),
                    (u"江门", 45), (u"章丘", 45), (u"肇庆", 46), (u"大连", 47),
                    (u"临汾", 47), (u"吴江", 47), (u"石嘴山", 49), (u"沈阳", 50),
                    (u"苏州", 50), (u"茂名", 50), (u"嘉兴", 51), (u"长春", 51),
                    (u"胶州", 52), (u"银川", 52), (u"张家港", 52), (u"三门峡", 53),
                    (u"锦州", 54), (u"南昌", 54), (u"柳州", 54), (u"三亚", 54),
                    (u"自贡", 56), (u"吉林", 56), (u"阳江", 57), (u"泸州", 57),
                    (u"西宁", 57), (u"宜宾", 58), (u"呼和浩特", 58), (u"成都", 58),
                    (u"大同", 58), (u"镇江", 59), (u"桂林", 59), (u"张家界", 59),
                    (u"宜兴", 59), (u"北海", 60), (u"西安", 61), (u"金坛", 62),
                    (u"东营", 62), (u"牡丹江", 63), (u"遵义", 63), (u"绍兴", 63),
                    (u"扬州", 64), (u"常州", 64), (u"潍坊", 65), (u"重庆", 66),
                    (u"台州", 67), (u"南京", 67), (u"滨州", 70), (u"贵阳", 71),
                    (u"无锡", 71), (u"本溪", 71), (u"克拉玛依", 72), (u"渭南", 72),
                    (u"马鞍山", 72), (u"宝鸡", 72), (u"焦作", 75), (u"句容", 75),
                    (u"北京", 79), (u"徐州", 79), (u"衡水", 80), (u"包头", 80),
                    (u"绵阳", 80), (u"乌鲁木齐", 84), (u"枣庄", 84), (u"杭州", 84),
                    (u"淄博", 85), (u"鞍山", 86), (u"溧阳", 86), (u"库尔勒", 86),
                    (u"安阳", 90), (u"开封", 90), (u"济南", 92), (u"德阳", 93),
                    (u"温州", 95), (u"九江", 96), (u"邯郸", 98), (u"临安", 99),
                    (u"兰州", 99), (u"沧州", 100), (u"临沂", 103), (u"南充", 104),
                    (u"天津", 105), (u"富阳", 106), (u"泰安", 112), (u"诸暨", 112),
                    (u"郑州", 113), (u"哈尔滨", 114), (u"聊城", 116), (u"芜湖", 117),
                    (u"唐山", 119), (u"平顶山", 119), (u"邢台", 119), (u"德州", 120),
                    (u"济宁", 120), (u"荆州", 127), (u"宜昌", 130), (u"义乌", 132),
                    (u"丽水", 133), (u"洛阳", 134), (u"秦皇岛", 136), (u"株洲", 143),
                    (u"石家庄", 147), (u"莱芜", 148), (u"常德", 152), (u"保定", 153),
                    (u"湘潭", 154), (u"金华", 157), (u"岳阳", 169), (u"长沙", 175),
                    (u"衢州", 177), (u"廊坊", 193), (u"菏泽", 194), (u"合肥", 229),
                    (u"武汉", 273), (u"大庆", 279)]

        geo = Geo(
            "air quality of major Chinese cities",
            "data from pm2.5",
            title_color="#fff",
            title_pos="center",
            width=1200,
            height=600,
            background_color="#404a59",
        )
        page = Page()
        attr, value = geo.cast(data)

        geo.add("",
                attr,
                value,
                visual_range=[0, 200],
                visual_text_color="#fff",
                symbol_size=10,
                is_visualmap=True)
        page.add(geo)
        save_helper.save_tu_helper(page, dataname)
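A sketch with a small custom dataset; city names must be Chinese strings that pyecharts' Geo can geocode (`Tu` remains an assumed host class):

    # hypothetical usage
    t = Tu()
    t.china_city("aqi_demo")  # built-in PM2.5 sample
    t.china_city("aqi_demo2", data=[(u"北京", 79), (u"上海", 25), (u"广州", 38)])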
Example no. 8
    def tu_kmeans(self, v=None, n_c=4, dataname="None"):

        def label_(ny, labels):
            # append each sample to the array of its assigned cluster
            for num, lab in enumerate(labels):
                ny[lab] = np.vstack((ny[lab], np.asarray(v[num])))
            return ny

        # test data
        if v is None:
            v = np.random.random((190, 2))

        scatter_plane = Scatter("data scatter")

        # kmeans
        kmeans = KMeans(n_clusters=n_c, random_state=9).fit(v)
        y_pred = kmeans.labels_
        scatter_plane.add("center", kmeans.cluster_centers_[:, 0],
                          kmeans.cluster_centers_[:, 1])
        nn = {}
        for i in range(0, n_c):
            nn[i] = np.array([0, 0])  # placeholder row, deleted below
        nn = label_(nn, y_pred)

        bar = Bar("variance analysis")

        td = ["clu" + str(i) for i in range(0, n_c)]
        td.append("mean")
        td.append("sum")

        for i in range(0, n_c):
            nn[i] = np.delete(nn[i], 0, 0)  # drop the placeholder row
            scatter_plane.add("clu" + str(i), nn[i][:, 0], nn[i][:, 1])

        def manhattan_distance(center, points):
            # total L1 distance from a cluster center to all of its points
            return np.sum(np.abs(points - center))

        dis = {}
        for i in range(0, n_c):
            dis[i] = manhattan_distance(kmeans.cluster_centers_[i], nn[i])

        dis_list = [dis[i] for i in dis]
        dis_sum = sum(dis_list)

        radar = Radar("cluster error analysis")

        # dbscan
        dis_db = 0.1
        min_samples = 5
        dbscan = skc.DBSCAN(dis_db, min_samples).fit(v)
        clu_lab = dbscan.labels_
        # -1 marks noise, so it is not counted as a cluster
        n_clusters = len(set(clu_lab)) - (1 if -1 in clu_lab else 0)

        scatter = Scatter("noise analysis")

        for i in range(n_clusters):
            one_clu = v[clu_lab == i]
            scatter.add("scan" + str(i), one_clu[:, 0], one_clu[:, 1])

        noise = v[clu_lab == -1]
        if noise.size > 0:
            scatter.add("noise points", noise[:, 0], noise[:, 1])

        # radar schema: one axis per cluster, all capped at the largest distance;
        # the radar only shows the per-cluster distances, the bar adds mean and sum
        radar.config([("clu" + str(i), max(dis_list)) for i in range(0, n_c)])
        radar.add("bias", [dis_list], is_splitline=True, is_axisline_show=True)
        bar_list = dis_list + [dis_sum / n_c, dis_sum]
        bar.add("", td, bar_list, is_stack=True, label_pos='inside')

        page = Page()
        page.add(scatter_plane)
        page.add(bar)
        page.add(radar)
        page.add(scatter)

        save_helper.save_tu_helper(page, dataname)
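Finally, a driver sketch for the clustering report, again assuming the hypothetical `Tu` host class and the imports the snippet relies on (numpy, `KMeans`, and `sklearn.cluster` aliased as `skc`):

    # hypothetical usage
    import numpy as np
    import sklearn.cluster as skc
    from sklearn.cluster import KMeans

    rng = np.random.RandomState(9)
    t = Tu()
    t.tu_kmeans(v=rng.random_sample((190, 2)), n_c=4, dataname="kmeans_demo")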