Example no. 1
    def train(self):
        # training
        # training can be unsupervised or supervised
        n_classes = 3
        _model = GaussianMixture(n_components=n_classes,
                                 covariance_type='full',
                                 random_state=0,
                                 max_iter=20)
        if self.rgb_or_gray == "rgb":
            _model.means_init = np.array([[0, 0, 0], [120, 100, 80],
                                          [225, 225, 225]])  # initialize from labels
        else:
            _model.means_init = np.array([[0], [100], [255]])  # initialize from labels
        _model.fit(self.img_arr_1d)
        return _model
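
The idiom above (seed means_init with rough per-class centers, then let EM refine them) can be tried in isolation. A minimal, self-contained sketch on synthetic grayscale pixel data (all values here are illustrative, not from the source):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
# synthetic 1-D "pixels" clustered around dark, mid, and bright intensities
pixels = np.concatenate([rng.normal(10, 5, 500),
                         rng.normal(100, 10, 500),
                         rng.normal(240, 5, 500)]).reshape(-1, 1)

model = GaussianMixture(n_components=3, covariance_type='full',
                        random_state=0, max_iter=20)
model.means_init = np.array([[0], [100], [255]])  # rough per-class centers
model.fit(pixels)
labels = model.predict(pixels)  # component index per pixel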
Example no. 2
def train_gmm(path):
    check_path(path)
    trainDict = pickle.load(open(os.path.join(path, 'train.dict'), 'rb'))
    validDict = pickle.load(open(os.path.join(path, 'valid.dict'), 'rb'))
    testDict = pickle.load(open(os.path.join(path, 'test.dict'), 'rb'))

    silentData = np.concatenate(
        (np.asarray(trainDict['silent']), np.asarray(validDict['silent'])),
        axis=0)
    silentData = np.concatenate((silentData, np.asarray(testDict['silent'])),
                                axis=0)

    voiceData = np.concatenate(
        (np.asarray(trainDict['voice']), np.asarray(validDict['voice'])),
        axis=0)
    voiceData = np.concatenate((voiceData, np.asarray(testDict['voice'])),
                               axis=0)

    estimator = GaussianMixture(n_components=2,
                                covariance_type='diag',
                                max_iter=100,
                                random_state=11)
    meanInit = np.zeros((2, dims))
    meanInit[0] = silentData.mean(axis=0)
    meanInit[1] = voiceData.mean(axis=0)
    estimator.means_init = meanInit
    estimator.fit(np.concatenate((silentData, voiceData), axis=0))

    pickle.dump(estimator,
                open(os.path.join(modelPath, 'gmm1.model'), 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)
Example no. 3
def train(train_list, novocal_clf, vocal_clf):
    print("Extracting Train Features")
    train_features = np.empty((n_features, 0))
    for i in range(len(train_list)):
        print(train_list[i])
        y, sr = librosa.load(train_list[i], sr=fs)
        y = librosa.effects.hpss(y)[0]  #Perform HPSS
        y = bandpass_filter(y, sr, low, high, 2)  #Bandpass
        train_features = np.concatenate(
            (train_features,
             feature_extractor(y, fs, frame_length, hop_length, n_mfcc)),
            axis=1)

    # Transpose feature matrix
    train_features = train_features.T
    # Build a Tri-gaussian model from the extracted features
    clf = GaussianMixture(n_components=n_components_vocals +
                          n_components_novocals,
                          covariance_type=covariance_type,
                          max_iter=max_iter)
    # Initialize model with bootstrap models
    clf.means_init = np.concatenate((novocal_clf.means_, vocal_clf.means_))
    # Expectation-Maximization
    print "EM Estimations of parameters"
    clf.fit(train_features)
    return (clf, n_components_novocals, n_components_vocals)
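
Note that train() assumes novocal_clf and vocal_clf are GMMs already fitted on class-specific features, so their means_ can be concatenated to seed the combined model. A hedged sketch of how such bootstrap models might be built (function and variable names here are assumptions, not from the source):

from sklearn.mixture import GaussianMixture

def fit_bootstrap_gmm(class_features, n_components):
    # class_features: (n_frames, n_mfcc) feature matrix for a single class
    clf = GaussianMixture(n_components=n_components,
                          covariance_type='full',
                          max_iter=100,
                          random_state=0)
    return clf.fit(class_features)

# novocal_clf = fit_bootstrap_gmm(novocal_features, n_components_novocals)
# vocal_clf = fit_bootstrap_gmm(vocal_features, n_components_vocals)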
Example no. 4
def gmm_dbscan(minPts=5, e=1300):
    """
    Compare the GMM algorithm with DBSCAN.
    :param minPts: optimal value found in q3 (defaults to the optimum)
    :param e: optimal radius found in q3 (defaults to the optimum)
    :return: None
    """
    datas_set, datas_matrix = get_data()
    datas_matrix_T = datas_matrix.T

    X = datas_matrix_T
    res_vipno, random_vipno = lsh(0.01, "cosine")

    db = DBSCAN(eps=e, min_samples=minPts).fit(X)
    y_train = db.labels_
    n_cluster = len(set(y_train)) - (1 if -1 in y_train else 0)

    accs = []
    types = ['full', 'tied', 'diag', 'spherical']
    # compare the accuracy of the four covariance types
    for cov_type in types:
        estimator = GaussianMixture(n_components=n_cluster,
                                    covariance_type=cov_type)

        # Treat the DBSCAN labels as ground truth, so some GMM parameters can be pre-set
        estimator.means_init = np.array(
            [X[y_train == i].mean(axis=0) for i in range(n_cluster)])
        estimator.fit(X)

        y_train_pred = estimator.predict(X)

        train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
        print(y_train)
        print(y_train_pred)

        print("Comparing with DBScan, the accuracy of GMM is:", train_accuracy,
              "%")
        accs.append(train_accuracy)
        res = 0
        # pos: the GMM cluster of the random vipno fed to q1
        pos = y_train_pred[datas_set.columns.get_loc(random_vipno)]
        # for each kNN neighbor returned by q1, compare its GMM cluster with pos
        for i in res_vipno:
            if y_train_pred[datas_set.columns.get_loc(i)] == pos:
                res += 1

        print("For k =", len(res_vipno), "There are", res,
              "in the same cluster as GMM predicted")
    # plot the accuracy of the four covariance types
    plt.bar(types,
            accs,
            alpha=0.9,
            width=0.35,
            facecolor='lightskyblue',
            edgecolor='white',
            label='acc',
            lw=1)
    plt.title("four covariances` acc")
    plt.legend(loc="upper left")
    plt.show()
Example no. 5
    def make_ellipses(self, ax):
        gmm = GaussianMixture(n_components=self.k,
                              covariance_type="full",
                              max_iter=500,
                              random_state=0)
        gmm.means_init = self.kmeans.cluster_centers_
        gmm.fit(self.data)

        for n in range(self.k):
            color = colors[n]
            if gmm.covariance_type == 'full':
                covariances = gmm.covariances_[n][:2, :2]
            elif gmm.covariance_type == 'tied':
                covariances = gmm.covariances_[:2, :2]
            elif gmm.covariance_type == 'diag':
                covariances = np.diag(gmm.covariances_[n][:2])
            elif gmm.covariance_type == 'spherical':
                covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n]
            v, w = np.linalg.eigh(covariances)
            u = w[0] / np.linalg.norm(w[0])
            angle = np.arctan2(u[1], u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            v = 2. * np.sqrt(2.) * np.sqrt(v)
            ell = mpl.patches.Ellipse(gmm.means_[n, :2],
                                      v[0],
                                      v[1],
                                      angle=180 + angle,
                                      color=color)
            ell.set_clip_box(ax.bbox)
            ell.set_alpha(0.5)
            ax.add_artist(ell)
            ax.set_aspect('equal', 'datalim')
Example no. 6
def train_gmm(estimator, feaPath, fileList):
    check_path(feaPath)
    data = []
    for files in fileList:
        check_file(files)
        lines = open(files, 'r').readlines()  # text mode, so split('\n') works on str
        for items in lines:
            items = items.split('\n')[0].split('\t')
            audioClass, audioName = os.path.split(items[0])
            audioName, _ = os.path.splitext(audioName)
            audioID = int(items[1])

            check_file(os.path.join(feaPath, audioName+'.fea'))
            tmpdata = pickle.load(open(os.path.join(feaPath, audioName+'.fea'), 'rb'))
            assert tmpdata.shape[0] == dims
            for i in range(tmpdata.shape[1]):
                data.append(tmpdata[:, i])
    data = np.asarray(data)

    gmm = GaussianMixture(n_components=2,
                          covariance_type='diag',
                          max_iter=100,
                          random_state=0)
    gmm.means_init = estimator.means_
    gmm.fit(data)

    pickle.dump(gmm, open(os.path.join(modelPath, 'gmm_esc50.model'), 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)
    return gmm
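
For completeness, a minimal sketch of loading the pickled model back for inference (assuming the same modelPath used above):

import os
import pickle

with open(os.path.join(modelPath, 'gmm_esc50.model'), 'rb') as f:
    gmm = pickle.load(f)
# labels = gmm.predict(features)  # features: (n_samples, dims) array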
Example no. 7
def train_gmm(files):
    print('Load train data & train gmm model')
    stime = time.time()
    check_file(files)
    trainDict = pickle.load(open(files, 'rb'))
    silentData = np.asarray(trainDict['silent'])
    voiceData = np.asarray(trainDict['voice'])
    trainData = np.concatenate((silentData, voiceData), axis=0)

    meanInit = np.zeros((2, dims))
    meanInit[0] = silentData.mean(axis=0)
    meanInit[1] = voiceData.mean(axis=0)

    estimator = GaussianMixture(n_components=2,
                                covariance_type='diag',
                                max_iter=100,
                                random_state=0)
    estimator.means_init = meanInit
    estimator.fit(trainData)

    pickle.dump(trainData.mean(),
                open(os.path.join(waitPath, 'train.mean'), 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(trainData.std(),
                open(os.path.join(waitPath, 'train.std'), 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(estimator,
                open(os.path.join(modelPath, 'gmm.model'), 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)
    print('Finished training & saved gmm model in:\n   {:s}\nTime used: {:f}\n'.format(
        os.path.join(modelPath, 'gmm.model'),
        time.time() - stime))
Example no. 8
def Lap_update(good_samples, n_comp=40, cov_type='full'):
    # returns a generator function that generates samples from a Laplace approximation of points in good_samples


    print('Fitting mixture of Gaussians ... ')
    n, dim = good_samples.shape

    if n < n_comp:
        n_comp = n

    estimator = GaussianMixture(n_components=n_comp,
                                covariance_type=cov_type,
                                max_iter=2500,
                                random_state=0)
    estimator.means_init = [np.random.random_sample(dim)
                            for i in range(n_comp)]
    estimator.fit(good_samples)
    print('Done!')

    def gen_lap(batch_size):
        while True:
            yield estimator.sample(batch_size)[0]

    return gen_lap

# good_samples = np.ones([500, 2])
# g_fun = Lap_update(good_samples)
# gen = g_fun(10)
# samp = next(gen)
# print(samp)
Example no. 9
def gmm_kmeans(n_cluster=2):
    """
    Compare GMM with KMeans.
    :param n_cluster: optimal value found in q2 (defaults to the optimum)
    :return: None
    """
    # load the data
    datas_set, datas_matrix = get_data()
    datas_matrix_T = datas_matrix.T
    X = datas_matrix_T
    res_vipno, random_vipno = lsh(0.01, "cosine")

    # fit KMeans on the data
    clusterer = KMeans(n_clusters=n_cluster)
    y_train = clusterer.fit_predict(X)

    accs = []
    types = ['full', 'tied', 'diag', 'spherical']
    # compare the accuracy of the four covariance types
    for cov_type in types:
        estimator = GaussianMixture(n_components=n_cluster,
                                    covariance_type=cov_type)

        # Treat the KMeans labels as ground truth, so some GMM parameters can be pre-set
        estimator.means_init = np.array(
            [X[y_train == i].mean(axis=0) for i in range(n_cluster)])
        estimator.fit(X)

        y_train_pred = estimator.predict(X)
        train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
        print(y_train)
        print(y_train_pred)

        print("Comparing with KMeans, the accuracy of GMM is:", train_accuracy,
              "%")
        accs.append(train_accuracy)
        res = 0
        # pos: the GMM cluster of the random vipno fed to q1
        pos = y_train_pred[datas_set.columns.get_loc(random_vipno)]
        # for each kNN neighbor returned by q1, compare its GMM cluster with pos
        for i in res_vipno:
            if y_train_pred[datas_set.columns.get_loc(i)] == pos:
                res += 1

        print("For k =", len(res_vipno), "There are", res,
              "in the same cluster as gmm predicted")

    # plot the accuracy of the four covariance types
    plt.bar(types,
            accs,
            alpha=0.9,
            width=0.35,
            facecolor='lightskyblue',
            edgecolor='white',
            label='acc',
            lw=1)
    plt.title("four covariances` acc")
    plt.legend(loc="upper left")
    plt.show()
Example no. 10
def test_check_means():
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)

    n_components, n_features = rand_data.n_components, rand_data.n_features
    X = rand_data.X['full']

    g = GaussianMixture(n_components=n_components)

    # Check means bad shape
    means_bad_shape = rng.rand(n_components + 1, n_features)
    g.means_init = means_bad_shape
    assert_raise_message(ValueError,
                         "The parameter 'means' should have the shape of ",
                         g.fit, X)

    # Check good means matrix
    means = rand_data.means
    g.means_init = means
    g.fit(X)
    assert_array_equal(means, g.means_init)
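
The test above pins down the contract: means_init must have shape (n_components, n_features), otherwise fit raises a ValueError. A minimal illustration:

import numpy as np
from sklearn.mixture import GaussianMixture

X = np.random.RandomState(0).rand(100, 2)
g = GaussianMixture(n_components=3)
g.means_init = np.zeros((3, 2))    # OK: (n_components, n_features)
g.fit(X)
# g.means_init = np.zeros((4, 2))  # wrong shape -> ValueError at fit time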
Example no. 11
def train(_x_data):
    # training
    # training can be unsupervised or supervised
    n_classes = 3
    _model = GaussianMixture(n_components=n_classes,
                             covariance_type='full',
                             random_state=0,
                             max_iter=20)
    _model.means_init = np.array([[0, 0, 0], [120, 100, 80],
                                  [225, 225, 225]])  # initialize from labels
    _model.fit(_x_data)
    return _model
Example no. 12
def cross_validate_gmm(min_count, feature_type):
    """Summary

    Args:
        min_count (TYPE): Description
        feature_type (TYPE): Description
    """
    logger.info('train GaussianMixture model, min_count=%s, extract_method=%s',
                min_count, feature_type)

    X_train, y_train, X_test, y_test = compute_train_test_matrix(
        min_count, feature_type)

    cov_types = ['full', 'tied', 'diag', 'spherical']
    n_components = 3
    for cov_type in cov_types:
        logger.info('cov_type: %s', str.upper(cov_type))

        clf = GaussianMixture(n_components=n_components,
                              covariance_type=cov_type,
                              max_iter=100,
                              random_state=42,
                              verbose=2)

        # Since we have class labels for the training data, we can
        # initialize the GMM parameters in a supervised manner.
        clf.means_init = np.array(
            [X_train[y_train == i].mean(axis=0) for i in range(n_components)])

        # Train the other parameters using the EM algorithm.
        clf.fit(X_train)

        logger.info('TRAIN RESULT:')
        y_train_pred = clf.predict(X_train)
        accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
        logger.info('Accuracy: %.1f', accuracy)
        logger.info('Confusion_matrix: \n%s',
                    confusion_matrix(y_train, y_train_pred))

        logger.info('TEST RESULT:')
        y_test_pred = clf.predict(X_test)
        logger.info('Accuracy: %.1f',
                    accuracy_score(y_test, y_test_pred) * 100)

        logger.info('Classification report: \n%s',
                    classification_report(y_test, y_test_pred))

        logger.info('Confusion_matrix: \n%s',
                    confusion_matrix(y_test, y_test_pred))
Example no. 13
    def trainGMM(self, nClasses=2, covType='spherical', maxIts=20):
        model = GaussianMixture(n_components=nClasses,
                                covariance_type=covType,
                                max_iter=maxIts)
        model.means_init = np.array([
            self.train[self.trainTgt == i].mean(axis=0)
            for i in range(nClasses)
        ])
        model.fit(self.train, self.trainTgt)  # y is ignored by GaussianMixture.fit

        trainOut = model.predict(self.train)
        trainAcc = np.mean(self.trainTgt.ravel() == trainOut.ravel()) * 100
        print("Training Accuracy: ", trainAcc)

        return model
Example no. 14
def get_predictions_semi(path, k_min, k_max, num_class, cov_type, seed,
                         labels):
    targets = []
    kmer_table = get_kmer_table(path, k_min, k_max)
    finalDf = pd.concat([kmer_table, pd.Series(labels, name='Labels')], axis=1)
    gmm = GMM(n_components=num_class,
              covariance_type=cov_type,
              random_state=seed)
    for i in range(num_class):
        if (i in list(finalDf.Labels)):
            targets.append(i)
    if (len(targets) == num_class):
        gmm.means_init = np.array(
            [kmer_table[finalDf.Labels == i].mean(axis=0) for i in targets])
    gmm.fit(kmer_table)
    predictions = gmm.predict(kmer_table)
    return predictions
Example no. 15
def gmm_dist(rad, beam, stm, etm, data_dict):
    gate = data_dict['gate']
    vel = np.abs(np.asarray(data_dict['velocity']))  # absolute velocity
    wid = data_dict['width']
    power = data_dict['power']
    gsflg = data_dict['gsflg']

    # fig1 = plt.figure(1,figsize=(12,12))
    #fig1.suptitle(stm.strftime("%d %b %Y")+ ' to ' + etm.strftime("%d %b %Y"), fontsize=16)

    # plt.subplot(221)
    # plt.scatter(vel, gate,c=gsflg)
    # plt.xlabel('Velocity [m/s]')
    # plt.ylabel('Range gate')

    # plt.subplot(222)
    # plt.scatter(wid, gate,c=gsflg)
    # plt.xlabel('Spectral width [m/s]')
    # plt.ylabel('Range gate')

    # plt.subplot(223)
    # plt.scatter(power, gate,c=gsflg)
    # plt.xlabel('Power [dB]')
    # plt.ylabel('Range gate')

    # plt.subplot(224)
    # plt.scatter(vel, wid,c=gsflg)
    # plt.xlabel('Velocity [m/s]')
    # plt.ylabel('Spectral width [m/s]')

    # fig1.tight_layout()
    #fig1.savefig(rad+'_beam'+str(beam)+'_'+stm.strftime("%y-%m-%d")+'_scatter_plot.png')
    #plt.show()

    #need to scale data before applying kmeans
    gate_scaled = preprocessing.scale(gate)
    vel_scaled = preprocessing.scale(vel)
    wid_scaled = preprocessing.scale(wid)
    power_scaled = preprocessing.scale(power)

    #data = np.column_stack((gate,vel,wid,power))
    #data = np.column_stack((vel_scaled,wid_scaled))
    full_data = np.column_stack(
        (gate_scaled, vel_scaled, wid_scaled, power_scaled))

    # Break up the dataset into non-overlapping training (95%) and testing
    # (5%) sets.
    skf = StratifiedKFold(n_splits=20)
    # Only take the first fold.
    N, D = full_data.shape  # TODO UGLY, FIX!
    train_index, test_index = next(iter(skf.split(full_data, np.ones(N))))

    data = full_data[train_index, :]
    validation_data = full_data[test_index, :]

    N, D = data.shape
    # Z = KMeans(init = 'k-means++',n_clusters = 2).fit_predict(data)
    # source
    # http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_covariances.html#sphx-glr-auto-examples-mixture-plot-gmm-covariances-py
    n_classes = 4
    cov_type = 'full'  # ['spherical', 'diag', 'tied', 'full']
    estimator = GaussianMixture(n_components=n_classes,
                                covariance_type=cov_type,
                                max_iter=20,
                                random_state=0)
    # initialize the GMM parameters in a supervised manner.
    # estimator.means_init = np.array([X_train[y_train ==i].mean(axis=0))
    estimator.means_init = np.random.random((n_classes, D)) * 2.0 - 1.0
    # Train the other parameters using the EM algorithm.
    estimator.fit(data)

    fig2 = plt.figure(2, figsize=(12, 12))

    for plot_data, marker, alpha in zip([data, validation_data], ['.', 'x'],
                                        [0.1, 0.7]):
        Z = estimator.predict(plot_data)

        #plt.subplot(111)
        #plt.scatter(plot_data[:,0], plot_data[:,1],c=Z)
        #plt.xlabel('Scaled Velocity')
        #plt.ylabel('Scaled Spectral width')

        plt.subplot(221)
        plt.scatter(plot_data[:, 1],
                    plot_data[:, 0],
                    c=Z,
                    marker=marker,
                    alpha=alpha)
        plt.xlabel('Scaled Velocity')
        plt.ylabel('Scaled Range gate')

        plt.subplot(222)
        plt.scatter(plot_data[:, 2],
                    plot_data[:, 0],
                    c=Z,
                    marker=marker,
                    alpha=alpha)
        plt.xlabel('Scaled Spectral width')
        plt.ylabel('Scaled Range gate')

        plt.subplot(223)
        plt.scatter(plot_data[:, 3],
                    plot_data[:, 0],
                    c=Z,
                    marker=marker,
                    alpha=alpha)
        plt.xlabel('Scaled Power')
        plt.ylabel('Scaled Range gate')

        plt.subplot(224)
        plt.scatter(plot_data[:, 1],
                    plot_data[:, 2],
                    c=Z,
                    marker=marker,
                    alpha=alpha)
        plt.xlabel('Scaled Velocity')
        plt.ylabel('Scaled Spectral width')

    fig2.tight_layout()

    plot_data = full_data
    Z = estimator.predict(plot_data)

    fig3 = plt.figure(3, figsize=(6, 6))
    plt.subplot(111)
    ax3 = Axes3D(fig3, elev=48, azim=134)  #, rect=[0, 0, .95, 1]
    ax3.scatter(vel, gate, wid, c=gsflg)
    ax3.set_xlabel('Velocity [m/s]')
    ax3.set_ylabel('Range gate')
    ax3.set_zlabel('Spectral width [m/s]')

    fig4 = plt.figure(4, figsize=(6, 6))
    plt.subplot(111)
    ax4 = Axes3D(fig4, elev=48, azim=134)  #, rect=[0, 0, .95, 1]
    ax4.scatter(plot_data[:, 1], plot_data[:, 0], plot_data[:, 2], c=Z)
    ax4.set_xlabel('Scaled Velocity')
    ax4.set_ylabel('Scaled Range gate')
    ax4.set_zlabel('Scaled Spectral width')

    plt.show()
Example no. 16
def GMM_Sklearn(data, targets, colors, dataset, target_names):

  print('APPLY GMM...')
  X_train, Y_train = data[0], targets[0]
  X_val, Y_val = data[1], targets[1]
  X_test, Y_test = data[2], targets[2]

  n_classes = len(np.unique(Y_train))

  # Try GMMs using different types of covariances.
  estimators = {cov_type: GaussianMixture(n_components=n_classes,
                covariance_type=cov_type, max_iter=20, random_state=0)
                for cov_type in ['spherical', 'diag', 'tied', 'full']}

  n_estimators = len(estimators)

  plt.figure(figsize=(3 * n_estimators // 2, 6))
  plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05,
                      left=.01, right=.99)


  for index, (name, estimator) in enumerate(estimators.items()):
    # Since we have class labels for the training data, we can
    # initialize the GMM parameters in a supervised manner.
    estimator.means_init = np.array([X_train[Y_train == i].mean(axis=0)
                                    for i in range(n_classes)])

    # Train the other parameters using the EM algorithm.
    estimator.fit(X_train)

    h = plt.subplot(2, n_estimators // 2, index + 1)
    make_ellipses(estimator, h, colors)

    for n, color in enumerate(colors):
      dataf = X_train[(Y_train == n)]
      plt.scatter(dataf[:, 0], dataf[:, 1], s=0.8, color=color,
                  label=target_names[n])
    # Plot the test data with crosses
    for n, color in enumerate(colors):
      dataf = X_test[Y_test == n]
      plt.scatter(dataf[:, 0], dataf[:, 1], marker='x', color=color)

    y_train_pred = estimator.predict(X_train)
    train_accuracy = np.mean(y_train_pred.ravel() == Y_train.ravel()) * 100
    plt.text(0.05, 0.9, 'Train accuracy: %.1f' % train_accuracy,
            transform=h.transAxes)

    y_test_pred = estimator.predict(X_test)
    test_accuracy = np.mean(y_test_pred.ravel() == Y_test.ravel()) * 100
    plt.text(0.05, 0.8, 'Test accuracy: %.1f' % test_accuracy,
            transform=h.transAxes)

    plt.xticks(())
    plt.yticks(())
    plt.title(name)

  plt.legend(scatterpoints=1, loc='lower right', prop=dict(size=12))

  ### APPLY GMM ### 
  estimator = GaussianMixture(n_components=n_classes,
                covariance_type='tied', max_iter=20, random_state=0)

  estimator.means_init = np.array([X_train[Y_train == i].mean(axis=0)
                                      for i in range(n_classes)])

  # Train the other parameters using the EM algorithm.
  estimator.fit(X_train)
  predictions = estimator.predict(X_test)
  scores = estimator.predict_proba(X_test)

  Y_test = list(Y_test) 

  assert len(predictions) == len(scores)
  assert len(scores) == len(Y_test)

  acc, prec, rec, sens, spec = evaluate(estimator, X_test, Y_test, np.array(predictions), np.array(scores[:,1]), np.array(Y_test), n_classes, 'gmm' + dataset) 
  print('Test Accuracy, Precision, Recall', acc, prec, rec)
  print()


  # print("Classification report for classifier %s:\n%s\n"
  #     % (estimator, metrics.classification_report(Y_test, predictions)))
  # plt.figure() 
  # disp = metrics.plot_confusion_matrix(estimator, X_test, Y_test)
  # disp.figure_.suptitle("Confusion Matrix")
  # print("Confusion matrix:\n%s" % disp.confusion_matrix)
  # plt.savefig('../plots/'+dataset +'_Gmm_confmatrix.png')

  return acc, prec, rec, sens, spec 
Example no. 17
print(score)
writeHTML(clusterType="Kmean_final", clusterLabel=cluster_labels)

#use the best eps for DBSCAN clustering
db = DBSCAN(eps=eps, min_samples=3).fit(XYMatrix)
db_label = np.array([i + 1 for i in db.labels_])
score = silhouette_score(XYMatrix, db_label)
print(score)
writeHTML(clusterType="DBSCAN_final", clusterLabel=db_label)

#compare KMeans clustering with GMM clustering
kmean_classes = len(np.unique(cluster_labels))

#GMM clustering with n_components = kmean_classes
gmm = GaussianMixture(n_components=kmean_classes, max_iter=20, random_state=0)
gmm.means_init = np.array(
    [XYMatrix[cluster_labels == i].mean(axis=0) for i in range(kmean_classes)])
gmm.fit(XYMatrix)
gmm_labels = gmm.predict(XYMatrix)
#compute GMM accuracy, treating the KMeans labels as ground truth
train_accuracy = np.mean(gmm_labels.ravel() == cluster_labels.ravel()) * 100
print "gmm - kmeans accuracy : ", train_accuracy

#drop the points DBSCAN labeled as noise
no_noise_matrix = np.array(XYMatrix[db_label != 0])
no_noise_label = np.array(db_label[db_label != 0])

dbscan_class = len(np.unique(no_noise_label))

gmm = GaussianMixture(n_components=dbscan_class, random_state=0)
gmm.means_init = np.array([
    no_noise_matrix[no_noise_label == i].mean(axis=0)
    for i in np.unique(no_noise_label)
])
Example no. 18
# Break up the dataset into non-overlapping training (75%) and testing
# (25%) sets.
skf = StratifiedKFold(n_splits=4)
# Only take the first fold.
train_index, test_index = next(iter(skf.split(iris.data, iris.target)))

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

print("X_train shape:",X_train.shape)
print("X_test shape:",X_test.shape)

# training
# training can be unsupervised or supervised
n_classes = 3
clf = GaussianMixture(n_components=n_classes, covariance_type='full',
                      random_state=0, max_iter=20)
clf.means_init = np.array([X_train[y_train == i].mean(axis=0)
                           for i in range(n_classes)])  # initialize from labels
clf.fit(X_test)
print("model means:", clf.means_)
print("model weights:", clf.weights_)
# prediction
# predict on the train split
y_predict = clf.predict(X_train)
print("train:", np.mean(y_predict.ravel() == y_train.ravel()))
# predict on the test split
y_predict = clf.predict(X_test)
print("test:", np.mean(y_predict.ravel() == y_test.ravel()))

Example no. 19
def em(algo,
       X_train,
       X_test,
       y_train,
       y_test,
       init_means,
       no_iter=1000,
       component_list=[3, 4, 5, 6, 7, 8, 9, 10, 11],
       num_class=7,
       toshow=1):

    array_aic = []
    array_bic = []
    array_homo = []
    array_comp = []
    array_sil = []
    array_avg_log = []

    for num_classes in component_list:

        clf = GaussianMixture(n_components=num_classes,
                              covariance_type='spherical',
                              max_iter=no_iter,
                              init_params='kmeans')
        #     clf = KMeans(n_clusters= num_classes, init='k-means++')

        clf.fit(X_train)

        y_test_pred = clf.predict(X_test)
        #Per sample average log likelihood
        avg_log = clf.score(X_test)
        array_avg_log.append(avg_log)

        #AIC on the test data
        aic = clf.aic(X_test)
        array_aic.append(aic)

        #BIC on the test data
        bic = clf.bic(X_test)
        array_bic.append(bic)

        #Homogeneity score on the test data
        homo = metrics.homogeneity_score(y_test, y_test_pred)
        array_homo.append(homo)

        #Completeness score
        comp = metrics.completeness_score(y_test, y_test_pred)
        array_comp.append(comp)

        #Silhouette score
        sil = metrics.silhouette_score(X_test, y_test_pred, metric='euclidean')
        array_sil.append(sil)

    #Generating plots

    fig1, ax1 = plt.subplots()
    ax1.plot(component_list, array_aic)
    ax1.plot(component_list, array_bic)
    plt.legend(['AIC', 'BIC'])
    plt.xlabel('Number of clusters')
    plt.title('AIC - BIC curve for EM - ' + algo)

    fig2, ax2 = plt.subplots()
    ax2.plot(component_list, array_homo)
    ax2.plot(component_list, array_comp)
    ax2.plot(component_list, array_sil)
    plt.legend(['homogeneity', 'completeness', 'silhouette'])
    plt.xlabel('Number of clusters')
    plt.title('Performance scores for EM - ' + algo)

    fig3, ax3 = plt.subplots()
    ax3.plot(component_list, array_avg_log)
    plt.xlabel('Number of clusters')
    plt.title('sample log avg likelihood for EM - ' + algo)

    if (toshow == 1):
        plt.show()

    #Training and testing accuracy for K = number of classes

    clf = GaussianMixture(n_components=num_class,
                          covariance_type='spherical',
                          max_iter=no_iter,
                          init_params='kmeans')

    #Assigning the initial means as the mean feature vector for the class
    clf.means_init = init_means

    clf.fit(X_train)

    #Training accuracy
    y_train_pred = clf.predict(X_train)
    train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
    print('Training accuracy for Expectation Maximization for K = {}:  {}'.format(
        num_class, train_accuracy))

    #Testing accuracy
    y_test_pred = clf.predict(X_test)
    test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
    print('Testing accuracy for Expectation Maximization for K = {}:  {}'.format(
        num_class, test_accuracy))

    return component_list, array_aic, array_bic, array_homo, array_comp, array_sil, array_avg_log
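
The AIC/BIC curves computed above are typically used to pick the number of components. A minimal, self-contained sketch of that selection step (synthetic 1-D data, illustrative only):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
X = np.concatenate([rng.normal(0, 1, 200),
                    rng.normal(5, 1, 200)]).reshape(-1, 1)
bics = {k: GaussianMixture(n_components=k, random_state=0).fit(X).bic(X)
        for k in range(1, 6)}
best_k = min(bics, key=bics.get)  # the lowest BIC wins
print(best_k)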
Example no. 20
def MyGaussianMixture(begin, end, iinput, nums, mystr):
    dir_path = "../Others/data/First_data.csv"
    ff = pd.read_csv(dir_path,
                     sep=',',
                     index_col=False,
                     encoding="utf-8",
                     low_memory=False)  ##Read file
    list_train = []
    list_target = []
    max_x = -200
    min_x = 200
    max_y = -200
    min_y = 200
    for item in ff.index:
        if begin > ff.iloc[item]["Detection Date"] or end < ff.iloc[item][
                "Detection Date"]:
            continue
        else:
            if float(ff.iloc[item]["Longitude"]) > max_x:
                max_x = float(ff.iloc[item]["Longitude"])
            if float(ff.iloc[item]["Latitude"]) > max_y:
                max_y = float(ff.iloc[item]["Latitude"])
            if float(ff.iloc[item]["Longitude"]) < min_x:
                min_x = float(ff.iloc[item]["Longitude"])
            if float(ff.iloc[item]["Latitude"]) < min_y:
                min_y = float(ff.iloc[item]["Latitude"])
            list_train.append([
                float(ff.iloc[item]["Longitude"]),
                float(ff.iloc[item]["Latitude"])
            ])
            list_target.append(int(ff.iloc[item]["Lab Status"]))
    skf = StratifiedKFold(n_splits=2, random_state=0, shuffle=True)
    train = np.array(list_train)
    target = np.array(list_target)
    my_x_ticks = np.linspace(min_x, max_x, 5)
    my_y_ticks = np.arange(min_y, max_y, 5)
    train_index, test_index = next(iter(skf.split(train, target)))
    X_train = train[train_index]
    y_train = target[train_index]
    X_test = train[test_index]
    y_test = target[test_index]
    n_classes = np.unique(y_train)
    # Try GMMs using different types of covariances.
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    list_return = []
    for index, label in enumerate(n_classes):
        estimators = GaussianMixture(n_components=1,
                                     covariance_type="full",
                                     max_iter=100,
                                     random_state=0)
        estimators.means_init = np.array(
            [X_train[y_train == label].mean(axis=0)])
        m = X_train[y_train == label]
        if m.shape[0] == 1:
            m = np.concatenate((m, m), axis=0)
        estimators.fit(m)
        ax.add_patch(make_ellipses(estimators, label))
        if iinput == label:
            list_return.append(nums)
            list_return.append(estimators.means_[0][0])
            list_return.append(estimators.means_[0][1])
            list_return.append(estimators.covariances_[0][0][0])
            list_return.append(estimators.covariances_[0][1][1])
            Test(m[:, 0], estimators.means_[0][0])
            Test(m[:, 1], estimators.means_[0][1])
        data = train[target == label]
        plt.scatter(data[:, 0],
                    data[:, 1],
                    s=0.8,
                    color=colors[label],
                    label=target_names[label])
        data = X_test[y_test == label]
        plt.scatter(data[:, 0], data[:, 1], marker='x', color=colors[label])
    plt.title(mystr)
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.xlim((-124.665014 - 0.5, -116.87368700000002 + 0.5))
    plt.ylim((45.488689 - 0.5, 49.548004 + 0.5))
    print(min_x, max_x, min_y, max_y)
    plt.legend(scatterpoints=1, loc='best', prop=dict(size=12))
    plt.show()
    return list_return
Example no. 21
def gmm_dist(rad, beam, stm, etm, data_dict):
    gate = np.hstack(data_dict['gate'])
    vel = np.hstack(data_dict['velocity'])
    wid = np.hstack(data_dict['width'])
    power = np.hstack(data_dict['power'])
    elev = np.hstack(data_dict['elevation'])
    gs_flg = np.hstack(data_dict['gsflg'])

           
    plot_rti(data_dict,'velocity',gsct=False,fig_num=1)
    
    date_time, time, freq = [], [], []
    
    num_scatter = data_dict['num_scatter']
    for i in range(len(num_scatter)):
        #date_time.extend([data_dict['datetime'][i]]*num_scatter[i])
        time.extend(date2num([data_dict['datetime'][i]]*num_scatter[i]))
        freq.extend([data_dict['frequency'][i]]*num_scatter[i])
    
    time = np.array(time)
    freq = np.array(freq)

    alpha = 0.2
    size = 2
    marker = 's'
    fig2 = plt.figure(figsize=(10,6))
    ax1 = plt.subplot(211)
    plt.scatter(time[gs_flg == 1], gate[gs_flg == 1],s=size,c='grey',marker=marker, alpha=alpha) #plot ground scatter as grey
    plt.scatter(time[gs_flg == 0], gate[gs_flg == 0],s=size,c='red',marker=marker, alpha=alpha)  #plot the other scatter (IS) as red
    ax1.xaxis.set_major_formatter(DateFormatter('%H:%M'))
    #ax1.set_xlabel('Time UT')
    ax1.set_ylabel('Range gate')

    #need to scale data before apply kmeans
    gate_scaled = preprocessing.scale(gate)
    vel_scaled = preprocessing.scale(vel)
    wid_scaled = preprocessing.scale(wid)
    power_scaled = preprocessing.scale(power)
    time_scaled = preprocessing.scale(time)
    elev_scaled = preprocessing.scale(elev)
    freq_scaled = preprocessing.scale(freq)


    #data = np.column_stack((gate,vel,wid,power))
    #data = np.column_stack((vel_scaled,wid_scaled))
    data = np.column_stack((gate_scaled, vel_scaled, wid_scaled,
                            power_scaled, elev_scaled, freq_scaled, time_scaled))
    N,D = data.shape
    n_classes = 3

    kmeans = KMeans(init = 'k-means++', n_clusters = n_classes, n_init=50).fit(data)
    


    # source
    # http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_covariances.html#sphx-glr-auto-examples-mixture-plot-gmm-covariances-py

    cov_type = 'full' # ['spherical', 'diag', 'tied', 'full']
    estimator = GaussianMixture(n_components=n_classes,
                                covariance_type=cov_type,
                                max_iter=100,
                                random_state=0)
    # initialize the GMM parameters in a supervised manner.
    #estimator.means_init = np.array([X_train[y_train == i].mean(axis=0))
    estimator.means_init = kmeans.cluster_centers_ #np.random.random((n_classes, D))*2.0-1.0
    # Train the other parameters using the EM algorithm.
    estimator.fit(data)
    Z = estimator.predict(data)

    mean_vels = np.zeros(n_classes)
    mean_wids = np.zeros(n_classes)
    for i in range(n_classes):
        mean_vels[i] = np.mean(np.abs(vel[Z == i]))
        mean_wids[i] = np.mean(wid[Z == i])
        print(mean_vels[i])
        print(mean_wids[i])

    gsfg_min_vel = np.argmin(mean_vels)   #denote the cluster with minimum mean velocity as ground scatter
    gsfg_max_vel = np.argmax(mean_vels)   #denote the cluster with maximum mean velocity as ionospheric scatter
    print(gsfg_min_vel)
    print(gsfg_max_vel)


    new_gsflg = []
    tnum_scatter = 0
    for i in range(len(num_scatter)):
        new_gsflg.append(Z[tnum_scatter:(tnum_scatter+num_scatter[i])].tolist())
        tnum_scatter += num_scatter[i]

    #ipdb.set_trace()
    data_dict['gsflg'] = new_gsflg
    #print len(new_gsflg)

    
    ax2 = plt.subplot(212)
    plt.scatter(time, gate,s=size,c='blue',marker=marker,alpha = alpha) #plot the third scatter (E region/meteor scatter or noise?) as blue
    plt.scatter(time[Z == gsfg_min_vel], gate[Z == gsfg_min_vel],s=size,c='grey',marker=marker,alpha = alpha) #plot ground scatter as grey
    plt.scatter(time[Z == gsfg_max_vel], gate[Z == gsfg_max_vel],s=size,c='red',marker=marker,alpha = alpha)  #plot ionospheric scatter as red
    ax2.xaxis.set_major_formatter(DateFormatter('%H:%M'))
    ax2.set_xlabel('Time UT')
    ax2.set_ylabel('Range gate')
    fig2.tight_layout()

    plot_rti(data_dict,'velocity',gsct=True,gsfg_min_vel=gsfg_min_vel,fig_num=3)

    #scatter_plot(data,Z)
    plt.show()
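
Examples 5 and 21 (and Example 27 below) share one idiom: fit KMeans first and feed its centroids to means_init. A minimal, self-contained version on synthetic 2-D data (illustrative only):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
data = np.vstack([rng.normal(c, 0.5, size=(100, 2)) for c in (-2.0, 0.0, 2.0)])

kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(data)
gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
gmm.means_init = kmeans.cluster_centers_  # seed EM with the KMeans centroids
labels = gmm.fit(data).predict(data)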
Example no. 22
train_index, test_index = next(iter(indices))

X_train = iris.data[train_index]
y_train = iris.target[train_index]

X_test = iris.data[test_index]
y_test = iris.target[test_index]

num_classes = len(np.unique(y_train))

gmm = GaussianMixture(n_components=num_classes,
                      covariance_type='full',
                      init_params='random',
                      random_state=0,
                      max_iter=20)
gmm.means_init = np.array(
    [X_train[y_train == i].mean(axis=0) for i in range(num_classes)])
gmm.fit(X_train)

plt.figure()
axis_handle = plt.subplot(1, 1, 1)
colors = 'bgr'
for i, color in enumerate(colors):
    eigenvalues, eigenvectors = np.linalg.eigh(gmm.covariances_[i][:2, :2])
    norm_vec = eigenvectors[0] / np.linalg.norm(eigenvectors[0])
    angle = np.arctan2(norm_vec[1], norm_vec[0])
    angle = 180 * angle / np.pi
    scaling_factor = 8
    eigenvalues *= scaling_factor
    ellipse = patches.Ellipse(gmm.means_[i, :2],
                              eigenvalues[0],
                              eigenvalues[1],
Example no. 23
X = iris.data
y = iris.target

# Create GMM for n_classes
n_classes = len(np.unique(y))
colors = ['navy', 'turquoise', 'darkorange'][0:n_classes]
estimator = GaussianMixture(n_components=n_classes,
                            covariance_type='full',
                            max_iter=100,
                            random_state=0)

plt.figure()

# Initialize the centroids
# - Randomly select a data point of each class as the starting centroid
estimator.means_init = np.array(
    [X[y == i][np.random.choice(len(X[y == i]))] for i in range(n_classes)])

# We can do better at initializing the centroid with labeled data (i.e., for unlabeled data this is not possible)
# Since we have class labels for the training data, we can
# initialize the GMM parameters in a supervised manner.
# estimator.means_init = np.array([X[y == i].mean(axis=0)
#                                  for i in range(n_classes)])

# Train the other parameters using the EM algorithm.
estimator.fit(X)

h = plt.subplot()
make_ellipses(estimator, h)

for n, color in enumerate(colors):
    data = iris.data[iris.target == n]
Example no. 24
from sklearn.mixture import GaussianMixture
estimator = GaussianMixture(n_components=n_classes,
                            covariance_type="spherical",
                            max_iter=20,
                            random_state=0)
estimator.means_init = np.array(
    [x_train[y_train == i].mean(axis=0) for i in range(n_classes)])
estimator.fit(x_train)
Example no. 25
            print(i)
    if i >= 50 and i < 100:
        if kmeans[i] != 0:
            print(i)
    if i >= 100:
        if kmeans[i] != 2:
            print(i)

#-----------------------------------------------GMM------------------------------------------------------
gmm = GaussianMixture(n_components=3, max_iter=3000)

X_gmm = list()
for i in array[:, 2]:
    X_gmm.append([i])

gmm.means_init = np.array([[1], [4], [6]])
# GaussianMixture has no covariances_init; precisions_init (inverse covariances) is the supported hook
gmm.precisions_init = np.ones((3, 1, 1))
gmm.weights_init = np.array([0.5, 0.25, 0.25])

gmm.fit(X_gmm)
gmm_result = gmm.predict(X_gmm)

print("mean: ", gmm.means_)
print("covarinace: ", gmm.covariances_)
print("weight: ", gmm.weights_)

class0 = 0
class1 = 0
class2 = 0

for i in gmm_result:
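
A note on the initialization fixed above: the hooks GaussianMixture actually reads before fit are means_init, weights_init, and precisions_init (inverse covariances); a covariances_init attribute is not part of the API and has no effect. A minimal sketch of the expected precisions_init shapes for n_components=3 on 1-D data (illustrative values):

import numpy as np
from sklearn.mixture import GaussianMixture

# precisions_init shape by covariance_type (n_components=3, n_features=1):
#   'full'      -> (3, 1, 1)
#   'tied'      -> (1, 1)
#   'diag'      -> (3, 1)
#   'spherical' -> (3,)
gmm = GaussianMixture(n_components=3, max_iter=3000)  # default covariance_type='full'
gmm.means_init = np.array([[1], [4], [6]])
gmm.precisions_init = np.ones((3, 1, 1))  # precision = 1/variance
gmm.weights_init = np.array([0.5, 0.25, 0.25])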
Example no. 26
def GMM(data, targets, colors, dataset, target_names):
    print('APPLY GMM...')
    X_train, Y_train = data[0], targets[0]
    X_val, Y_val = data[1], targets[1]
    X_test, Y_test = data[2], targets[2]

    n_classes = len(np.unique(Y_train))

    # Try GMMs using different types of covariances.
    estimators = {
        cov_type: GaussianMixture(n_components=n_classes,
                                  covariance_type=cov_type,
                                  max_iter=20,
                                  random_state=0)
        for cov_type in ['spherical', 'diag', 'tied', 'full']
    }

    n_estimators = len(estimators)

    plt.figure(figsize=(3 * n_estimators // 2, 6))
    plt.subplots_adjust(bottom=.01,
                        top=0.95,
                        hspace=.15,
                        wspace=.05,
                        left=.01,
                        right=.99)

    for index, (name, estimator) in enumerate(estimators.items()):
        # Since we have class labels for the training data, we can
        # initialize the GMM parameters in a supervised manner.
        estimator.means_init = np.array(
            [X_train[Y_train == i].mean(axis=0) for i in range(n_classes)])

        # Train the other parameters using the EM algorithm.
        estimator.fit(X_train)

        h = plt.subplot(2, n_estimators // 2, index + 1)
        make_ellipses(estimator, h, colors)

        for n, color in enumerate(colors):
            dataf = X_train[(Y_train == n)]
            plt.scatter(dataf[:, 0],
                        dataf[:, 1],
                        s=0.8,
                        color=color,
                        label=target_names[n])
        # Plot the test data with crosses
        for n, color in enumerate(colors):
            dataf = X_test[Y_test == n]
            plt.scatter(dataf[:, 0], dataf[:, 1], marker='x', color=color)

        y_train_pred = estimator.predict(X_train)
        train_accuracy = np.mean(y_train_pred.ravel() == Y_train.ravel()) * 100
        plt.text(0.05,
                 0.9,
                 'Train accuracy: %.1f' % train_accuracy,
                 transform=h.transAxes)

        y_test_pred = estimator.predict(X_test)
        test_accuracy = np.mean(y_test_pred.ravel() == Y_test.ravel()) * 100
        plt.text(0.05,
                 0.8,
                 'Test accuracy: %.1f' % test_accuracy,
                 transform=h.transAxes)

        plt.xticks(())
        plt.yticks(())
        plt.title(name)

    plt.legend(scatterpoints=1, loc='lower right', prop=dict(size=12))

    ### GMM with Tied Covariance ###

    estimator = GaussianMixture(n_components=n_classes,
                                covariance_type='tied',
                                max_iter=20,
                                random_state=0)

    estimator.means_init = np.array(
        [X_train[Y_train == i].mean(axis=0) for i in range(n_classes)])

    # Train the other parameters using the EM algorithm.
    estimator.fit(X_train)
    predictions = estimator.predict(X_test)
    scores = estimator.predict_proba(X_test)

    Y_test = list(Y_test)

    assert len(predictions) == len(scores)
    assert len(scores) == len(Y_test)

    accuracy, class_report = multiclass_evaluate(estimator, X_test, Y_test,
                                                 np.array(predictions),
                                                 np.array(scores),
                                                 np.array(Y_test), n_classes,
                                                 'gmm' + dataset)

    sensitivity = [
        class_report['0']['recall'], class_report['1']['recall'],
        class_report['2']['recall']
    ]

    confusion_matrix = sklearn.metrics.confusion_matrix(y_true=Y_test,
                                                        y_pred=predictions)

    print('Confusion Matrix', confusion_matrix)

    specificity0 = np.sum(confusion_matrix[1:, 1:]) / (
        np.sum(confusion_matrix[1:, 1:]) + confusion_matrix[1, 0] +
        confusion_matrix[2, 0])
    Tn1 = confusion_matrix[0, 0] + confusion_matrix[0, -1] + confusion_matrix[
        -1, 0] + confusion_matrix[-1, -1]
    specificity1 = Tn1 / (Tn1 + confusion_matrix[0, 1] +
                          confusion_matrix[2, 1])
    specificity2 = np.sum(confusion_matrix[:1, :1]) / (
        np.sum(confusion_matrix[:1, :1]) + confusion_matrix[0, -1] +
        confusion_matrix[1, -1])

    specificity = [specificity0, specificity1, specificity2]

    assert specificity2 < 1 and specificity1 < 1 and specificity0 < 1

    return accuracy, sensitivity, specificity
Example no. 27
def gmm_dist(rad, beam, stm, etm):

    #Read data with empirical model information and RTI plot###########################################################################################
    data_dict = read_from_updated_db(rad, beam, stm, etm)
    plot_rti_emp(rad,
                 beam,
                 stm,
                 etm,
                 data_dict,
                 'velocity',
                 fig_num=1,
                 title_str='Empirical Model Results')

    gs_hops = [1.0, 2.0, 3.0]
    is_hops = [0.5, 1.5, 2.5]

    #emp_gsflg = np.hstack(data_dict['gsflg'])
    emp_gate = np.hstack(data_dict['gate'])
    emp_time, emp_gsflg = [], []
    emp_num_scatter = data_dict['num_scatter']

    for i in range(len(emp_num_scatter)):
        emp_time.extend(
            date2num([data_dict['datetime'][i]] * emp_num_scatter[i]))
        for j in range(len(data_dict['hop'][i])):
            if data_dict['hop'][i][j] in is_hops:
                emp_gsflg.append(0)
            elif data_dict['hop'][i][j] in gs_hops:
                emp_gsflg.append(1)

    emp_gsflg = np.array(emp_gsflg)
    emp_time = np.array(emp_time)
    #Read data with empirical model information and RTI plot##########################################################################################

    #Read data from database and RTI plot###########################################################################################################
    plot_rti(rad,
             beam,
             stm,
             etm,
             data_dict,
             'velocity',
             gsct=False,
             fig_num=2,
             title_str='Traditional Model Results')

    gate = np.hstack(data_dict['gate'])
    vel = np.hstack(data_dict['velocity'])
    wid = np.hstack(data_dict['width'])
    power = np.hstack(data_dict['power'])
    elev = np.hstack(data_dict['elevation'])
    gs_flg = np.hstack(data_dict['gsflg'])

    time, freq = [], []

    num_scatter = data_dict['num_scatter']
    for i in range(len(num_scatter)):
        #date_time.extend([data_dict['datetime'][i]]*num_scatter[i])
        time.extend(date2num([data_dict['datetime'][i]] * num_scatter[i]))
        freq.extend([data_dict['frequency'][i]] * num_scatter[i])

    time = np.array(time)
    freq = np.array(freq)
    #Read data from database and RTI plot#############################################################################################################

    #GMM and RTI plot#################################################################################################################################
    #need to scale data before apply kmeans
    gate_scaled = preprocessing.scale(gate)
    vel_scaled = preprocessing.scale(vel)
    wid_scaled = preprocessing.scale(wid)
    power_scaled = preprocessing.scale(power)
    time_scaled = preprocessing.scale(time)
    elev_scaled = preprocessing.scale(elev)
    freq_scaled = preprocessing.scale(freq)

    data = np.column_stack((gate_scaled, vel_scaled, wid_scaled,
                            power_scaled, elev_scaled, freq_scaled, time_scaled))
    N, D = data.shape
    n_classes = 3

    kmeans = KMeans(init='k-means++', n_clusters=n_classes,
                    n_init=50).fit(data)

    # source
    # http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_covariances.html#sphx-glr-auto-examples-mixture-plot-gmm-covariances-py
    cov_type = 'full'  # ['spherical', 'diag', 'tied', 'full']
    estimator = GaussianMixture(n_components=n_classes,
                                covariance_type=cov_type,
                                max_iter=100,
                                random_state=0)
    # initialize the GMM parameters with kmean centroid
    estimator.means_init = kmeans.cluster_centers_  #np.random.random((n_classes, D))*2.0-1.0
    # Train the other parameters using the EM algorithm.
    estimator.fit(data)
    Z = estimator.predict(data)

    median_vels = np.zeros(n_classes)
    median_wids = np.zeros(n_classes)
    for i in range(n_classes):
        median_vels[i] = np.median(np.abs(vel[Z == i]))
        median_wids[i] = np.median(wid[Z == i])
        print(median_vels[i])
        print(median_wids[i])

    gsfg_min_vel = np.argmin(
        median_vels
    )  #denote the cluster with minimum median velocity as ground scatter
    gsfg_max_vel = np.argmax(
        median_vels
    )  #denote the cluster with maximum median velocity as ionospheric scatter

    for i in range(n_classes):
        if (i != gsfg_min_vel and i != gsfg_max_vel):
            gsfg_undetermined = i  #denote the third cluster as indeterminate scatter

    print(gsfg_min_vel)
    print(gsfg_max_vel)

    new_gsflg = []
    tnum_scatter = 0
    for i in range(len(num_scatter)):
        new_gsflg.append(Z[tnum_scatter:(tnum_scatter +
                                         num_scatter[i])].tolist())
        tnum_scatter += num_scatter[i]

    #ipdb.set_trace()
    data_dict['gsflg'] = new_gsflg
    #print len(new_gsflg)

    #calculate GS/IS identification accuracy##############################################################################################
    num_true_trad_gs = len(
        np.where(((gs_flg == 1) | (gs_flg == -1)) & (emp_gsflg == 1))[0])
    num_true_trad_is = len(np.where(((gs_flg == 0)) & (emp_gsflg == 0))[0])

    num_emp = len(emp_gsflg)
    accur_tra = float(num_true_trad_gs + num_true_trad_is) / num_emp * 100.
    print('The GS/IS identification accuracy of the traditional method is {:3.2f}%'.format(
        accur_tra))

    num_true_gmm_gs1 = len(
        np.where((Z == gsfg_min_vel) & (emp_gsflg == 1))
        [0])  #Assuming the GS is the cluster with minimum median velocity
    num_true_gmm_is1 = len(
        np.where(((Z == gsfg_max_vel) | (Z == gsfg_undetermined))
                 & (emp_gsflg == 0))[0])

    num_true_gmm_gs2 = len(
        np.where(((Z == gsfg_min_vel) | (Z == gsfg_undetermined))
                 & (emp_gsflg == 1))[0])
    num_true_gmm_is2 = len(
        np.where((Z == gsfg_max_vel) & (emp_gsflg == 0))
        [0])  #Assuming the IS is the cluster with maximum median velocity

    accur_gmm1 = float(num_true_gmm_gs1 + num_true_gmm_is1) / num_emp * 100.
    print('Assuming the GS is the cluster with minimum median velocity and the IS is the remaining two clusters, the GS/IS identification accuracy of GMM is {:3.2f}%'.format(
        accur_gmm1))

    accur_gmm2 = float(num_true_gmm_gs2 + num_true_gmm_is2) / num_emp * 100.
    print('Assuming the IS is the cluster with maximum median velocity and the GS is the remaining two clusters, the GS/IS identification accuracy of GMM is {:3.2f}%'.format(
        accur_gmm2))

    accur_gmm = max(accur_gmm1, accur_gmm2)
    #calculate GS/IS identification accuracy###########################################################################################

    plot_rti(rad,
             beam,
             stm,
             etm,
             data_dict,
             'velocity',
             gsct=True,
             gsfg_min_vel=gsfg_min_vel,
             fig_num=3,
             title_str='Gaussian Mixture Model Results')
    #GMM and RTI plot#################################################################################################################################

    #scatter plot####################################################################################################################
    cm = plt.cm.get_cmap('coolwarm')
    alpha = 1.0
    size = 1
    marker = 's'
    fig4 = plt.figure(figsize=(10, 8))

    ax1 = plt.subplot(311)
    plt.scatter(emp_time[emp_gsflg == 1],
                emp_gate[emp_gsflg == 1],
                s=size,
                c='blue',
                marker=marker,
                alpha=alpha,
                cmap=cm)  #plot GS as blue
    plt.scatter(emp_time[emp_gsflg == 0],
                emp_gate[emp_gsflg == 0],
                s=size,
                c='red',
                marker=marker,
                alpha=alpha,
                cmap=cm)  #plot IS as red
    #plt.scatter(emp_time[emp_gsflg == -1], emp_gate[emp_gsflg == -1],s=size,c='blue',marker=marker, alpha=alpha)  #plot the undertermined scatter as blue
    ax1.xaxis.set_major_formatter(DateFormatter('%H:%M'))
    ax1.set_xlabel('Time UT')
    ax1.set_xlim([stm, etm])
    ax1.set_ylabel('Range gate')
    ax1.set_title('Empirical Model Results based on Burrell et al. 2015')

    ax2 = plt.subplot(312)
    plt.scatter(time[gs_flg == 1],
                gate[gs_flg == 1],
                s=size,
                c='blue',
                marker=marker,
                alpha=alpha,
                cmap=cm)  #plot GS as blue
    plt.scatter(time[gs_flg == 0],
                gate[gs_flg == 0],
                s=size,
                c='red',
                marker=marker,
                alpha=alpha,
                cmap=cm)  #plot IS as red
    #the indeterminate updated gflg (-1) was original ground scatter in traditional method when using the emp_data_dict
    plt.scatter(time[gs_flg == -1],
                gate[gs_flg == -1],
                s=size,
                c='blue',
                marker=marker,
                alpha=alpha,
                cmap=cm)
    ax2.xaxis.set_major_formatter(DateFormatter('%H:%M'))
    #ax1.set_xlabel('Time UT')
    ax2.set_xlim([stm, etm])
    ax2.set_ylabel('Range gate')
    ax2.set_title(
        'Traditional Model Results based on Blanchard et al. 2009 with an Accuracy of {:3.2f}%'
        .format(accur_tra))

    ax3 = plt.subplot(313)
    plt.scatter(time[Z == gsfg_min_vel],
                gate[Z == gsfg_min_vel],
                s=size,
                c='blue',
                marker=marker,
                alpha=alpha,
                cmap=cm)  #plot ground scatter as blue
    plt.scatter(time[Z == gsfg_max_vel],
                gate[Z == gsfg_max_vel],
                s=size,
                c='red',
                marker=marker,
                alpha=alpha,
                cmap=cm)  #plot ionospheric scatter as red
    #plot the third scatter (E region/meteor scatter or noise, sometimes GS) as blue
    if accur_gmm1 > accur_gmm2:
        plt.scatter(time[Z == gsfg_undetermined],
                    gate[Z == gsfg_undetermined],
                    s=size,
                    c='red',
                    marker=marker,
                    alpha=alpha,
                    cmap=cm)
    else:
        plt.scatter(time[Z == gsfg_undetermined],
                    gate[Z == gsfg_undetermined],
                    s=size,
                    c='blue',
                    marker=marker,
                    alpha=alpha,
                    cmap=cm)
    ax3.xaxis.set_major_formatter(DateFormatter('%H:%M'))
    ax3.set_xlabel('Time UT')
    ax3.set_xlim([stm, etm])
    ax3.set_ylabel('Range gate')
    ax3.set_title(
        'Gaussian Mixture Model Results with an Accuracy of {:3.2f}%'.format(
            accur_gmm))

    fig4.tight_layout()
    fig4.savefig('Fig4.png')
    #scatter plot#######################################################################################################################

    plt.show()