def load(self, phipsis):
        self.length = len(phipsis)

        num_component = min(10, self.length)
        gm_ = GM(n_components=num_component)
        gm_.fit(X=phipsis)
        weights = gm_.weights_
        to_keep = weights > 0.05
        num_component = sum(to_keep)

        gm = GM(n_components=num_component)
        gm.fit(X=phipsis)
        precisions = gm.precisions_cholesky_

        # self.means = gm.means_
        self.phipsis = phipsis
        weight = np.mean(precisions[:, 0, 0]) \
                 + np.mean(precisions[:, 1, 1])
        weight = weight * self.weight_scaling_factor  # for matcher weight
        self.weight = min(weight, 1)
        self.weight *= self.weight_accom_factor
        covs = gm.covariances_
        cov_invs = np.array([np.linalg.inv(cov) for cov in covs])
        cluster_dist = gm.predict_proba(phipsis)
        self.cov_dist = np.einsum("ijk, li->ljk", cov_invs, cluster_dist)
        self.gm = gm  # for matcher weight
        # matcher_weight should be a product of the precision/clustering
        # behaviour of the distribution, and the posterior probability of the
        #  queried point. So, higher clustering but point does not belong in
        # distribution => other pressures acting on queried point => should
        # assign lower weight. Lower clustering and point belong => low
        # clustering means low pressure on point, so it shouldn't matter that
        #  much.
        return
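A possible companion to the load() method above, sketching how the matcher weight described in the closing comment could combine the precision-based self.weight with how well a queried (phi, psi) point fits the mixture. The method name matcher_weight and the use of score_samples are assumptions, not part of the original class.

    def matcher_weight(self, query_phipsi):
        # Hypothetical helper: damp the precision-based weight by the mixture
        # density at the queried point, so a tightly clustered distribution
        # that the query does not belong to still gets a low weight.
        query = np.asarray(query_phipsi, dtype=float).reshape(1, -1)
        density = np.exp(self.gm.score_samples(query))[0]
        return self.weight * min(density, 1.0)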
Example #2
def gmmFit(data, numMixRange=10, scoreToUse='sil', threshold=0.9, mixToUse=0):
    scaler = dataScaler().fit(data)
    X = scaler.transform(data)
    likelihood_threshold = np.quantile(scaler.transform(data), 1 - threshold)
    scoreList = []
    #scoreToUse = 'db' #or sil, or nothing for my old bad way
    # Silhouette is better when higher; Davies-Bouldin and BIC are better when lower.
    if scoreToUse == 'sil':
        bestScore = -np.inf
    else:
        bestScore = np.inf
    if mixToUse == 0:
        for i in np.arange(2, numMixRange + 1):
            gmm = GM(i).fit(X)
            labels = gmm.predict(X)

            if scoreToUse == 'sil':
                currentScore = silScore(X, labels)
                if currentScore > bestScore:  # keep the highest silhouette
                    bestScore = currentScore
                    bestGmm = gmm
                    bestMix = i

            elif scoreToUse == 'db':
                currentScore = dbScore(X, labels)
                if currentScore < bestScore:  # keep the lowest Davies-Bouldin score
                    bestScore = currentScore
                    bestGmm = gmm
                    bestMix = i

            else:
                currentScore = gmm.bic(X) + (20 * (i**1.5 / numMixRange))
                if currentScore < bestScore:  # keep the lowest penalized BIC
                    bestScore = currentScore
                    bestGmm = gmm
                    bestMix = i

            scoreList.append(currentScore)
    else:
        bestGmm = GM(mixToUse).fit(X)
        bestScore = 8
        bestMix = mixToUse

    clf = bestGmm

    #Get decision Boundaries with scatter plotting
    minMax = np.array([[0, 0], [10240, 6]])
    minMaxScaled = scaler.transform(minMax)
    meshSize = 1000
    xx, yy = np.meshgrid(
        np.linspace(minMaxScaled[0, 0], minMaxScaled[1, 0], meshSize),
        np.linspace(minMaxScaled[0, 1], minMaxScaled[1, 1], meshSize))
    U = np.concatenate([xx[..., None], yy[..., None]], 2).reshape([-1, 2])
    prob_U = -clf.score_samples(U) + likelihood_threshold
    uPlot = prob_U.reshape([meshSize, meshSize])
    mask = (uPlot < 0).astype(int)
    scatterCoord = mask.nonzero()
    scatterCoordFreq = scatterCoord[1]
    scatterCoordMag = scatterCoord[0]
    return clf, bestMix, scaler, bestScore, scatterCoordMag, scatterCoordFreq
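A hedged usage sketch for gmmFit; the input file, its columns, and the (frequency, magnitude) interpretation are assumptions, and dataScaler/silScore/dbScore are the helpers this module is assumed to import.

data = np.loadtxt('peak_table.csv', delimiter=',', skiprows=1)  # hypothetical rows of (frequency, magnitude)
clf, bestMix, scaler, bestScore, coordMag, coordFreq = gmmFit(data, numMixRange=8, scoreToUse='sil')
print('selected number of mixtures:', bestMix, 'silhouette:', bestScore)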
Example #3
def one_or_two_mixtures(X, alpha=0.05, min_dist=0.2, min_zscore=2):
    column = np.array(X).reshape(-1, 1)
    gm = GM(n_components=2).fit(column)
    inv_map = trygmonvector(gm, X)
    mean = np.mean(X)
    std = np.std(X)

    if len(inv_map) <= 1 or len(inv_map[0]) < 3 or len(inv_map[1]) < 3:
        gm = GM(n_components=1).fit(column)
        mi = confint(X)
        return {
            "data": X,
            "mean": mean,
            "std": std,
            "gm": gm,
            "low_means": [mi["low"]],
            "high_means": [mi["high"]],
            "n": [len(X)]
        }

    mi1 = confint(inv_map[0], alpha=alpha)
    mi2 = confint(inv_map[1], alpha=alpha)
    if dist(mi1, mi2) <= min_dist or abs(gm.means_[1][0] - gm.means_[0][0]) / (
            max(gm.covariances_)[0][0]) < min_zscore:
        gm = GM(n_components=1).fit(column)
        mi = confint(X)
        result = {
            "data": X,
            "mean": mean,
            "std": std,
            "gm": gm,
            "low_means": [mi["low"]],
            "high_means": [mi["high"]],
            "n": [len(X)]
        }
    elif mi1["low"] < mi2["low"]:
        result = {
            "data": X,
            "mean": mean,
            "std": std,
            "gm": gm,
            "label_order": [0, 1],
            "low_means": [mi1["low"], mi2["low"]],
            "high_means": [mi1["high"], mi2["high"]],
            "n": [mi1["n"], mi2["n"]]
        }
    else:
        result = {
            "data": X,
            "mean": mean,
            "std": std,
            "gm": gm,
            "label_order": [1, 0],
            "low_means": [mi2["low"], mi1["low"]],
            "high_means": [mi2["high"], mi1["high"]],
            "n": [mi2["n"], mi1["n"]]
        }
    return result
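one_or_two_mixtures depends on confint (as well as dist and trygmonvector), which are not shown here. Below is a minimal sketch of a normal-approximation confint, assuming it is expected to return the "low", "high" and "n" keys used above.

def confint(values, alpha=0.05):
    # Assumed implementation of the missing helper: a normal-approximation
    # confidence interval for the mean of `values`.
    from scipy import stats
    values = np.asarray(values, dtype=float)
    n = len(values)
    sem = values.std(ddof=1) / np.sqrt(n)
    z = stats.norm.ppf(1 - alpha / 2)
    return {"low": values.mean() - z * sem, "high": values.mean() + z * sem, "n": n}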
Example #4
    def train(self, train_file):

        # Load training data. train_file must be in the same folder as the script implementing this class.

        (train_t, train_ir) = np.loadtxt(train_file,
                                         delimiter=',',
                                         skiprows=1,
                                         unpack=True)

        # Reshape training data to be a 2D array

        train_ir = np.array([train_ir]).reshape(-1, 1)
        train_t = np.array([train_t]).reshape(-1, 1)

        # Unit normalize

        train_ir = self._normalize(train_ir)

        # Create GMM object

        gmm = GM(n_components=2)

        # Find parameters for GMM based on training data

        self.model = gmm.fit(train_ir)

        if self.plot:
            self.plot_histo(train_ir)
            self.plot_labels(train_t, train_ir)
    def _gmm_fit(self, x, n_components):
        """Fit a Gaussian Mixture Model to the data given by x.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_attributes)
            The data to be fit.

        n_components : int
            The number of components to use in the fit. Although
            this is a parameter of the overall class, it is
            included here to facilitate parallel processing.

        Returns
        -------
        model : GaussianMixture from the sklearn package
            The GaussianMixture object that has been fit to the data.
        """
        model = GM(n_components=n_components,
                   tol=self.tol,
                   max_iter=self.max_iter,
                   n_init=self.n_init,
                   covariance_type=self.cov_type)
        data = x.astype('float32')
        model.fit(data)

        return model
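Since the docstring notes that n_components is passed explicitly to facilitate parallel processing, here is a sketch of how such a selection loop could look, assuming joblib is available; the method name select_n_components is hypothetical.

    def select_n_components(self, x, candidates=range(1, 11)):
        # Hypothetical companion method: fit one mixture per candidate
        # component count in parallel and keep the model with the lowest BIC.
        from joblib import Parallel, delayed
        models = Parallel(n_jobs=-1)(
            delayed(self._gmm_fit)(x, n) for n in candidates)
        bics = [m.bic(x.astype('float32')) for m in models]
        return models[int(np.argmin(bics))]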
Example #6
    def train(self, train_file):
        """
        Loads the training data and learns the GMM. Optionally plots the results.
        :param train_file: filename for training file containing time and pre-processed ir data.
        :return: None
        """
        # Load training data. train_file must be in the same folder as the script implementing this class.
        train_t, train_ir = np.loadtxt(train_file,
                                       delimiter=",",
                                       skiprows=1,
                                       unpack=True)

        # Reshape training data to be a 2D array
        train_ir = train_ir.reshape(len(train_ir), 1)
        train_t = train_t.reshape(len(train_ir), 1)

        # Unit normalize
        train_ir = self._normalize(train_ir)

        # Create GMM object
        # mean guesses for the normalized data
        gmm = GM(n_components=2, means_init=np.array([[0.25], [0.85]]))

        # Find parameters for GMM based on training data
        self.model = gmm.fit(train_ir)
        if self.plot:
            histo = self.plot_histo(train_ir)
            histo.axes[0].set_title(f"Histogram for: {train_file}")
            labels = self.plot_labels(train_t, train_ir)
            labels.axes[0].set_title(f"Labels for: {train_file}")
        return
    def make_gmm(self):
        model_kwds = dict(n_components=self.gmm_components,
                          max_iter=self.max_iter,
                          n_init=100,
                          random_state=self.random_state)

        gmm = GM(**model_kwds)
        return gmm
Example #8
 def train(self):
     n_epochs = 250
     self.trainer.train(n_epochs=n_epochs)
     latent, batch_indices, labels = self.trainer.train_set.get_latent()
     gm = GM(n_components=self.n_label, covariance_type='tied')
     gm.fit(latent)
     self.gm = gm
     return
Example #9
def gmm(path):
    train, test = data_split(path)
    gm = GM(n_components=6, random_state=0).fit(train)
    preds = gm.predict(test)
    mse = mean_squared_error(test['price'], preds)
    preds_rmse = mse**(1 / 2)
    print(preds_rmse)
    print(mse)
Example #10
    def load(self, phipsis):
        self.length = len(phipsis)
        if np.allclose(phipsis, np.full(phipsis.shape, 360)):
            self.to_skip = True
            return
        i_to_ignore = np.array(phipsis == np.array([360., 360.]))[:, 0]
        self.ignored_i = i_to_ignore
        phipsis = phipsis[~i_to_ignore]

        phipsi_median = np.median(phipsis, axis=0)
        phipsis = phipsis - phipsi_median
        phipsis[phipsis > 180] -= 360.
        phipsis[phipsis < -180] += 360.

        gm_ = GM(n_components=30)
        gm_.fit(X=phipsis)
        weights = gm_.weights_
        to_keep = weights > 0.05
        num_component = sum(to_keep)

        gm = GM(n_components=num_component)
        gm.fit(X=phipsis)
        precisions = gm.precisions_cholesky_

        # self.means = gm.means_
        self.phipsis = phipsis
        self.medians = phipsi_median
        weight = np.mean(precisions[:, 0, 0]) \
                 + np.mean(precisions[:, 1, 1])
        weight = weight * self.weight_scaling_factor  # for matcher weight
        self.weight = float(min(weight, 1.))

        covs = gm.covariances_
        cov_invs = np.array([np.linalg.inv(cov) for cov in covs])
        cluster_dist = gm.predict_proba(phipsis)
        self.cov_dist = np.einsum("ijk, li->ljk", cov_invs, cluster_dist)
        self.gm = gm  # for matcher weight
        # matcher_weight should be a product of the precision/clustering
        # behaviour of the distribution, and the posterior probability of the
        #  queried point. So, higher clustering but point does not belong in
        # distribution => other pressures acting on queried point => should
        # assign lower weight. Lower clustering and point belong => low
        # clustering means low pressure on point, so it shouldn't matter that
        #  much.
        return
Example #11
def evaluate_kmeans(X, y, problem, out='./results/Clustering/'):
    """Also evaluate kmeans and em both"""
    sm = SMOTE()
    X_res, y_res = sm.fit_sample(X, y)

    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    distort_km = []
    distort_gm = []
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = KMeans(random_state=5)
    gm = GM(random_state=5)

    st = clock()
    clusters = [2, 3, 4, 5, 6]
    for k in clusters:
        print('now doing k=' + str(k))
        km.set_params(n_clusters=k)
        gm.set_params(n_components=k)
        km.fit(X_res)
        gm.fit(X_res)

        #distort_km.append(sum(np.min(cdist(X, km.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
        ##distort_gm.append(sum(np.min(cdist(X, gm.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
        SSE[k][problem] = km.score(X_res)
        ll[k][problem] = gm.score(X_res)
        print('km score:', SSE[k][problem])
        print('gm score:', ll[k][problem])
        acc[k][problem]['Kmeans'] = cluster_acc(y_res, km.predict(X_res))
        acc[k][problem]['GM'] = cluster_acc(y_res, gm.predict(X_res))
        adjMI[k][problem]['Kmeans'] = metrics.adjusted_mutual_info_score(
            y_res, km.predict(X_res))
        adjMI[k][problem]['GM'] = metrics.adjusted_mutual_info_score(
            y_res, gm.predict(X_res))

    print(k, clock() - st)

    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(out + problem + ' SSE.csv')
    ll.to_csv(out + problem + ' logliklihood.csv')
    acc.ix[:, :, problem].to_csv(out + problem + ' acc.csv')
    adjMI.ix[:, :, problem].to_csv(out + problem + ' adjMI.csv')

    return SSE, ll, acc, adjMI, km, gm
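Note that pd.Panel and the .ix indexer used above were removed in later pandas releases; below is a sketch of the same bookkeeping with plain DataFrames, assuming only the per-problem accuracy and adjusted-MI tables are needed.

# Inside evaluate_kmeans, the Panel-based lines could be replaced with:
acc_df = pd.DataFrame({k: acc[k][problem] for k in clusters}).T      # rows: k, columns: Kmeans / GM
adjMI_df = pd.DataFrame({k: adjMI[k][problem] for k in clusters}).T
acc_df.to_csv(out + problem + ' acc.csv')
adjMI_df.to_csv(out + problem + ' adjMI.csv')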
Example #12
def evaluate_cluster_results(X, Y, title, km_clusters, em_components, em_type):
    km = KMeans(n_clusters=km_clusters, random_state=randomSeed, n_jobs=-1)
    evaluate_kmeans(km, X, Y, title)
    df = pd.DataFrame(km.cluster_centers_)
    df.to_csv(csvdir + "/" + title + "kMeansCenters.csv")

    em = GM(n_components=em_components,
            covariance_type=em_type,
            warm_start=True,
            random_state=randomSeed)
    evaluate_EM(em, X, Y, title)
    df = pd.DataFrame(em.means_)
    df.to_csv(csvdir + "/" + title + "EMComponentMeans.csv")
Example #13
def init_PRS():
    data = sio.loadmat('stats.mat')
    ax_distrs = data['ax_distrs']
    Axd = ax_distrs[0, 0]
    mus = np.zeros((3, len(Axd)), dtype=float)
    ps = np.zeros((3, len(Axd) * 2), dtype=float)
    n = 0
    GMs = []
    for value in Axd:
        a = value * value
        mu = np.mean(a, axis=1)
        temp = np.matlib.repmat(mu, 1, np.shape(a)[1])
        temp = np.reshape(temp, (np.shape(a)[1], 3))
        a = a - temp.T
        eigv, V = np.linalg.eig(np.dot(a, a.T))
        idx = eigv.argsort()  #[::-1]
        V = V[:, idx]
        p = V[:, 1:3]
        # p = np.c_[p,V[:,0]]
        ared = np.dot(p.T, a)
        try:
            gmm = GM(n_components=10).fit(ared.T)
        except:
            print('cannot do it with 10 components, trying with 3')
            try:
                gmm = GM(n_components=3).fit(ared.T)
            except:
                print('cannot do it with 3 components, trying with 1')
                gmm = GM(n_components=1).fit(ared.T)

        GMs.append(gmm)
        mus[:, n] = mu
        ps[:, 2 * n:2 * (n + 1)] = p
        n += 1

    return GMs, mus, ps
def gauss_cluster_lc(in_path):
	lc = pd.read_csv(in_path)  # assumed: load the light-curve statistics table from in_path
	X_lc = lc[['AVE', 'KUR', 'MAX', 'MIN', 'RAN', 'STD']]	
	y_lc = lc['NUM']	
	model = PCA(n_components = 2)
	model.fit(X_lc)
	X_2D = model.transform(X_lc)		
	lc['PCA1'] = X_2D[:, 0]
	#print(lc['PCA1'])
	lc['PCA2'] = X_2D[:, 1]	
	
	model = GM(n_components=6, covariance_type='full')
	model.fit(X_lc)
	y_gmm = model.predict(X_lc)
	lc['cluster']= y_gmm
	df = pd.DataFrame(lc)
	sdf = df.sort_values(by=['RAN'])
	return sdf 
Example #15
 def test_split(self, xdata, model=None):
     """Test split from some value from one
     dimension to create two new children.
     Using unsupervised method when model is None,
     otherwise supervised"""
     x = [xdata[i] for i in self.index]
     if model is None:
         gm = GM(n_components=2, max_iter=500).fit(x)
         p = gm.predict_proba(x)[:, 1]
     else:
         p = model.pred_prob(x)
     """(best dimension, best split value, lowest Gini score)"""
     d, v, score = None, None, sys.float_info.max
     for dj in range(len(self.region)):
         """dict(): key is distinct value on
         current dimension, value is a tuple
         of [number of points, number of '1's]"""
         val2cnts = dict()
         for xi, pi in zip(x, p):
             xij = xi[dj]
             if xij not in val2cnts:
                 val2cnts[xij] = [0, 0]
             val2cnts[xij][0] += 1
             val2cnts[xij][1] += pi
         val2cnts = sorted(val2cnts.items(), key=lambda t: t[0])
         if len(val2cnts) < 2: continue
         n = float(len(x))
         left_n, right_n = 0, n
         left_p, right_p = 0.0, float(sum(pi for pi in p))
         for (val, cnts) in val2cnts[:-1]:
             left_n += cnts[0]
             left_p += cnts[1]
             left_label = left_p / left_n
             right_n -= cnts[0]
             right_p -= cnts[1]
             right_label = right_p / right_n
             s = left_n/n*gini(left_label)+\
                     right_n/n*gini(right_label)
             if s < score:
                 d, v, score = dj, val, s
     assert (d is not None) and (v is not None)
     return d, v
Example #16
def clustering(idTfidf, num_clu, term_num):
    docFeature = idTfidf
    vecTfidf = {}
    for file in idTfidf:
        row = np.zeros(len(idTfidf[file]))
        col = idTfidf[file].keys()
        val = idTfidf[file].values()
        vec = csc_matrix((np.array(val), (np.array(row), np.array(col))), shape=(1, term_num))
        vecTfidf[file] = vec.todense().tolist()[0]
    # print vecTfidf
    features = vecTfidf.values()
    # print features

    selection = 'GM'  # selecting model here!!! Options: AgglomerativeClustering as AC, SpectralClustering as SC, GMM

    if selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)

    resDic = {}
    for i in range(len(res)):
        if not resDic.has_key(res[i]):
            resDic[res[i]] = []
            resDic[res[i]].append(int(docFeature.keys()[i]))
        else:
            resDic[res[i]].append(int(docFeature.keys()[i]))
    result = resDic.values()
    # print result
    with open('gt_GMRes.json', 'w') as f:
        f.write(json.dumps(result))

    return result
Example #17
 def train(self, train_file):
     # Load training data. train_file must be in the same folder as the script implementing this class.
     train_t, train_ir = np.loadtxt(train_file,
                                    delimiter=",",
                                    skiprows=1,
                                    unpack=True)
     # Reshape training data to be a 2D array
     train_ir = np.array(train_ir).reshape(-1, 1)
     train_t = np.array(train_t).reshape(-1, 1)
     # Unit normalize
     train_ir = self._normalize(train_ir)
     # Create GMM object
     means_init = [[np.min(train_ir)], [np.max(train_ir)]]
     gmm = GM(n_components=2, means_init=means_init)
     # Find parameters for GMM based on training data
     self.model = gmm.fit(train_ir)
     if self.plot:
         #Call process(self, t_data, ir_data)
         #self.process(t_data = train_t, ir_data = train_ir)
         #Call to functions with param
         self.plot_histo(ir=train_ir)
         self.plot_labels(t=train_t, ir=train_ir)
     return train_t, train_ir
Example #18
def unsupervised_clu(feature, part, model_selection):
    if part:
        if feature == 'graph':
            docFeature = json.loads(
                open('rmMultiPart1WOZeroGraph.json').read())
        if feature == 'doc2vec':
            docFeature = json.loads(open('rmMultiPart1Doc2vec.json').read())
        if feature == 'comb':
            walk = json.loads(open('rmMultiPart1WOZeroGraph.json').read())
            dv = json.loads(open('rmMultiPart1Doc2vec.json').read())
            docFeature = {}
            for doc in walk:
                val = walk[doc] + dv[doc]
                docFeature[doc] = val
        groundTruth = json.loads(open('rmMultiPart1CluInd.json').read())
        num_clu = len(groundTruth)  # number of clusters in each part
    else:
        rmMulti = True  # False #
        if rmMulti:
            if feature == 'graph':
                docFeature = json.loads(
                    open('rmMultiCluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(
                    open('rmMultiCluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(
                    open('rmMultiCluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('rmMultiCluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('rmMultiGroundTruth.json').read())
            num_clu = len(
                groundTruth
            )  # number of clusters after removing documents appearing multi-cluster, #doc = 1274 (3 all 0s for walk)
        else:
            if feature == 'graph':
                docFeature = json.loads(
                    open('cluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(open('cluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(open('cluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('cluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('groundTruth.json').read())
            num_clu = len(
                groundTruth
            )  # number of clusters before removing documents appearing multi-cluster, #doc = 1393 (3 all 0s for walk)

    features = docFeature.values()
    if model_selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if model_selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if model_selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if model_selection == 'KMeans':
        model = KMeans(n_clusters=num_clu)
    if model_selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)
    resDic = {}
    for i in range(len(res)):
        if not resDic.has_key(res[i]):
            resDic[res[i]] = []
            resDic[res[i]].append(int(docFeature.keys()[i]))
        else:
            resDic[res[i]].append(int(docFeature.keys()[i]))
    result = resDic.values()

    return (result, groundTruth)
Example #19
    # ---------- Plot Histogram ---------- #
    # Plot the histogram of your training dataset, here.
    plt.figure()
    plt.hist(data_ir_tr, 50)
    plt.xlabel("IR reading (training data)")
    plt.ylabel("Count (#)")
    plt.title("IR Signal Histogram")
    

    ##########
    # Step 4 #
    ##########
    # ---------- Find GMM ---------- #
    # Create GMM object
    means_init = [[np.min(data_ir_tr)], [np.max(data_ir_tr)]]
    gmm = GM(n_components=2, means_init= means_init)
    # Fit 2 component Gaussian to the data
    #changed param
    X = data_ir_tr.reshape(-1,1)   #Create a 2D - array   
    gmm_fit = gmm.fit(X)                                # Pass correct parameters. Remember that this expects a 2D array.
    # Retrieve Gaussian parameters
    mu0 = gmm_fit.means_[0]
    mu1 = gmm_fit.means_[1]
    sig0 = np.sqrt(gmm_fit.covariances_[0])
    sig1 = np.sqrt(gmm_fit.covariances_[1])
    w0 = gmm_fit.weights_[0]
    w1 = gmm_fit.weights_[1]

    # ---------- Plot Gaussians sum over histogram ---------- #
    # Create an "x" array from which to compute the Gaussians
    #changed param
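The example is cut off at this point; below is a sketch of the step the last comment describes, assuming scipy.stats is available and that the component pdfs are drawn over the range of data_ir_tr (they would need rescaling by bin width and sample count to match the count histogram exactly).

    from scipy.stats import norm
    x_plot = np.linspace(np.min(data_ir_tr), np.max(data_ir_tr), 500)
    g0 = w0 * norm.pdf(x_plot, np.ravel(mu0)[0], np.ravel(sig0)[0])
    g1 = w1 * norm.pdf(x_plot, np.ravel(mu1)[0], np.ravel(sig1)[0])
    plt.plot(x_plot, g0, label='component 0')
    plt.plot(x_plot, g1, label='component 1')
    plt.plot(x_plot, g0 + g1, label='GMM sum')
    plt.legend()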
Example #20
            pvals.append(float(line[P_IND]))
            #lfcvals.append(line[LFC_IND])
            lfcvals.append(abs(float(line[LFC_IND])))
            allvals.append([line[ID_IND], float(line[P_IND]), abs(float(line[LFC_IND]))])
"""
sorted1 = sorted(allvals, key=lambda x: x[1])
with open('sorted_pval.txt', 'w') as outfile:
    for e in sorted1:
        outfile.write('{}\t{}\t{}\n'.format(e[0], e[1], e[2]))
sorted1 = sorted(allvals, key=lambda x: x[2])
with open('sorted_lfc_abs.txt', 'w') as outfile:
    for e in sorted1:
        outfile.write('{}\t{}\t{}\n'.format(e[0], e[1], e[2]))
"""
pvals = np.array(pvals)
lfcvals = np.array(lfcvals)
print(max(lfcvals))
X = []
for i in range(len(ids)):
    X.append([pvals[i], lfcvals[i]])
X = np.array(X)
gmm = GM(n_components=2).fit(X).score_samples(X)
scored = []
for i, score in enumerate(gmm):
    scored.append([ids[i], score, lfcvals[i], pvals[i]])
scored = sorted(scored, key=lambda x: x[1])
with open('results_all_abs.txt', 'w') as outfile:
    outfile.write('gene\tscore\tLFC\tPVAL\n')
    for e in scored:
        outfile.write('{}\t{}\t{}\t{}\n'.format(e[0], e[1], e[2], e[3]))
Example #21
def EMObject(n_components, covariance_type):
    return GM(n_components=n_components,
              covariance_type=covariance_type,
              n_init=1,
              warm_start=True,
              random_state=100)
    # ---------- Plot Histogram ---------- #
    # Plot the histogram of your training dataset, here.
    plt.figure()
    plt.hist(data_ir_tr, 50)
    plt.xlabel("IR reading")
    plt.ylabel("Count (#)")
    plt.title("IR Signal Histogram")

    numsamps = data_ir_tr.shape[0]

    ##########
    # Step 4 #
    ##########
    # ---------- Find GMM ---------- #
    # Create GMM object
    gmm = GM(n_components=2)
    # Fit 2 component Gaussian to the data
    gmm_fit = gmm.fit(data_ir_tr.reshape(
        -1,
        1))  # Pass correct parameters. Remember that this expects a 2D array.
    # Retrieve Gaussian parameters
    mu0 = gmm_fit.means_[0]
    mu1 = gmm_fit.means_[1]
    sig0 = np.sqrt(gmm_fit.covariances_[0])
    sig1 = np.sqrt(gmm_fit.covariances_[1])
    w0 = gmm_fit.weights_[0]
    w1 = gmm_fit.weights_[1]

    # ---------- Plot Gaussians sum over histogram ---------- #
    # Create an "x" array from which to compute the Gaussians
    temp = np.shape(data_ir_tr)
Example #23
from jaco_arm import JacoStackEnv as JacoEnv
import mujoco_py
import gym
import numpy as np
import glfw
from sklearn.mixture import GaussianMixture as GM
import cv2
from sklearn.decomposition import PCA

env = JacoEnv()

traj_data = np.load('new_stack.npz', allow_pickle=True)
obs = traj_data['obs'][:30]
acs = traj_data['acs'][:30]

ret_save_list = []

#pca = PCA(n_components=3)
#nobs = pca.fit_transform(np.vstack(obs))

print(np.vstack(obs).shape)
print(obs[0].shape)
gm = GM(n_components=3, init_params='random', random_state=0)
gm.fit(np.vstack(obs))
for i in range(len(obs)):
    print('traj [', i, '] :', gm.predict(obs[i]))

#np.savez('new_stack.npz', obs = obs[:100], acs=acs[:100], rets=ret_save_list)
def myGMM(lendict, MinSup=2, mparameters=None, commonoptions=None):
	isillumina = False;
	if (not commonoptions==None) and commonoptions.has_key('SeqTech') and commonoptions['SeqTech']=="Illumina":
		isillumina = True;

	truecounts = None;
	if (not commonoptions==None) and commonoptions.has_key('truecounts'):
		truecounts = commonoptions['truecounts']

	minreads = int(MinSup);
	if minreads<2: minreads = 2

	ldkeys = lendict.keys(); ldkeys.sort();
	allocr = 'allocr:';
	for ldk in ldkeys:
		allocr += ('%d:%d, ' % (ldk, lendict[ldk]))
	logging.info(allocr)
	if commonoptions['outlog'] <= M_INFO: print allocr, minreads, MinSup

	minrepcount = 5;
	for ldk in ldkeys:
		if ldk<minrepcount: del lendict[ldk]
	ldkeys = lendict.keys(); ldkeys.sort();

	ldkeys = lendict.keys(); ldkeys.sort();
	while len(ldkeys)>1:
		lastk = ldkeys[-1]; secondlk=ldkeys[-2];
		curw = getWindowForCounts(secondlk)
		if lendict[lastk]<minreads and lastk-secondlk>curw*10:
			del lendict[lastk]
		else: break;
		ldkeys = lendict.keys(); ldkeys.sort();
	while len(ldkeys)>2:
		firstk = ldkeys[0]; secondk = ldkeys[1];
		curw = getWindowForCounts(secondk)
		if lendict[firstk]<minreads and secondk-firstk>curw*10:
			del lendict[firstk]
		else: break;
		ldkeys = lendict.keys(); ldkeys.sort();

	#for special case;
	if len(ldkeys)<1: return [[0], allocr]
	elif len(ldkeys)<2: peak2 = [ldkeys[0]];
	elif len(ldkeys)==2 or len(ldkeys)==3 or isillumina:
		maxk = ldkeys[0]
		if maxk==0: maxk = ldkeys[1]
		for ldk in ldkeys:
			if lendict[ldk]>=lendict[maxk] and ldk>0: maxk = ldk
		peak2 = [maxk]
	if len(ldkeys)<4 or isillumina:
		peak2 = checkSmallSupport(peak2, lendict, minreads)
		if len(peak2)==0: peak2 = [0]

		if isillumina:
			peak2 = getNewRepeatForIllumina(lendict, minreads, commonoptions, peak2)
		peak2.sort()
		return [peak2, allocr[:-1]];

	total_point = 0;
	mkeys = lendict.keys(); mkeys.sort();
	for mk in mkeys:
		curnum = lendict[mk] - minreads;
		if curnum<1 or mk<minrepcount: 
			continue;
		total_point += curnum
	lowcov = False;
	if total_point<50:
		total_point = 0;
		lowcov = True;
		for mk in mkeys:
			curnum = lendict[mk]
			if curnum<1 or mk<minrepcount:
				continue;
			total_point += curnum

	X = np.zeros((total_point,1))
	xi = 0;
	for mk in mkeys:
		if not lowcov:
			curnum = lendict[mk] - minreads;
		else:
			curnum = lendict[mk]
		#if curnum<1 or mk<minrepcount:
		if lendict[mk]<2 or mk<minrepcount: 
			continue;
		for j in range(curnum):
			X[xi][0] = mk; xi += 1;
	#f len(X)<200: print total_point, lowcov, len(X), X

	default_n_components = [4] #[4,3,2,5,6,4];
	#for nc in range(2, 7):
	for nc in range(3, 7):
		if nc>=total_point: break;
		if nc==4: continue
		default_n_components.append(nc)
	default_n_components.append(4)
	gmm_times = 20;
	small_covars_threhold = 0.01

	for cur_n_component_ind in range(len(default_n_components)):
		cur_n_component = default_n_components[cur_n_component_ind]
		
		atedge = True;
		for run_time in range(gmm_times):
			N = np.arange(1, (cur_n_component+1))
			models = [None for i in range(len(N))]

			for i in range(len(N)):
				models[i] = GM(N[i]).fit(X) 

			# compute the AIC and the BIC
			AIC = [m.aic(X) for m in models]
			calbic = False;
			if calbic:
				BIC = [m.bic(X) for m in models]

			mbest = models[np.argmin(AIC)]

			if commonoptions['outlog'] <= M_DEBUG:
				print 'aic', np.argmin(AIC), 
				for aic in range(len(AIC)):
					if aic==np.argmin(AIC):
						print ('<%.3f>' % (AIC[aic])),
					else:	print (' %.3f ' % (AIC[aic])),
				print ''
				if calbic:
					print 'bic', np.argmin(BIC),
					for bic in range(len(BIC)):
						if bic==np.argmin(BIC):
							print ('<%.3f>' % (BIC[bic])),
						else: print (' %.3f ' % (BIC[bic])),
					print ''
			
			if 0<np.argmin(AIC)<len(AIC)-1: 
				atedge = False;
				break;
			
		has_too_small_std = False;
		for i in range(len(mbest.means_)):
			if mbest.covariances_[i,0][0]<small_covars_threhold:
				has_too_small_std = True;
		#if (not has_too_small_std) and (not atedge): break;
		if (not atedge): break;
		elif cur_n_component_ind==len(default_n_components)-1:
			if commonoptions['outlog'] <= M_WARNING:
				print 'Warning!!!! could not find optimized model'
				logging.info('Warning!!!! could not find optimized model')
			
	#print mbest.covariances_
	mean_covars = []; 
	for i in range(len(mbest.means_)):
		curk = int(mbest.means_[i,0]+0.5) #75)
		if lendict.has_key(curk):
			if commonoptions['outlog'] <= M_DEBUG: print '>>', i, ('%9.3fm' % (mbest.means_[i,0])), ('%6d' % (lendict[curk])), ('%20.9fst' % (mbest.covariances_[i,0][0]))
			mean_covars.append([curk, mbest.means_[i,0], lendict[curk], mbest.covariances_[i,0][0]])
		else:
			closedif = sys.maxint; closekey = -1;
			for mk in mkeys:
				if closedif>abs(mk-curk):
					closedif=abs(mk-curk)
					closekey = mk
			if commonoptions['outlog'] <= M_DEBUG: print '>>', i, ('%9.3fm' % (mbest.means_[i,0])), ('%6d' % (lendict[closekey])), ('%20.9fst' % (mbest.covariances_[i,0][0])), closekey
			mean_covars.append([closekey, mbest.means_[i,0], lendict[closekey], mbest.covariances_[i,0][0]])

	fixed_boundarywidth = 50; close_ratio_threhold = 0.8
	
	remove_larger_covar_smaller_means = []
	for i in range(len(mean_covars)):
		mean_covars[i].append(getNeighbors_reads(lendict, mean_covars[i][0], minreads)[0])
	for i in range(len(mean_covars)):
		should_remove = False;
		for j in range(len(mean_covars)):
			if i==j: continue;
			if mean_covars[j][3]<small_covars_threhold: continue;

			if mean_covars[i][3]<=mean_covars[j][3] and mean_covars[i][1]<mean_covars[j][1]: pass
			elif mean_covars[i][3]>mean_covars[j][3] and mean_covars[i][1]<mean_covars[j][1]:
				meandif = (mean_covars[j][1]-mean_covars[i][1])*3
				#if meandif>10: meandif = 10
				if meandif>20: meandif = 20
				if mean_covars[i][3]>mean_covars[j][3]*meandif: should_remove = True
				else:
					cur_window_i = getWindowForCounts(mean_covars[i][0]) #int(mean_covars[i][0]/200.0+0.75)
					cur_window_j = getWindowForCounts(mean_covars[j][0]) #int(mean_covars[j][0]/200.0+0.75)
					if commonoptions['outlog'] <= M_INFO: print should_remove, mean_covars[i][0], mean_covars[j][0], cur_window_i, cur_window_j, mean_covars[i][0], mean_covars[j][0], mean_covars[i][4], mean_covars[j][4], mean_covars[j][4]*close_ratio_threhold, abs(cur_window_i-cur_window_j)==1, mean_covars[i][4]<mean_covars[j][4]*close_ratio_threhold,
					
					if abs(cur_window_i-cur_window_j)==1 and abs(mean_covars[i][0]-mean_covars[j][0])<fixed_boundarywidth:
						newneighbors = getNeighbors_reads_fixed(lendict, mean_covars[i][0], cur_window_j)
						if newneighbors[0]<mean_covars[j][4]*close_ratio_threhold: should_remove = True;
					elif mean_covars[i][4]<mean_covars[j][4]*close_ratio_threhold:
						should_remove = True;
					if commonoptions['outlog'] <= M_INFO: print should_remove
		if not should_remove:
			remove_larger_covar_smaller_means.append(mean_covars[i])

	furtherremove = []
	for i in range(len(remove_larger_covar_smaller_means)):
		should_remove = False;
		for j in range(len(remove_larger_covar_smaller_means)):
			if i==j: continue;
			if abs(remove_larger_covar_smaller_means[i][1]-remove_larger_covar_smaller_means[j][1])<1.5 and (remove_larger_covar_smaller_means[i][3]<small_covars_threhold or remove_larger_covar_smaller_means[j][3]<small_covars_threhold):
				if remove_larger_covar_smaller_means[i][3]<small_covars_threhold and (not remove_larger_covar_smaller_means[j][3]<small_covars_threhold):
					should_remove = True;
				elif (not remove_larger_covar_smaller_means[i][3]<small_covars_threhold) and (remove_larger_covar_smaller_means[j][3]<small_covars_threhold): pass
				else:
					cur_window_i = getWindowForCounts(remove_larger_covar_smaller_means[i][0])
					cur_window_j = getWindowForCounts(remove_larger_covar_smaller_means[j][0])
					if abs(cur_window_i-cur_window_j)==1 and abs(remove_larger_covar_smaller_means[i][0]-remove_larger_covar_smaller_means[j][0])<fixed_boundarywidth:
						if cur_window_i<cur_window_j:
							newneighbors = getNeighbors_reads_fixed(lendict, remove_larger_covar_smaller_means[i][0], cur_window_j)
							if newneighbors[0]<remove_larger_covar_smaller_means[j][4]: should_remove = True;
						else:
							newneighbors = getNeighbors_reads_fixed(lendict, remove_larger_covar_smaller_means[j][0], cur_window_i)
							if newneighbors[0]>remove_larger_covar_smaller_means[i][4]: should_remove = True;
					elif remove_larger_covar_smaller_means[i][4]<remove_larger_covar_smaller_means[j][4]: should_remove = True;
		if not should_remove:
			furtherremove.append(remove_larger_covar_smaller_means[i]);

	remove_larger_covar_smaller_means, furtherremove = furtherremove, remove_larger_covar_smaller_means

	if commonoptions['outlog'] <= M_DEBUG:
		print 'keep'
		for i in range(len(remove_larger_covar_smaller_means)):
			print '>>', i, ('%9.3fm' % (remove_larger_covar_smaller_means[i][1])), ('%6d' % (remove_larger_covar_smaller_means[i][2])), ('%20.9fst' % (remove_larger_covar_smaller_means[i][3])), mean_covars[i][4]

	peak2 = []; max_p1 = 0; max_p1_reads = 0;
	for i in range(len(remove_larger_covar_smaller_means)):
		cur_window_i = getWindowForCounts(remove_larger_covar_smaller_means[i][0]) #int(remove_larger_covar_smaller_means[i][0]/200.0+0.75)
		cur_window_max = getWindowForCounts(max_p1) #int(max_p1/200.0+0.75)
		if commonoptions['outlog'] <= M_INFO: print max_p1, max_p1_reads, cur_window_i, cur_window_max, remove_larger_covar_smaller_means[i][0],max_p1, '<', abs(cur_window_i-cur_window_max)==1, '>', remove_larger_covar_smaller_means[i][4]>max_p1_reads
		if abs(cur_window_i-cur_window_max)==1 and abs(remove_larger_covar_smaller_means[i][0]-max_p1)<fixed_boundarywidth:
			newneighbors = getNeighbors_reads_fixed(lendict, remove_larger_covar_smaller_means[i][0], cur_window_max) 
			if newneighbors[0] > max_p1_reads:
				max_p1 = remove_larger_covar_smaller_means[i][0]
				max_p1_reads = remove_larger_covar_smaller_means[i][4] 
		elif remove_larger_covar_smaller_means[i][4]>max_p1_reads:
			max_p1 = remove_larger_covar_smaller_means[i][0]
			max_p1_reads = remove_larger_covar_smaller_means[i][4]

	peak2.append(max_p1)
	secondpeak = {}
	for i in range(len(remove_larger_covar_smaller_means)):
		if remove_larger_covar_smaller_means[i][0]==max_p1: continue;
		if selectFromTwoX(max_p1, remove_larger_covar_smaller_means[i][0], lendict, minreads)==remove_larger_covar_smaller_means[i][0]:
			secondpeak[remove_larger_covar_smaller_means[i][0]] = []
		else:
			if remove_larger_covar_smaller_means[i][0]<max_p1 and remove_larger_covar_smaller_means[i][4]>max_p1_reads*close_ratio_threhold:
				secondpeak[remove_larger_covar_smaller_means[i][0]] = []

	if commonoptions['outlog'] <= M_INFO: print 'peak2', peak2, secondpeak
	secondpeakkeys = secondpeak.keys();
	for spk in secondpeakkeys:
		for spkj in secondpeakkeys:	
			if spkj in [max_p1, spk]: continue;
			if selectFromTwoX(spk, spkj, lendict, minreads)==spk:
				if spk not in secondpeak[spkj]: secondpeak[spkj].append(spk)
			else:
				if spkj not in secondpeak[spk]: secondpeak[spk].append(spkj)

	for spk in secondpeakkeys:
		if  len(secondpeak[spk])==0: 
			peak2.append(spk)

	if commonoptions['outlog'] <= M_INFO: print 'peak2', peak2, secondpeak	
	if len(secondpeakkeys)>0 and len(peak2)<2:
		max_p2 = 0; max_p2_reads = 0;
		for i in range(len(remove_larger_covar_smaller_means)):
			if secondpeak.has_key(remove_larger_covar_smaller_means[i][0]):
				if remove_larger_covar_smaller_means[i][4]>max_p2_reads: 
					max_p2_reads = remove_larger_covar_smaller_means[i][4]
					max_p2 = remove_larger_covar_smaller_means[i][0]

		peak2.append(max_p2)
	if commonoptions['outlog'] <= M_DEBUG: print 'peak2', peak2
	peak2 = checkSmallSupport(peak2, lendict, minreads)
	if commonoptions['outlog'] <= M_DEBUG: print 'peak2', peak2
	
	if len(peak2)==2:
		if peak2[0]==0 and (not peak2[1]==0): peak2[0] = peak2[1]
		if peak2[1]==0 and (not peak2[0]==0): peak2[1] = peak2[0]
	if len(peak2)==1 or (len(peak2)>1 and abs(peak2[0]-peak2[1])>2):
		curlen = len(peak2)
		if curlen>2: peak2 = peak2[:2]; curlen = 2  # keep at most two peaks
		for i in range(curlen):
			cur_window_max = getWindowForCounts(peak2[i])
			for npdif in range(1, cur_window_max+1):
				newpeak = peak2[i]+npdif
				if lendict.has_key(newpeak) and lendict[newpeak]>1.5*lendict[peak2[i]] and lendict[newpeak]>5*minreads:
					peak2[i] = newpeak
				newpeak = peak2[i]-npdif
				if lendict.has_key(newpeak) and lendict[newpeak]>1.5*lendict[peak2[i]] and lendict[newpeak]>5*minreads:
					peak2[i] = newpeak
			#for newpeak in range(peak2[i]-cur_window_max, peak2[i]+cur_window_max+1):
			#	if lendict.has_key(newpeak) and lendict[newpeak]>2*lendict[peak2[i]]:
			#		peak2[i] = newpeak
		if len(peak2)==1:
			peak2.append(peak2[0]);
	if len(peak2)==0: peak2 = [0,0]

	peak2.sort()

	if not truecounts==None:
		if abs(truecounts[0]-peak2[0])>5 or abs(truecounts[1]-peak2[1])>5:
			if commonoptions['outlog'] <= M_INFO: print 'Big dif, please check', peak2, truecounts

	return [peak2, allocr[:-1]]
Example #25
    def perform_gmm(self, component_select = 1, gmm_comps = 2):
        
        #Perform gaussian mixture modeling, using the component selection and number of gmm components specified
        
        if component_select!=self.component_select: 
            print('You chose a different number of components to that used previously. Recalculating!')
            self.plot_eigenvalue_composition_violin(component_select = component_select)
           
        gm_list = []
        nneighbors = self.atomseries[0].atom_neighbor_positions.shape[-2]
       
        for ind in range(len(self.all_results)):
            gm_comp = []
            for comp in range(self.num_comps):
                gm_n = []
                for m in range(nneighbors+1):

                    y = np.array(self.nn_eigenvalue_data[ind][comp][m])

                    if y.shape[0]>2:
                        gm = GM(n_components = gmm_comps)
                        gm.fit(y.reshape(-1,1))
                        means = gm.means_
                        covariances = gm.covariances_
                    else:
                        means = [np.nan,np.nan]
                        covariances = np.zeros(shape=(2,1))
                    gm_n.append((means,covariances,m))
                gm_comp.append(gm_n)
            gm_list.append(gm_comp)
        
        self.gmm_list = gm_list #This list contains the gmm results
        
        
        if self.num_comps==4:
            colors = ['r', 'k', 'b', 'g']
        else:
            cm = plt.cm.get_cmap('jet', self.num_comps)
            colors = [cm(ind) for ind in range(self.num_comps)]
        
        for ind in range(len(self.all_results)):
            fig = plt.figure()   
            
            key = list(self.atomseries[0].atom_descriptors.keys())[ind]
            not_key = [itm for itm in list(self.atomseries[0].atom_descriptors.keys()) if key not in itm]
            
            for comp in range(self.num_comps):
                for neighbor in range(nneighbors+1):
                    for gmm_comp in range(gmm_comps):
                        x = np.array(neighbor).ravel()
                        y = np.array(gm_list[ind][comp][neighbor][0][gmm_comp]).ravel()
                        yerr = np.array(np.sqrt(gm_list[ind][comp][neighbor][1][gmm_comp])).ravel()
                        plt.errorbar(neighbor,y, yerr=yerr, color = colors[comp],capsize = 6);
                        plt.plot(neighbor, y, 'o', color = colors[comp], markersize = 5);
        
            plt.xlabel('NN of ' + not_key[0], fontsize = 14)
            plt.ylabel('PCA Comp. # ' + str(self.component_select) + ' Scores', fontsize = 14)
            plt.title('GMM with ' + key + ' Neighbors', fontsize = 16)
            
            #Need to create a custom legend
            
        return fig
# dimensionality reduction
pca = PCA(n_components=2)
new_x = pca.fit_transform(x)
xtr_new, xte_new, ytr_new, yte_new = t_t_s(new_x,
                                           y,
                                           test_size=0.25,
                                           random_state=0)
print(xtr_new.shape, yte_new.shape)  # (112, 2) (38,)
model1 = GNB()
model1.fit(xtr_new, ytr_new)
ypred1 = model1.predict(xte_new)
print("PCA后分类准确率:{0:.2%}".format(a_s(yte_new, ypred1)))
df['PCA1'] = new_x[:, 0]
df['PCA2'] = new_x[:, 1]

sns.set(style='darkgrid')
sns.lmplot('PCA1', 'PCA2', data=df, hue='species', fit_reg=False)

# clustering
model2 = GM(n_components=3, covariance_type='full')
model2.fit(x)
ypred2 = model2.predict(x)
df['cluster'] = ypred2
sns.lmplot('PCA1',
           'PCA2',
           data=df,
           hue='species',
           col='cluster',
           fit_reg=False)

plt.show()
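As an optional sanity check (assuming y still holds the species labels used for the classifier above), the agreement between the three GMM clusters and the true species can be quantified:

from sklearn.metrics import adjusted_rand_score
print("Adjusted Rand index (clusters vs. species): {0:.3f}".format(
    adjusted_rand_score(y, ypred2)))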
def read_catalog_in_and_fit(input_filename,
                            output_filename_prefix,
                            mag_cutoff=None,
                            initial_means=[[-12.5, -19], [-3, -4]],
                            random_state=546):
    """
    Read in the Gaia data and fit the Gaussian mixture model

    input_filename - the input catalog, assume to be a VOTable and assumed
         to have similar format as the Gaia DR2 source catalog
    output_filename_prefix - the prefix of the name for the output pickle file, 
         which will store the fitted model
    mag_cutoff - the magnitude to split the model over.  Separate fits will
         be made for objects brighter and dimmer than this magnitude.
         A value of None will use all stars for a fit, no splitting
    initial_means - the initial guesses for the means of the two Gaussian
         components.  Random initializations (the default) lead to random
         assignments: sometimes the first component is what gets fit to 
         the cluster proper motions, other times it's the second component.
         For consistency, the default initial_means provided here is intended
         so that the first component fits the cluster proper motions, which
         have a mean value around the default value shown above.
    random_state - here we have provided a fixed seed value for the random
         number generator for consistency across runs.  Changing this value
         should have little to no effect on the final fit.

    Returns the fitted model and the read-in catalog data
    """

    # Read in the catalog
    table = parse_single_table(input_filename)
    catalog_data = table.array

    # Set up the Gaussian mixture model
    if mag_cutoff is not None:
        gm_model_bright = GM(n_components=2,
                             max_iter=300,
                             means_init=initial_means,
                             random_state=random_state)
        gm_model_dim = GM(n_components=2,
                          max_iter=300,
                          means_init=initial_means,
                          random_state=random_state)
    else:
        gm_model = GM(n_components=2,
                      max_iter=300,
                      means_init=initial_means,
                      random_state=random_state)

    # Mask out catalog entries without measured proper motions and dimmer
    # than the cutoff magnitude
    pm_mask_fitting = (~np.isnan(catalog_data['pmra'])) &\
        (~np.isnan(catalog_data['pmdec']))
    if mag_cutoff is not None:
        pm_mask_fitting_bright = pm_mask_fitting &\
            (catalog_data['phot_g_mean_mag'] < mag_cutoff)
        pm_mask_fitting_dim = pm_mask_fitting &\
            (catalog_data['phot_g_mean_mag'] >= mag_cutoff)

    # Extract the data to fit and fit it
    if mag_cutoff is not None:
        data_to_fit_bright = [
            item for item in zip(catalog_data['pmra'][pm_mask_fitting_bright],
                                 catalog_data['pmdec'][pm_mask_fitting_bright])
        ]
        data_to_fit_dim = [
            item for item in zip(catalog_data['pmra'][pm_mask_fitting_dim],
                                 catalog_data['pmdec'][pm_mask_fitting_dim])
        ]
        gm_model_bright.fit(data_to_fit_bright)
        gm_model_dim.fit(data_to_fit_dim)
    else:
        data_to_fit = [
            item for item in zip(catalog_data['pmra'][pm_mask_fitting],
                                 catalog_data['pmdec'][pm_mask_fitting])
        ]
        gm_model.fit(data_to_fit)

    # Dump the fitted model to a pickle file, and also return it
    if mag_cutoff is not None:
        with open(output_filename_prefix + "_maglessthan_" +\
                      str(mag_cutoff) + ".pkl","wb") as f:
            pickle.dump(gm_model_bright, f)
        with open(output_filename_prefix + "_maggreaterthan_" +\
                      str(mag_cutoff) + ".pkl","wb") as f:
            pickle.dump(gm_model_dim, f)
        return ((gm_model_bright, gm_model_dim), catalog_data)
    else:
        with open(output_filename_prefix + ".pkl", "wb") as f:
            pickle.dump(gm_model, f)
        return (gm_model, catalog_data)
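A hedged usage sketch for read_catalog_in_and_fit; the catalog file name and magnitude cutoff are hypothetical. Because the first component is initialized near the cluster's proper motion, its responsibilities act as approximate membership probabilities.

(bright_model, dim_model), catalog = read_catalog_in_and_fit(
    "cluster_field_gaia_dr2.vot", "cluster_pm_fit", mag_cutoff=18.0)
pm_ok = (~np.isnan(catalog['pmra'])) & (~np.isnan(catalog['pmdec']))
member_prob = bright_model.predict_proba(
    list(zip(catalog['pmra'][pm_ok], catalog['pmdec'][pm_ok])))[:, 0]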
Example #28
def em_selection(X, y, title):
    lowest_bic = np.infty
    bic = []
    n_components_range = range(1, 10)
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            print(cv_type + ": " + str(n_components))
            # Fit a Gaussian mixture with EM
            gmm = GM(n_components=n_components, covariance_type=cv_type)
            gmm.fit(X)
            bic.append(gmm.bic(X))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm

    bic = np.array(bic)
    color_iter = itertools.cycle(
        ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
    clf = best_gmm
    bars = []

    # Plot the BIC scores
    plt.figure(figsize=(8, 6))
    spl = plt.subplot(1, 1, 1)
    for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
        xpos = np.array(n_components_range) + .2 * (i - 2)
        bars.append(
            plt.bar(xpos,
                    bic[i * len(n_components_range):(i + 1) *
                        len(n_components_range)],
                    width=.2,
                    color=color))
    plt.xticks(n_components_range)
    plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
    # plt.title('BIC score per model')
    params = best_gmm.get_params()
    plt.title(title + ' Best EM: ' + params['covariance_type'] + ' model, ' +
              str(params['n_components']) + ' components')
    xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
        .2 * np.floor(bic.argmin() / len(n_components_range))
    plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
    spl.set_xlabel('Number of components')
    spl.set_ylabel("BIC Score")
    spl.legend([b[0] for b in bars], cv_types)

    # # Plot the winner
    # splot = plt.subplot(2, 1, 2)
    # Y_ = clf.predict(X)
    # print(clf.covariances_.shape)
    # for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
    #                                         color_iter)):
    #     #v, w = linalg.eigh(cov)
    #     if not np.any(Y_ == i):
    #         continue
    #     plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)

    #     # Plot an ellipse to show the Gaussian component
    #     #angle = np.arctan2(w[0][1], w[0][0])
    #     #angle = 180. * angle / np.pi  # convert to degrees
    #     #v = 2. * np.sqrt(2.) * np.sqrt(v)
    #     #ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
    #     #ell.set_clip_box(splot.bbox)
    #     #ell.set_alpha(.5)
    #     #splot.add_artist(ell)

    # plt.xticks(())
    # plt.yticks(())

    # plt.subplots_adjust(hspace=.35, bottom=.02)

    d = plotsdir + "/" + title
    if not os.path.exists(d):
        os.makedirs(d)

    plt.savefig(d + "/EM_BestFit.png")
Example #29
    tnsr_learn = tnsr[~bad_data, :].reshape((-1, tnsr.shape[-1]))
    #tnsr_learn = tnsr.reshape((-1, tnsr.shape[-1]))
    print(tnsr_learn.shape)
    tnsr_learn = shuffle(tnsr_learn)

    predictor = MiniBatchKMeans(n_clusters=7,
                                batch_size=1000000,
                                compute_labels=False).fit(tnsr_learn)

    dump(predictor, open('predictor.pkl', 'wb'))
    np.save('tnorm.npy', tnorm)

    cc = np.array(predictor.cluster_centers_)
    print(cc)
    gm = GM(cc.shape[0], max_iter=10, means_init=cc, tol=0.01)
    gm.fit(shuffle(tnsr_learn)[:(4000000 if mode_size == 'f' else 2000000)])
    print('gm')
    dump(gm, open('gm.pkl', 'wb'))
    system("say 'learning done'")
    if mode == 'fp':
        tnsr = tnsr_or
    else:
        exit(0)

tnorm = np.load('tnorm.npy')
#predictor = load(open('predictor.pkl','rb'))
gm = load(open('gm.pkl', 'rb'))
Ncc = len(gm.weights_)
prob_pred = np.empty(tnsr.shape[:-1] + (Ncc, ), dtype=np.float32)
                                 random_state=n_clusters)),
                         ("forest_clf", RFC(n_estimators=150,
                                            random_state=42))])
    pipeline.fit(X_train_pca, y_train)
    # print(n_clusters, pipeline.score(X_valid_pca, y_valid))

X_train_extended = np.c_[X_train_pca, X_train_reduced]
X_valid_extended = np.c_[X_valid_pca, X_valid_reduced]
X_test_extended = np.c_[X_test_pca, X_test_reduced]

rfc = RFC(n_estimators=150, random_state=42)
rfc.fit(X_train_extended, y_train)
# print(rfc.score(X_valid_extended, y_valid)) # 0.825

from sklearn.mixture import GaussianMixture as GM
gm = GM(n_components=40, random_state=42)
y_pred = gm.fit_predict(X_train_pca)

n_gen_faces = 20
gen_faces_reduced, y_gen_faces = gm.sample(n_samples=n_gen_faces)
gen_faces = pca.inverse_transform(gen_faces_reduced)
# plot_faces(gen_faces, y_gen_faces)

n_rotated = 4
rotated = np.transpose(X_train[:n_rotated].reshape(-1, 64, 64), axes=[0, 2, 1])
rotated = rotated.reshape(-1, 64 * 64)
y_rotated = y_train[:n_rotated]

n_flipped = 3
flipped = X_train[:n_flipped].reshape(-1, 64,
                                      64)[:, ::-1]  # x[:, ::-1] <- reverses each column's order (vertical flip)