Example #1
def PComponent_(train_Set, test_Set, var_Threshold=None, components=None):
    if var_Threshold is None and components is None:
        print(
            "please give a threshold for PComponent - either var threshold or components"
        )
        quit()
    if var_Threshold is not None and components is not None:
        print("give only one threshold")
        quit()
    if var_Threshold is not None:
        pca = PCA()
        pca.fit(train_Set)

        # explained variance ratio in percentage
        explain_Variance = around(pca.explained_variance_ratio_, decimals=4)
        explain_Variance = [x * 100 for x in explain_Variance.tolist()]

        # cumulative variance
        temp = 0
        for x in range(len(explain_Variance)):
            explain_Variance[x] = temp + explain_Variance[x]
            temp = explain_Variance[x]
        # count the components whose cumulative variance stays below the threshold
        explain_Variance = [x for x in explain_Variance if x < var_Threshold]
        n_components = len(explain_Variance)
        pca = PCA(n_components=n_components)
        return pca.fit_transform(train_Set), pca.transform(test_Set)
    else:
        pca = PCA(n_components=components)
        return pca.fit_transform(train_Set), pca.transform(test_Set)
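A minimal usage sketch for the helper above (not part of the original snippet; the array shapes and the 95% threshold are made up for illustration, and the globals the function relies on are assumed to be NumPy's around and scikit-learn's PCA):

import numpy as np
from numpy import around
from sklearn.decomposition import PCA

# hypothetical feature matrices
train = np.random.rand(100, 20)
test = np.random.rand(25, 20)

# keep the components whose cumulative explained variance stays below 95%
train_red, test_red = PComponent_(train, test, var_Threshold=95)
# or request a fixed number of components instead
train_5, test_5 = PComponent_(train, test, components=5)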
Example #2
def classify_for_benchmark(data_set_df, user_info_df, features, label='gender', classifier=None, num=None):
    instance_num = len(data_set_df.columns)
    x = data_set_df.loc[features]
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)

    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1)
    x_replaced = x.replace([np.inf, -np.inf], np.nan)
    x_imp = imp.fit_transform(x_replaced)  # fit the imputer before transforming

    y = user_info_df.get(label)
    y_filtered = y[(map(int, x.columns.values))]

    clf = nb.BernoulliNB() if classifier is None else classifier
    cv_num = min(len(y_filtered), 10)
    if cv_num <= 1 or len(y_filtered.unique()) <= 1:
        return 0.0, 100.0
    else:
        final_score = 0.0
        for i in range(100):
            score = 0.0
            cnt = 0
            skf = StratifiedKFold(y_filtered, n_folds=cv_num, shuffle=True)
            for tr_index, te_index in skf:
                x_train, x_test = x_imp.T[tr_index], x_imp.T[te_index]
                y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]
                min_num = min(len(x_train), len(x_train.T), len(x_test), len(x_test.T), num)
                pca = PCA(min_num)
                x_train = pca.fit_transform(x_train)
                x_test = pca.transform(x_test)  # project the test fold with the PCA fitted on the training fold

                try:
                    clf.fit(x_train, y_train)
                    score += clf.score(x_test, y_test)
                    cnt += 1
                    # cv_score = cross_validation.cross_val_score(clf, x_imp.T, y_filtered, cv=cv_num)
                except ValueError:
                    traceback.print_exc()
                    print i, "why error? skip!"
            if cnt > 0:
                score /= cnt
                print i, score
            else:
                return 0.0, (float(instance_num - len(y_filtered)) / instance_num)
            final_score += score
        final_score /= 100
        miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
        return final_score, miss_clf_rate
Example #3
 def cross_validate(self, train_x, train_y, test_x, test_y, **params):
     if not params:
         params = {"dummy": [0]}
     keys, values = list(zip(*list(params.items())))
     for param_list in itertools.product(*values):
         cv_params = list(self.params.items()) + list(zip(keys, param_list))
         for use_pca in (False, True):
             if self.have_tested(cv_params, use_pca):
                 continue
             if use_pca:
                 pca = PCA(n_components=0.99)
                 proc_train_x = pca.fit_transform(train_x)
                 proc_test_x = pca.transform(test_x)
             else:
                 proc_train_x = train_x
                 proc_test_x = test_x
             if "dummy" in params:
                 model = self.func().fit(proc_train_x, train_y)
             else:
                 model = self.func(**dict(cv_params)).fit(
                     proc_train_x, train_y)
             predictions = model.predict_proba(proc_test_x)
             if len(predictions.shape) == 2:
                 predictions = predictions[:, 1]
             num_right = (test_y == predictions.round()).sum()
             self.json["tests"].append({})
             test_data = self.json["tests"][-1]
             test_data["use_pca"] = use_pca
             test_data["pct_right"] = 100 * num_right / float(len(test_y))
             test_data["loss"] = log_loss(test_y, predictions)
             test_data["num_right"] = num_right
             test_data["num_tests"] = len(test_y)
             test_data["params"] = dict(cv_params)
             self._write()
             print((self.print_test(test_data)))
Example #4
    def write_predictions(self, model):
        if not os.path.exists(self.pred_dir):
            os.mkdir(self.pred_dir)

        raw_train_x, train_y = features_labels(self.season + 1)
        scaler = StandardScaler()

        train_x = scaler.fit_transform(raw_train_x)
        pca = PCA()
        if model.json.get("use_pca", False):
            train_x = pca.fit_transform(train_x)

        clf = model.func(**model.best_params()["params"]).fit(train_x, train_y)

        features, ids = self.get_features_and_ids()

        features = scaler.transform(features)
        if model.json.get("use_pca", False):
            features = pca.transform(features)

        predictions = clf.predict_proba(features)
        if len(predictions.shape) == 2:
            predictions = predictions[:, 1]

        with open(self.pred_path, 'w') as buff:
            buff.write("id,pred\n")
            for (label, pred) in zip(ids, predictions):
                buff.write("{:s},{:s}\n".format(label, str(pred)))
Example #5
class PCACCLayer(Layer):
    def __init__(self, n_out):
        self.pca = PCA(n_components=n_out)

    def get_train_output_for(self, inputX):
        batches, n_in, rows, cols = inputX.shape
        # normalize
        # inputX = norm4d(inputX)
        # inputX, self.P1 = whiten4d(inputX)
        myUtils.visual.save_map(inputX[[10, 100, 1000]], dir_name, 'norm4d')
        inputX = inputX.transpose((0, 2, 3, 1)).reshape((-1, n_in))
        outputX = self.pca.fit_transform(inputX)
        outputX = outputX.reshape((batches, rows, cols, -1)).transpose(
            (0, 3, 1, 2))
        myUtils.visual.save_map(outputX[[10, 100, 1000]], dir_name, 'pca')
        return outputX

    def get_test_output_for(self, inputX):
        batches, n_in, rows, cols = inputX.shape
        # normalize
        # inputX = norm4d(inputX)
        # inputX = whiten4d(inputX, self.P1)
        myUtils.visual.save_map(inputX[[10, 100, 1000]], dir_name, 'norm4dte')
        inputX = inputX.transpose((0, 2, 3, 1)).reshape((-1, n_in))
        outputX = self.pca.transform(inputX)
        outputX = outputX.reshape((batches, rows, cols, -1)).transpose(
            (0, 3, 1, 2))
        myUtils.visual.save_map(outputX[[10, 100, 1000]], dir_name, 'pcate')
        return outputX
Example #6
def recommender_system_using_svd_pca(user_id, user_movies, movie_tag_vector,
                                     genome_tags, model):
    movies_watched = list(user_movies[user_id])

    movies_watched_tags = {}
    for movie in movies_watched:
        movie_tags = movie_tag_vector[movie]
        for tag in list(movie_tags.keys()):
            movies_watched_tags[tag] = 1
    tags = list(movies_watched_tags.keys())

    # tags = list(genome_tags.keys())
    all_movies = list(movie_tag_vector.keys())
    all_movies, movie_tag_matrix = build_movie_tag_matrix(
        all_movies, tags, movie_tag_vector)

    if model == 'PCA':
        pca = PCA(n_components=min(10, len(tags)))
        U = pca.fit_transform(movie_tag_matrix)
    else:
        U, S, Vt = np.linalg.svd(movie_tag_matrix, full_matrices=False)

    watched_indexed, U_watched, rest_indexed, U_rest \
        = split_output(U, movies_watched, all_movies)

    similarity_mapping = get_similarity_mapping(watched_indexed, U_watched,
                                                rest_indexed, U_rest)

    weighted_similarities = weigh_similarities(user_id, similarity_mapping)

    return weighted_similarities
Example #7
def build_images_KMeans(spectra,
                        spectrum_columns,
                        spectra_distances,
                        TSNE_learning_rate=500,
                        TSNE_n_iter=1500,
                        TSNE_learning_rate2=300):
    colors = ['red', 'black']
    c = spectra['marked'].apply(lambda x: colors[x])

    plt.subplots(figsize=(18, 6))
    plt.subplot(131)
    plt.title("PCA")
    pca = PCA(n_components=2, random_state=42)
    spectra_2D = pca.fit_transform(spectra[spectrum_columns])
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)

    plt.subplot(132)
    plt.title("TSNE, Euclidean distance")
    tsne = TSNE(n_components=2,
                random_state=42,
                learning_rate=TSNE_learning_rate,
                n_iter=TSNE_n_iter)
    spectra_2D = tsne.fit_transform(spectra[spectrum_columns])
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)

    plt.subplot(133)
    plt.title("TSNE, Chosen distance")
    tsne = TSNE(n_components=2,
                random_state=42,
                metric="precomputed",
                learning_rate=TSNE_learning_rate2)
    spectra_2D = tsne.fit_transform(spectra_distances)
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)
    plt.show()

    # visualization - tsne with chosen distance
    print('Clustering')
    plt.subplots(figsize=(18, 12))
    plt.subplot(3, 3, 1)

    colors = ['red', 'black']
    c = spectra['marked'].apply(lambda x: colors[x])
    plt.title("true labels")
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)

    colors = [
        'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
        'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan'
    ]

    for n in range(2, 10):
        kmeans = cluster.KMeans(n_clusters=n, random_state=42)
        cluster_labels = kmeans.fit_predict(spectra_distances)

        plt.subplot(3, 3, n)
        c = [colors[l] for l in cluster_labels]
        plt.title("cluster labels ({} clusters)".format(n))
        plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)

    plt.show()
Example #8
class LogisticClassifier(object):
    def __init__(self, learning_rate=0.01, reg=0., momentum=0.5):
        self.classifier = LogisticRegression(learning_rate, reg, momentum)
        self.pca = None
        self.scaler = None

    def sgd_optimize(self, data, n_epochs, mini_batch_size):
        data = self._preprocess_data(data)
        sgd_optimization(data, self.classifier, n_epochs, mini_batch_size)

    def _preprocess_data(self, data):
        # center data and scale to unit std
        if self.scaler is None:
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
        else:
            data = self.scaler.transform(data)

        if self.pca is None:
            # use Minka's MLE to guess an appropriate dimensionality
            self.pca = PCA(n_components='mle')
            data = self.pca.fit_transform(data)
        else:
            data = self.pca.transform(data)

        return data
Example #9
class EnsembleModel:
    def __init__(self, models, **params):
        self.models = models.values()
        self.model_funcs = [j.model for j in models.values()]
        self.params = params
        self._pca = PCA(n_components=0.99)
        self._clf = None

    def fit(self, x, y):
        train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
        pca_train_x = self._pca.fit_transform(train_x)
        for model, model_func in zip(self.models, self.model_funcs):
            # give each model the PCA-projected or the raw training features
            # without overwriting train_x for the models that follow
            fit_x = pca_train_x if model.json.get("use_pca", False) else train_x
            model_func.fit(fit_x, train_y)
        # the meta estimator takes raw features; _predictions applies the PCA itself
        self._fit_meta_estimator(test_x, test_y)
        return self

    def _fit_meta_estimator(self, x, y):
        predictions = self._predictions(x).T
        y = numpy.atleast_2d(y).T
        labels = numpy.argmin(abs(predictions - y * numpy.ones((1, predictions.shape[1]))), 1)
        self._clf = GaussianNB().fit(x, labels)

    def _predictions(self, x):
        pca_x = self._pca.transform(x)
        predictions = []
        weights = []

        for model, model_func in zip(self.models, self.model_funcs):
            if model.json.get("use_pca", False):
                test_x = pca_x
            else:
                test_x = x
            predictions.append(model_func.predict_proba(test_x)[:, 1])
            weights.append(model.best_params()["loss"])
        return numpy.array(predictions)

    def predict_proba(self, x):
        blend = self.params.get("blend", "mean")
        predictions = self._predictions(x)
        if blend == "median":
            return numpy.median(predictions, 0)
        if blend == "meta":
            probs = self._clf.predict_proba(x)
            preds = []
            for row, prob in zip(predictions.T, probs):
                if max(prob) > 0.99:
                    preds.append(row[numpy.argmax(prob)])
                else:
                    preds.append(numpy.median(row))
            return numpy.array(preds)

        return predictions.mean(0)
Example #10
def plot_pca_2d(data, ax=None, fpath='', show_fig=False):
    ax = _get_ax(ax)

    pca_2d = PCA(n_components=2)
    data_hat = pca_2d.fit_transform(data)
    ax.scatter(data_hat[:, 0], data_hat[:, 1])
    _save_show_fig(fpath, show_fig)

    return pca_2d
Example #11
def test_pass_pca_corr_pca_out():
    X, y = iris_data()
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    eigen = pca.explained_variance_

    plot_pca_correlation_graph(X,
                               variables_names=['1', '2', '3', '4'],
                               X_pca=X_pca,
                               explained_variance=eigen)
Example #12
def pca_scatter2d(data: np.ndarray,
                  labels: Optional[np.ndarray] = None,
                  label_mapping: Optional[Mapping[int, str]] = None,
                  ax=None,
                  fpath='',
                  show_fig=False,
                  title=None,
                  **kwargs):

    pca_2d = PCA(n_components=2)
    data_hat = pca_2d.fit_transform(data)
    scatter2d(data_hat, labels, label_mapping, ax, fpath, show_fig, title,
              **kwargs)
Example #13
def build_images_KMeans(spectra, spectrum_columns, spectra_distances, colors, TSNE_learning_rate=500, TSNE_n_iter=1500, TSNE_learning_rate2=300):
    colors_m = ['red','black']
    
    cols = spectra['marked'].apply(lambda x: colors[x])
    col = spectra['marked']

    plt.subplots(figsize=(18, 6))
    plt.subplot(131)
    plt.title("PCA")
    pca = PCA(n_components=2, random_state=42)
    spectra_2D = pca.fit_transform(spectra[spectrum_columns])
    for i in range(len(spectra_2D)):
        #print(i)
        #print(spectra_2D[i, 0])
        #print(spectra_2D[i, 1])
        #print(cols[i])
        #print(col[i])
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]], s=markersizes[col[i]])

    plt.subplot(132)
    plt.title("TSNE, Euclidean distance")
    tsne = TSNE(n_components=2, random_state=42, learning_rate=TSNE_learning_rate, n_iter=TSNE_n_iter)
    spectra_2D = tsne.fit_transform(spectra[spectrum_columns])
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]], s=markersizes[col[i]])

    plt.subplot(133)
    plt.title("TSNE, Chosen distance")
    tsne = TSNE(n_components=2, random_state=42, metric="precomputed", learning_rate=TSNE_learning_rate2)
    spectra_2D = tsne.fit_transform(spectra_distances)
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]],s=markersizes[col[i]])
    
    # visualization - tsne with chosen distance
    print('Clustering')
    plt.subplots(figsize=(18, 12))

    for n in range(2, 10):
        kmeans = cluster.KMeans(n_clusters=n, random_state=42)
        cluster_labels = kmeans.fit_predict(spectra_distances)

        plt.subplot(3, 3, n-1)
        cols = [colors[l] for l in cluster_labels]
        plt.title("cluster labels ({} clusters)".format(n))
        for i in range(len(col)):
            plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]], s=markersizes[col[i]])
    
    plt.show()
    return spectra_2D
Example #14
    def compute_pca(self, model):
        X = model.wv.vectors
        #self.plot_matrix(X, 'Before')
        scaler = preprocessing.StandardScaler()
        X_scaled = scaler.fit_transform(X)
        #self.plot_matrix(X_scaled, 'After')

        pca = PCA(n_components=2)
        principalComponents = pca.fit_transform(X_scaled)

        print(pca.explained_variance_ratio_)
        self.scatterplot(principalComponents)

        return principalComponents
Example #15
def test_X_PCA_but_no_explained_variance():
    with pytest.raises(
            ValueError,
            match='If `X_pca` is not None, the `explained variance` '
            'values should not be `None`.'):

        X, y = iris_data()
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)

        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca,
                                   explained_variance=None)
Example #16
def apply_pca(X_train, X_test, pca_thresh):
    """ apply principal component analysis to reduce dimensionality of feature
    vectors"""
    feature_labels = X_train.columns
    pca = PCA(n_components=pca_thresh)
    shape_orig = X_train.shape
    X_train = pca.fit_transform(X_train)
    shape_reduced = X_train.shape
    X_test = pca.transform(X_test)
    logging.info("reduced dimensionality from {} to {}".format(
        shape_orig, shape_reduced))
    rows = ["PC-{}".format(i) for i in range(len(pca.components_))]
    pcs = pd.DataFrame(pca.components_, columns=feature_labels, index=rows)
    return X_train, X_test, pcs
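A minimal usage sketch for apply_pca (not from the original source; the DataFrames and the 0.9 threshold are invented for illustration, and pandas, NumPy, logging and scikit-learn's PCA are assumed to be imported as the function expects):

import logging
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

logging.basicConfig(level=logging.INFO)

# hypothetical feature tables
X_train = pd.DataFrame(np.random.rand(50, 8), columns=[f"f{i}" for i in range(8)])
X_test = pd.DataFrame(np.random.rand(10, 8), columns=X_train.columns)

# a float threshold keeps enough components to explain 90% of the variance
X_train_red, X_test_red, pcs = apply_pca(X_train, X_test, pca_thresh=0.9)
print(pcs.round(2))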
Example #17
def find_distributions(query_areas, query_energies, binned_image,
                       n_classes=2,
                       bimages_path='/home/fin/Documents/Timepix/particle-tk/datagen/images.pkl',
                       segments_path='/home/fin/Documents/Timepix/particle-tk/datagen/segments.pkl'):

    # Load binned images and segments
    b_im = pkl.load(open(bimages_path, 'rb'))
    segments = pkl.load(open(segments_path, 'rb'))
    reductor = PCA(n_components=3)
    b_im = reductor.fit_transform(b_im)
    queried_binned_image = reductor.transform(binned_image.reshape(1,-1))

    areas = [[] for i in range(0,n_classes)]
    pixel_energies = [[] for i in range(0,n_classes)]
    binned_images = [[] for i in range(0,n_classes)]
    binned_images_energies = [[] for i in range(0,n_classes)]

    for segment in segments:
        for lbl in range(1,n_classes+1):
            if segment.get_metadata('label') == lbl:
                areas[lbl-1].append(area(segment.get_bitmap()))
                nonzeroE = segment.get_bitmap().flatten()[segment.get_bitmap().flatten() > 0]
                for e in nonzeroE:
                    pixel_energies[lbl-1].append(e)
                    binned_images_energies[lbl-1].append(b_im[segment.get_metadata('parent_im_id')])
                binned_images[lbl-1].append(b_im[segment.get_metadata('parent_im_id')])
                break

    # Estimation of size density given image
    sizes = list() # for each particle type one array of size
    sizes.append(np.linspace(0,20,100))
    sizes.append(np.linspace(0,10,100))
    energies = list()
    energies.append(np.linspace(0,400,100))
    energies.append(np.linspace(0,400,100))
    p_SgX = list()
    p_EgX = list()

    for lbl in range(1,n_classes+1):
        print(areas[lbl-1])
        estimator_P_SgX = estimate_P_SgX(areas[lbl-1], binned_images[lbl-1])
        estimator_P_EgX = estimate_P_SgX(pixel_energies[lbl-1], binned_images_energies[lbl-1])
        p_SgX.append(estimator_P_SgX.score_samples(query_areas[lbl-1,:],
                                                   np.repeat(np.atleast_2d(queried_binned_image),
                                                             query_areas[lbl-1,:].shape[0], axis=0)))
        p_EgX.append(estimator_P_EgX.score_samples(query_energies[lbl-1,:],
                                                   np.repeat(np.atleast_2d(queried_binned_image),
                                                             query_energies[lbl-1,:].shape[0], axis=0)))
    return np.array(p_SgX), np.array(p_EgX)
Example #18
def test_not_enough_components():
    s = (
        'Number of principal components must match the number of eigenvalues. Got 2 != 1'
    )
    with pytest.raises(ValueError, match=s):

        X, y = iris_data()
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        eigen = pca.explained_variance_

        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca,
                                   explained_variance=eigen[:-1])
Example #19
def do_pca(X, c=3):
    """Do PCA"""

    from sklearn import preprocessing
    from sklearn.decomposition import PCA
    #do PCA
    #S = standardize_data(X)
    S = pd.DataFrame(preprocessing.scale(X),columns = X.columns)
    pca = PCA(n_components=c)
    pca.fit(S)
    print (pca.explained_variance_ratio_)
    #print pca.components_
    w = pd.DataFrame(pca.components_,columns=S.columns)#,index=['PC1','PC2'])
    #print w.T.max(1).sort_values()
    pX = pca.fit_transform(S)
    pX = pd.DataFrame(pX,index=X.index)
    return pX
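A minimal usage sketch for do_pca (not from the original source; the random DataFrame is made up for illustration, and pd is assumed to be pandas imported at module level):

import numpy as np
import pandas as pd

# hypothetical numeric feature table
df = pd.DataFrame(np.random.rand(40, 6), columns=list("abcdef"))
pX = do_pca(df, c=3)  # DataFrame holding the first three principal components
print(pX.head())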
Example #20
def build_images_DBSCAN(spectra, spectrum_columns, spectra_distances, colors, eps_l=[0.01 * i for i in range(12, 0, -2)], TSNE_learning_rate=500, TSNE_n_iter=1500, TSNE_learning_rate2=300):
    markers = ['x', 'o']
    cols = spectra['marked'].apply(lambda x: colors[x])
    col = spectra['marked']

    plt.subplots(figsize=(18, 6))
    plt.subplot(131)
    plt.title("PCA")
    pca = PCA(n_components=2, random_state=42)
    spectra_2D = pca.fit_transform(spectra[spectrum_columns])
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]], s=markersizes[col[i]])

    plt.subplot(132)
    plt.title("TSNE, Euclidean distance")
    tsne = TSNE(n_components=2, random_state=42, learning_rate=TSNE_learning_rate, n_iter=TSNE_n_iter)
    spectra_2D = tsne.fit_transform(spectra[spectrum_columns])
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]], s=markersizes[col[i]])

    plt.subplot(133)
    plt.title("TSNE, Chosen distance")
    tsne = TSNE(n_components=2, random_state=42, metric="precomputed", learning_rate=TSNE_learning_rate2)
    spectra_2D = tsne.fit_transform(spectra_distances)
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]], s=markersizes[col[i]])
    plt.show()
    
    
    # visualization - tsne with chosen distance
    print('Clustering')
    plt.subplots(figsize=(18, 18))

    for i, eps in enumerate(eps_l):
        dbscan = cluster.DBSCAN(eps=eps, min_samples=4)
        cluster_labels = dbscan.fit_predict(spectra_distances)

        plt.subplot(3, 3, i + 1)
        cols = [colors[l] for l in cluster_labels]
        plt.title("cluster labels (eps = {:.2})".format(eps))
        for j in range(len(col)):
            plt.scatter(spectra_2D[j, 0], spectra_2D[j, 1], c=cols[j], alpha=0.5, marker=markers[col[j]], s=markersizes[col[j]])

    plt.show()
    return spectra_2D
Example #21
def test_pipeline_transform():
    # Test whether pipeline works with a transformer at the end.
    # Also test pipeline.transform and pipeline.inverse_transform
    iris = load_iris()
    X = iris.data
    pca = PCA(n_components=2)
    pipeline = Pipeline([('pca', pca)])

    # test transform and fit_transform:
    X_trans = pipeline.fit(X).transform(X)
    X_trans2 = pipeline.fit_transform(X)
    X_trans3 = pca.fit_transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)

    X_back = pipeline.inverse_transform(X_trans)
    X_back2 = pca.inverse_transform(X_trans)
    assert_array_almost_equal(X_back, X_back2)
Example #22
def test_pipeline_transform():
    # Test whether pipeline works with a transformer at the end.
    # Also test pipeline.transform and pipeline.inverse_transform
    iris = load_iris()
    X = iris.data
    pca = PCA(n_components=2)
    pipeline = Pipeline([('pca', pca)])

    # test transform and fit_transform:
    X_trans = pipeline.fit(X).transform(X)
    X_trans2 = pipeline.fit_transform(X)
    X_trans3 = pca.fit_transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)

    X_back = pipeline.inverse_transform(X_trans)
    X_back2 = pca.inverse_transform(X_trans)
    assert_array_almost_equal(X_back, X_back2)
Example #23
def do_pca(X, c=3):
    """Do PCA"""

    from sklearn import preprocessing
    from sklearn.decomposition import PCA
    #do PCA
    #S = standardize_data(X)
    #remove non numeric
    X = X._get_numeric_data()
    S = pd.DataFrame(preprocessing.scale(X), columns=X.columns)
    pca = PCA(n_components=c)
    pca.fit(S)
    out = 'explained variance %s' % pca.explained_variance_ratio_
    print(out)
    #print pca.components_
    w = pd.DataFrame(pca.components_, columns=S.columns)
    #print w.T.max(1).sort_values()
    pX = pca.fit_transform(S)
    pX = pd.DataFrame(pX, index=X.index)
    return pX, pca
Example #24
    def plot(self):
        pca = PCA(n_components=2)
        df = pca.fit_transform(self.data.values)
        df = pd.DataFrame(df, index=self.data.index)
        ids = sorted(list(self.data.columns))

        df = df.reset_index()
        fig = plt.figure()
        ax = sns.scatterplot(x=0, y=1, data=df, hue='class', edgecolor='black')
        sns.despine(fig=fig, top=True, right=True)
        plt.xlabel('PC1 ({}%)'.format(round(pca.explained_variance_ratio_[0], 4) * 100))
        plt.ylabel('PC2 ({}%)'.format(round(pca.explained_variance_ratio_[1], 4) * 100))
        from functools import reduce
        annot = reduce(lambda x, y: f'{x}\n{y}', ids)
        plt.title("Silhouette Score={}".format(round(self.score, 3)))
        plt.annotate('Proteins\n' + annot, xycoords=ax.transAxes, xy=(1, 0.1))
        if self.filename is None:
            plt.show()
        else:
            plt.savefig(self.filename, dpi=300, bbox_inches='tight')
            print('Results saved to "{}"'.format(self.filename))
Example #25
class Classifier_pca(Layer):
    def __init__(self, C, n_times):
        self.C = C
        self.n_times = n_times
        self.pca = PCA(n_components=0.99)

    def get_train_output_for(self, inputX, inputy=None):
        inputX = self.pca.fit_transform(inputX)
        n_hidden = int(self.n_times * inputX.shape[1])
        self.W = init.GlorotNormal().sample((inputX.shape[1], n_hidden))
        self.b = init.Normal().sample(n_hidden)
        H = dotbiasact_decomp(inputX, self.W, self.b)
        self.beta = compute_beta(H, inputy, self.C)
        out = dot_decomp(H, self.beta)
        return out

    def get_test_output_for(self, inputX):
        inputX = self.pca.transform(inputX)
        H = dotbiasact_decomp(inputX, self.W, self.b)
        out = dot_decomp(H, self.beta)
        return out
Example #26
 def compute_pca_explain_power(self, model, num_dims=2):
     '''
     Given a trained course-vectors embedding model,
     extract the vectors, standardize them, perform a
     PCA with num_dims components, and return the
     explained-variance ratio of each dimension.

     @param model: course context model as trained by the neural net
     @type model: gensim.model.word_vectors
     @return: ratio of explained variance for each dimension
     @rtype: (float,float)
     '''
     
     vectors = model.wv.vectors
     #********
     #vectors_standardized = preprocessing.scale(vectors)
     vectors_standardized = vectors
     vectors_standardized_normalized = preprocessing.normalize(vectors_standardized, norm='l2')
     #********
     pca = PCA(n_components=num_dims)
     _principalComponents = pca.fit_transform(vectors_standardized_normalized)
     explained_variance_ratios = pca.explained_variance_ratio_
     return explained_variance_ratios
Example #27
X_test = pd.read_csv('Data/test.csv', header=None).values
y = pd.read_csv('Data/trainLabels.csv', header=None)[0].values
X = pd.read_csv('Data/train.csv', header=None).values

pca2 = PCA(n_components=2, whiten=True)
pca2.fit(np.r_[X, X_test])
X_pca = pca2.transform(X)
i0 = np.argwhere(y == 0)[:, 0]
i1 = np.argwhere(y == 1)[:, 0]
X0 = X_pca[i0, :]
X1 = X_pca[i1, :]
plt.plot(X0[:, 0], X0[:, 1], 'ro')
plt.plot(X1[:, 0], X1[:, 1], 'b*')

pca = PCA(whiten=True)
X_all = pca.fit_transform(np.r_[X, X_test])
print (pca.explained_variance_ratio_)

def kde_plot(x):
    from scipy.stats import gaussian_kde
    kde = gaussian_kde(x)
    positions = np.linspace(x.min(), x.max())
    smoothed = kde(positions)
    plt.plot(positions, smoothed)

def qq_plot(x):
    from scipy.stats import probplot
    probplot(x, dist='norm', plot=plt)
    
kde_plot(X_all[:, 0])
kde_plot(X_all[:, 2])
Example #28
                   sep='\s+',
                   header=None,
                   names=cols)
#%%
X = data.iloc[:, 1:8].values
X = StandardScaler().fit_transform(X)
y = data.iloc[:, 8:]
#%%
data.groupby('Target')['Name'].nunique().plot(kind='bar')
plt.show()
#%%
from pandas.plotting import scatter_matrix
scatter_matrix(data, alpha=0.2, figsize=(12, 12), diagonal='kde')
#%%
model = PCA(n_components=3)
principle_comps = model.fit_transform(X)
#%%
principle_df = pd.DataFrame(data=principle_comps,
                            columns=["PC1", "PC2", "PC3"])
final_df = pd.concat([principle_df, y], axis=1)

#%%
new_y = y.values.tolist()
new_y = [entry[0] for entry in new_y]
#%%
labels = set(new_y)
print(labels)

#%%
sns.lmplot(x='PC1', y='PC2', data=final_df, hue='Target', fit_reg=False, height=10)
Example #29
                plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
                if i_dataset == 0:
                    plt.title(name, size=18)
        
#                 colours = numpy.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
#                                                      '#f781bf', '#a65628', '#984ea3',
#                                                      '#999999', '#e41a1c', '#dede00']),
#                                               int(max(y_pred) + 1))))
                
                #import random
                r = lambda: random.randint(0, 255)  # random.randint needs a lower and an upper bound
                colours = numpy.array([ '#%02X%02X%02X'%(r(),r(),r()) for i in range(max(y_pred)+1) ])
                print(colours)
                
                pca = PCA()
                to_plot = pca.fit_transform(X)

                #print(y_pred)
                c = Counter(y_pred)
                print(c.most_common())
                exp_mx.loc[:,"cluster"] = y_pred
                exp_mx.to_csv("exp_w_clusters.csv")
                
                
                plt.scatter(to_plot[:,0], to_plot[:,1], alpha=0.5, c=colours[y_pred])
        
#                 plt.xlim(-2.5, 2.5)
#                 plt.ylim(-2.5, 2.5)
                plt.xticks(())
                plt.yticks(())
                plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
Example #30
                df_info = pd.read_excel(os.path.join(d, f))
                df_info["Instrument"] = pd.Series([d] * len(df_info),
                                                  index=df_info.index)
            else:
                df = pd.read_excel(os.path.join(d, f))
                df["Instrument"] = pd.Series([d] * len(df), index=df.index)
                df_info = pd.concat([df_info, df], ignore_index=True)

# %% Apply PCA, plot some components, see variance explained etc

from sklearn.decomposition.pca import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

mdl = PCA()
new_data = mdl.fit_transform(df_seg)

p = df_info["Instrument"] == "Piano"
v = df_info["Instrument"] == "Violin"

plt.figure()
plt.scatter(new_data[p, 0], new_data[p, 1], label="Piano")
plt.scatter(new_data[v, 0], new_data[v, 1], label="Violin")
plt.legend()
plt.grid(True)
plt.savefig("pca_2d.png", dpi=300, transparent=True)

plt.figure()
plt.plot(mdl.explained_variance_)
plt.grid(True)
plt.savefig("explained_variance.png", dpi=300, transparent=True)
Example #31
df = pd.read_pickle(data_path / "1_koinworks_cleaned.pkl")
df = df[["id", "date", "username", "cleaned", "tweet", "name"]]
df["date"] = pd.to_datetime(df["date"])
print(f"before drop duplicate: {len(df)}")
df = df.drop_duplicates(subset=["cleaned"])
print(f"after drop duplicate: {len(df)}")
df.dropna(inplace=True)

# TFIDF embeddings
vectorizer = TfidfVectorizer()
pca = PCA(n_components=2, svd_solver="full")
um = UMAP(n_components=5, n_neighbors=15, metric='euclidean')
X = vectorizer.fit_transform(df.cleaned.values)
X_umap = um.fit_transform(X.toarray())
X_pca = pca.fit_transform(X.toarray())
df["pca"] = [a for a in X_pca]
um = UMAP(n_components=2, n_neighbors=15, metric='euclidean')
df["umap_2d"] = [a for a in um.fit_transform(X)]
df["umap"] = [a for a in X_umap]
df["tfidf"] = X
df.to_pickle(data_path / "2_koinworks_fix.pkl")
tweets = df.cleaned.values
x, y = train_test_split(tweets)
y_test, y_val = train_test_split(y)
with open(data_path / "flair_format/train/train.txt", "w") as f:
    for t in tweets:
        f.writelines(f"{t}\n")
with open(data_path / "flair_format/test.txt", "w") as f:
    for t in y_test:
        f.writelines(f"{t}\n")
Example #32
# eng = matlab.engine.start_matlab()
X=[]
for i in xrange(1,30):
    file=open('J_Left/'+`i`)
    mylist=[]
    x=0
    for line in file:
        line=line[:-1]
        temp=line.split(' ')
        for i in range(len(temp)-1):
#             print temp[i]
            mylist.append(float(temp[i]))
    mylist=mylist+[0]*(5000*9-len(mylist))
    X.append(mylist)
print 'len of X',len(X)
pca=PCA(n_components=4)
t=pca.fit_transform(X)
l=[]
for v in t:
    arr=[]
    for e in v:
        f=float(e)
        arr.append(f)
    l.append(arr)
# ret = eng.moh_pca(l)

print l
print type(l)
print 'len of t',len(t)

Example #33
gc = GridSearchCV(estimator=Lasso(), param_grid=gs_params)
lasso_model = gc.fit(X_train, Y_train)
Y_pred = lasso_model.predict(X_test)
best_alpha = lasso_model.best_params_['alpha']
print 'The best lasso model is obtained with an alpha of', best_alpha
print 'RMSE of the best standardized lasso regression model:', mean_squared_error(
    Y_test, Y_pred)**0.5

# split data into train, dev and test
# todo: do this before, so that we have errors measured on the same test set
for i in range(1, 9):
    n_components = 2**i
    #n_components = i
    pca = PCA(n_components=n_components)
    X_reduced_train = pca.fit_transform(X_train)
    X_reduced_test = pca.transform(X_test)

    vanilla_lr = LinearRegression()
    vanilla_lr = vanilla_lr.fit(X_reduced_train, Y_train)
    Y_pred = vanilla_lr.predict(X_reduced_test)
    print 'RMSE for ', n_components, ' components with LR ', mean_squared_error(
        Y_test, Y_pred)**0.5

    gs_params = {'alpha': [2**i for i in range(-10, 20)]}
    gc = GridSearchCV(estimator=Ridge(), param_grid=gs_params)
    ridge_model = gc.fit(X_reduced_train, Y_train)
    Y_pred = ridge_model.predict(X_reduced_test)
    print 'RMSE for ', n_components, ' components with Ridge ', mean_squared_error(
        Y_test, Y_pred)**0.5
# plt.show()
plt.draw()
# plt.savefig("some_digits.png")

### Preprocessing data ###

# Standardize features by scaling them to the range (0,1)
# Note: this standardization is often used as an alternative to
# zero mean, unit variance scaling (performed with sklearn.preprocessing.scale)
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
train_minmax = min_max_scaler.fit_transform(train)

# PCA
n_pc = 78  # principal components to keep
pca = PCA(n_components=n_pc, whiten=True)  # whitening to remove correlations
train_pca = pca.fit_transform(train_minmax)

# plot all the principal components with their relative explained variance
features = [x for x in range(1, n_pc + 1)]
plt.figure(2)
# percentage of variance explained by each of the selected components.
# The sum of explained variances is equal to 1.0
# plt.plot(features, pca.explained_variance_ratio_, 'g--', marker='o')
plt.semilogy(features, pca.explained_variance_ratio_, "g--", marker="o")
plt.axis([1, n_pc, 0, pca.explained_variance_ratio_.max()])
plt.grid(True)
plt.xlabel("principal components"), plt.ylabel("variance explained (log)")
plt.title("scree plot")
# plt.savefig("screeplot_" + str(n_pc) + "_PC.png")

### Train a SVM Classifier ###
Example #35
def bhtsne(vectors, vecs_with_center, args):
    # if args.bhtsne or not(args.timeline or args.bhtsne or args.wordclouds):
    # bhtnse

    pca = PCA(n_components=50)
    vectors = pca.fit_transform(vectors)

    print('Bhtsne..')
    Y = tsne(vectors, perplexity=args["tsne_perplexity"])
    pd.DataFrame(Y).to_csv('{}/bhtsne.csv'.format(args['path']))
    plt.scatter(Y[:, 0], Y[:, 1], s=0.3)
    plt.savefig('{}/bhtsne.svg'.format(args['path']), bbox_inches='tight')
    plt.savefig('{}/bhtsne.png'.format(args['path']), bbox_inches='tight')
    pd.DataFrame(Y).to_csv('{}/bhtsne_2d.csv'.format(args['path']))
    pvtm_utils.svg_to_pdf('{}/bhtsne.svg'.format(args['path']))
    plt.close()

    print('Bhtsne with center..')
    Y = tsne(vecs_with_center.values, perplexity=args["tsne_perplexity"])
    pd.DataFrame(Y).to_csv('{}/bhtsne_with_center.csv'.format(args['path']))
    plt.scatter(Y[:len(vectors), 0], Y[:len(vectors), 1], s=0.3)
    plt.scatter(Y[len(vectors):, 0],
                Y[len(vectors):, 1],
                s=0.8,
                c='r',
                marker='x')
    plt.savefig('{}/bhtsne_with_center.svg'.format(args['path']),
                bbox_inches='tight')
    plt.savefig('{}/bhtsne_with_center.png'.format(args['path']),
                bbox_inches='tight')
    pd.DataFrame(Y).to_csv('{}/bhtsne_with_center_2d.csv'.format(args['path']))

    pvtm_utils.svg_to_pdf('{}/bhtsne_with_center.svg'.format(args['path']))
    plt.close()

    print('3D tsne...')

    Y = tsne(vectors, dimensions=3, perplexity=args["tsne_perplexity"])
    fig = pyplot.figure(frameon=False, figsize=(8, 5))
    ax = Axes3D(fig)

    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], s=1, c='b', marker='^')
    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], s=20, c='r', marker='^')
    # pyplot.axis('off')
    xmax, ymax, zmax = Y[:len(vectors), 0].max(), Y[:len(vectors),
                                                    1].max(), Y[:len(vectors),
                                                                2].max()
    xmin, ymin, zmin = Y[:len(vectors), 0].min(), Y[:len(vectors),
                                                    1].min(), Y[:len(vectors),
                                                                2].min()

    ax.set_xlim(xmin + 4, xmax - 4)
    ax.set_ylim(ymin + 4, ymax - 4)
    ax.set_zlim(zmin + 4, zmax - 4)
    pyplot.savefig('{}/bhtsne_3d.svg'.format(args['path']),
                   bbox_inches='tight')
    pyplot.savefig('{}/bhtsne_3d.png'.format(args['path']),
                   bbox_inches='tight')
    pvtm_utils.svg_to_pdf('{}/bhtsne_3d.svg'.format(args['path']))

    pd.DataFrame(Y).to_csv('{}/bhtsne_3d.csv'.format(args['path']))

    Y = tsne(vecs_with_center.values,
             dimensions=3,
             perplexity=args["tsne_perplexity"])
    fig = pyplot.figure(frameon=False, figsize=(8, 5))
    ax = Axes3D(fig)

    ax.scatter(Y[:len(vectors), 0],
               Y[:len(vectors), 1],
               Y[:len(vectors), 2],
               s=1,
               c='b',
               marker='^')
    ax.scatter(Y[len(vectors):, 0],
               Y[len(vectors):, 1],
               Y[len(vectors):, 2],
               s=20,
               c='r',
               marker='^')
    # pyplot.axis('off')
    xmax, ymax, zmax = Y[:len(vectors), 0].max(), Y[:len(vectors),
                                                    1].max(), Y[:len(vectors),
                                                                2].max()
    xmin, ymin, zmin = Y[:len(vectors), 0].min(), Y[:len(vectors),
                                                    1].min(), Y[:len(vectors),
                                                                2].min()

    ax.set_xlim(xmin + 4, xmax - 4)
    ax.set_ylim(ymin + 4, ymax - 4)
    ax.set_zlim(zmin + 4, zmax - 4)
    pyplot.savefig('{}/bhtsne_with_center_3d.svg'.format(args['path']),
                   bbox_inches='tight')
    pyplot.savefig('{}/bhtsne_with_center_3d.png'.format(args['path']),
                   bbox_inches='tight')
    pvtm_utils.svg_to_pdf('{}/bhtsne_with_center_3d.svg'.format(args['path']))

    pd.DataFrame(Y).to_csv('{}/bhtsne_with_center_3d.csv'.format(args['path']))
Example #36
np.max(sel.variances_)


#plot
import matplotlib.pyplot as plt
plt.figure(figsize = (7,4))
plt.plot(sel.variances_)
plt.text(55,0.135,"0.135, 5th nt", style='italic',
        bbox={'facecolor':'red', 'alpha':0.5, 'pad':1})
plt.title("Cleavage site reactivity variance("+deg+")")
plt.show()

#generate files
for length in [21,71,121]:
    for deg in ["wt","xrn4"]:
        X = np.loadtxt("cs_datasets/cs_reactivity_" + deg +"_"+ str(length) + ".csv",delimiter = ",")
        #y = np.loadtxt("cs_efficiency/cs_efficiency_log_"+deg+".csv",delimiter = ",")
        model = PCA(length // 2)  # integer division so n_components is an int
        X2 = model.fit_transform(X)
        X2.shape
        sum( model.explained_variance_ratio_)
        #np.savetxt("cs_datasets/cs_reactivity_" + deg +"_"+ str(length) + "_pca2.csv",X2,delimiter=",",fmt="%0.7g")

a = [1,2]
a.index(1)