Ejemplo n.º 1
0
def pipeline(data_file,
             cords_file,
             results_dir,
             n_components=50,
             n_components_plot=5):
    """
    Pipeline for generating PCA plots from data file containing features.

    The steps are: read the features table, fit an IncrementalPCA on the
    feature columns, write the labelled transformed coordinates to
    ``cords_file``, save the explained variances to "variances.txt" inside
    ``results_dir`` and plot combinations of the leading principal
    components.

    :param data_file: Data file containing features matrix. Must have one
    column with labels named "label" and features in columns "aaaaaa" to
    "tttttt" in header. Include directory and extension. It is read with
    ``pd.read_table``.
    :param cords_file: File to store transformed coordinates to, including
    directory and extension. Written tab-separated and without the index.
    :param results_dir: Directory to store the variances file and scatter
    plots to. The variances file path is built with ``os.path.join``, so a
    trailing path separator is optional for it; the value is handed to
    ``plot_combinations`` unchanged.
    :param n_components: Number of components to reduce to (default = 50).
    :param n_components_plot: Number of principal components to generate
    combinations to plot (default = 5).
    :return: None.
    """
    # Imported locally so this function does not depend on a module-level
    # "import os" being present in the file.
    import os

    # Load in data file
    print("Loading data file...")
    data_frame = pd.read_table(data_file)
    # Get features used in PCA (label-based column slice, both ends included)
    features_df = data_frame.loc[:, "aaaaaa":"tttttt"]
    # Get labels
    labels_df = data_frame.loc[:, "label"]
    # Unique labels in sorted order, so the label order passed to the
    # plotting step is deterministic.
    labels_list = sorted(set(labels_df))

    # Create the PCA
    print("Creating PCA...")
    ipca = IncrementalPCA(n_components=n_components)
    features_transformed = ipca.fit_transform(features_df)

    # Label the transformed coordinates
    transformed_df = pca.label_coordinates(
        transformed_coordinates=features_transformed, labels=labels_df)

    # Save the transformed coordinates
    print("Saving transformed coordinates...")
    transformed_df.to_csv(cords_file, sep='\t', index=False)

    # Save the explained variances. Joining the path (instead of
    # concatenating strings) keeps the file inside results_dir even when the
    # directory is given without a trailing separator.
    print("Saving explained variances...")
    variances_file = os.path.join(results_dir, "variances.txt")
    pca.save_variances(pca=ipca, file_name=variances_file)

    # Create combinations of principal components to plot (1-based numbers)
    components = range(1, n_components_plot + 1)
    combinations_list = generate_combinations(components)

    # Plot all the combinations of components. Choose colors list with colors
    # in matplotlib.
    plot_combinations(transformed_df,
                      results_dir=results_dir,
                      labels_list=labels_list,
                      combinations_list=combinations_list,
                      colors_list=[
                          "saddlebrown", "gold", "darkgreen", "cyan",
                          "darkblue", "magenta"
                      ],
                      ipca=ipca)
Ejemplo n.º 2
0
def test_generate_combinations_4():
    """Check that n=3 on the string "abcd" returns the four expected triples."""
    expected_triples = [
        ('a', 'b', 'c'),
        ('a', 'b', 'd'),
        ('a', 'c', 'd'),
        ('b', 'c', 'd'),
    ]
    result = sp.generate_combinations("abcd", n=3)
    assert result == expected_triples
Ejemplo n.º 3
0
def test_generate_combinations_3():
    """Check that n=3 on the two-character string "ab" returns an empty list."""
    result = sp.generate_combinations("ab", n=3)
    assert result == []
Ejemplo n.º 4
0
def test_generate_combinations_2():
    """Check that [1, 2, 3] with no explicit n returns the three expected pairs."""
    expected_pairs = [(1, 2), (1, 3), (2, 3)]
    result = sp.generate_combinations([1, 2, 3])
    assert result == expected_pairs
Ejemplo n.º 5
0
def test_generate_combinations_1():
    """Check that an empty input list returns an empty list."""
    result = sp.generate_combinations([])
    assert result == []