def pipeline(data_file, cords_file, results_dir, n_components=50,
             n_components_plot=5):
    """
    Pipeline for generating PCA plots from data file containing features.

    The steps are: load the table, fit an IncrementalPCA on the feature
    columns, write the labelled transformed coordinates to ``cords_file``,
    write the explained variances to ``variances.txt`` inside
    ``results_dir``, and finally plot every combination of the first
    ``n_components_plot`` principal components.

    :param data_file: Data file containing features matrix. Must have one
        column with labels named "label" and features in columns "aaaaaa"
        to "tttttt" in header. Include directory and extension.
    :param cords_file: File to store transformed coordinates to, including
        directory and extension. Written tab-separated without the index.
    :param results_dir: Directory to store scatter plots to. The variances
        file is joined onto this directory, so a trailing path separator
        is optional for that file.
    :param n_components: Number of components to reduce to (default = 50).
    :param n_components_plot: Number of principal components to generate
        combinations to plot (default = 5).
    :return: None.
    """
    # Imported locally so this block does not depend on a file-level import.
    import os

    # Load in data file
    print("Loading data file...")
    data_frame = pd.read_table(data_file)

    # Get features used in PCA (label-based slice, inclusive on both ends)
    features_df = data_frame.loc[:, "aaaaaa":"tttttt"]

    # Get labels; the sorted unique labels are what gets passed to plotting
    labels_df = data_frame.loc[:, "label"]
    labels_list = sorted(set(labels_df))

    # Create the PCA
    print("Creating PCA...")
    ipca = IncrementalPCA(n_components=n_components)
    features_transformed = ipca.fit_transform(features_df)

    # Label the transformed coordinates
    transformed_df = pca.label_coordinates(
        transformed_coordinates=features_transformed, labels=labels_df)

    # Save the transformed coordinates
    print("Saving transformed coordinates...")
    transformed_df.to_csv(cords_file, sep='\t', index=False)

    # Save the explained variances. os.path.join is used instead of string
    # concatenation so that a results_dir without a trailing separator does
    # not produce a path like "resultsvariances.txt".
    print("Saving explained variances...")
    variances_file = os.path.join(results_dir, "variances.txt")
    pca.save_variances(pca=ipca, file_name=variances_file)

    # Create combinations of principal components to plot (1-based numbers)
    components = range(1, n_components_plot + 1)
    combinations_list = generate_combinations(components)

    # Plot all the combinations of components. Choose colors list with colors
    # in matplotlib.
    plot_combinations(
        transformed_df,
        results_dir=results_dir,
        labels_list=labels_list,
        combinations_list=combinations_list,
        colors_list=[
            "saddlebrown", "gold", "darkgreen", "cyan", "darkblue", "magenta"
        ],
        ipca=ipca)
def test_generate_combinations_4():
    """Choosing 3 items from "abcd" yields the four triples in input order."""
    expected_triples = [
        ('a', 'b', 'c'),
        ('a', 'b', 'd'),
        ('a', 'c', 'd'),
        ('b', 'c', 'd'),
    ]
    actual_triples = sp.generate_combinations("abcd", n=3)
    assert actual_triples == expected_triples
def test_generate_combinations_3():
    """Asking for 3 items from a 2-item input yields an empty list."""
    too_short_input = "ab"
    actual = sp.generate_combinations(too_short_input, n=3)
    assert actual == []
def test_generate_combinations_2():
    """Without an explicit ``n``, a 3-item list yields its three pairs."""
    expected_pairs = [(1, 2), (1, 3), (2, 3)]
    actual_pairs = sp.generate_combinations([1, 2, 3])
    assert actual_pairs == expected_pairs
def test_generate_combinations_1():
    """An empty input yields an empty list of combinations."""
    no_items = []
    actual = sp.generate_combinations(no_items)
    assert actual == []