def test_correlations_selection(self):
    """Golden-file test: CorrelationsSelection.analyse on the simple test case."""
    request = AnalysisTestUtils.create_default_user_request()
    request.set_custom_attr("expvar", "MetadataSignificant")

    test_root = AnalysisTestUtils.SIMPLE_TEST_CASE_ROOT
    otu_table = AnalysisTestUtils.get_test_input_as_table(test_root)
    headers, sample_labels = AnalysisTestUtils.get_test_input_as_metadata(test_root)
    metadata_table = AnalysisTestUtils.get_test_input_as_table(
        test_root, SAMPLE_METADATA_FILENAME)
    taxonomic_map = AnalysisTestUtils.get_test_taxonomy(test_root)

    metadata = Metadata("test", "test", False)
    metadata.set_table(metadata_table)

    actual_output = CorrelationsSelection().analyse(
        request, otu_table, headers, sample_labels, metadata, taxonomic_map)
    print(json.dumps(actual_output))

    expected_output = AnalysisTestUtils.get_expected_output(
        AnalysisTestUtils.SIMPLE_TEST_CASE_OUTPUT_ROOT,
        "correlations_selection.json")
    comparison_output = AnalysisTestUtils.compare_two_objects(
        expected_output, actual_output)
    if not comparison_output:
        # Dump both sides so a diff is visible in the test log on failure.
        print("Expected: ")
        print(expected_output)
        print("Actual: ")
        print(actual_output)
    self.assertTrue(comparison_output)
def get_metadata_obj(test_dir, csv_name=SAMPLE_METADATA_FILENAME, sep="\t"):
    """Load a delimited metadata file into a Metadata object.

    :param test_dir: directory containing the metadata file
    :param csv_name: file name within ``test_dir`` (defaults to the sample
        metadata file)
    :param sep: field delimiter (tab by default)
    :return: a ``Metadata`` instance (samples not loaded) whose table is the
        parsed file contents as a list of rows
    """
    csv_name = os.path.join(test_dir, csv_name)
    print("Opening file with name " + csv_name)
    # newline="" per the csv module docs so quoted fields containing
    # newlines are parsed correctly; the reader was previously drained with
    # a manual loop and an unused row counter.
    with open(csv_name, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=sep, quotechar='|')
        output = list(reader)
    metadata = Metadata("", "", load_samples=False)
    metadata.set_table(output)
    return metadata
def test_simple_differential_selection_with_ancom(self):
    """Golden-file test: DifferentialSelection.analyse_with_ancom, Control vs Disease."""
    request = AnalysisTestUtils.create_default_user_request()
    request.catvar = "Category"
    request.set_custom_attr("pvalthreshold", "0.01")
    request.set_custom_attr("pwVar1", "Control")
    request.set_custom_attr("pwVar2", "Disease")

    test_root = AnalysisTestUtils.SIMPLE_TEST_CASE_ROOT
    otu_table = AnalysisTestUtils.get_test_input_as_table(test_root)
    headers, sample_labels = AnalysisTestUtils.get_test_input_as_metadata(test_root)
    metadata_table = AnalysisTestUtils.get_test_input_as_table(
        test_root, SAMPLE_METADATA_FILENAME)
    metadata_col = AnalysisTestUtils.get_disease_metadata_values(test_root)
    taxonomic_map = AnalysisTestUtils.get_test_taxonomy(test_root)
    sample_ids_from_metadata = AnalysisTestUtils.get_sample_ids_from_metadata(test_root)

    # Pair each sample ID with its metadata value by position.
    sample_id_to_metadata = {}
    for idx, sample_id in enumerate(sample_ids_from_metadata):
        sample_id_to_metadata[sample_id] = metadata_col[idx]

    metadata = Metadata("test", "test", False)
    metadata.set_table(metadata_table)

    abundances = DifferentialSelection().analyse_with_ancom(
        request, otu_table, headers, sample_labels,
        sample_id_to_metadata, taxonomic_map)
    print(json.dumps(abundances))

    expected_output = AnalysisTestUtils.get_expected_output(
        AnalysisTestUtils.SIMPLE_TEST_CASE_OUTPUT_ROOT,
        "differential_selection_with_ancom_control_disease.json")
    comparison_output = AnalysisTestUtils.compare_two_objects(
        expected_output, abundances)
    if not comparison_output:
        # Dump both sides so a diff is visible in the test log on failure.
        print("Expected: ")
        print(expected_output)
        print("Actual: ")
        print(abundances)
    self.assertTrue(comparison_output)
def load_otu_table(self, user_id, pid, use_raw, use_np):
    """Load the raw gene table, its row/column labels, the sample metadata,
    and the taxonomy for the given user/project.

    :param user_id: owner of the project
    :param pid: project ID
    :param use_raw: NOTE(review): accepted but never consulted here — the
        raw table is loaded unconditionally; confirm whether this parameter
        is still needed
    :param use_np: when truthy, load the table as a numpy array; otherwise
        as a plain list-of-lists table
    """
    self.user_id = user_id
    self.pid = pid
    logger.info("Before load")
    self.sample_metadata = Metadata(user_id, pid)
    logger.info("Finished metadata loading")
    self.otu_metadata = Taxonomy(user_id, pid)
    logger.info("Finished taxonomy loading")
    logger.info("Using raw data")
    if use_np:
        self.table = DataIO.tsv_to_np_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)
    else:
        self.table = DataIO.tsv_to_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)
    # The labels file carries two rows: column (OTU/gene) headers, then
    # sample (row) labels.
    labels = DataIO.tsv_to_table(self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
    self.headers = labels[0]
    self.sample_labels = labels[1]
def test_simple_correlations(self):
    """Golden-file test: Correlations.analyse with significant vs non-significant metadata."""
    request = AnalysisTestUtils.create_default_user_request()
    request.set_custom_attr("corrvar1", "MetadataSignificant")
    request.set_custom_attr("corrvar2", "MetadataNonSignificant")
    request.set_custom_attr("corrvar1SpecificTaxonomies", "[]")
    request.set_custom_attr("corrvar2SpecificTaxonomies", "[]")
    request.set_custom_attr("colorvar", "MetadataSignificant")
    request.set_custom_attr("sizevar", "MetadataNonSignificant")
    request.set_custom_attr("samplestoshow", "both")

    test_root = AnalysisTestUtils.SIMPLE_TEST_CASE_ROOT
    otu_table = AnalysisTestUtils.get_test_input_as_table(test_root)
    headers, sample_labels = AnalysisTestUtils.get_test_input_as_metadata(test_root)
    metadata_table = AnalysisTestUtils.get_test_input_as_table(
        test_root, SAMPLE_METADATA_FILENAME)

    metadata = Metadata("test", "test", False)
    metadata.set_table(metadata_table)

    actual_output = Correlations().analyse(
        request, otu_table, headers, sample_labels, metadata)

    expected_output = AnalysisTestUtils.get_expected_output(
        AnalysisTestUtils.SIMPLE_TEST_CASE_OUTPUT_ROOT,
        "correlation_sign_nonsign_sign_nonsign.json")
    comparison_output = AnalysisTestUtils.compare_two_objects(
        expected_output, actual_output)
    print(json.dumps(actual_output))
    if not comparison_output:
        # Dump both sides so a diff is visible in the test log on failure.
        print("Expected: ")
        print(expected_output)
        print("Actual: ")
        print(actual_output)
    self.assertTrue(comparison_output)
class OTUTable(object):
    """In-memory representation of a gene/OTU count table together with its
    sample metadata and taxonomy, plus helpers to filter and aggregate it.

    Layout: ``table`` is one row per sample and one column per OTU/gene;
    ``headers`` holds the column (OTU/gene) IDs and ``sample_labels`` the
    row (sample) IDs, loaded from a separate labels file.
    """

    def __init__(self, user_id, pid, use_raw=False, use_np=True):
        # Placeholder values; load_otu_table() below populates everything.
        self.user_id = ""
        self.pid = ""
        self.sample_metadata = ""
        self.otu_metadata = ""
        self.phylogenetic_tree = ""
        self.table = []
        self.headers = []
        self.sample_labels = []
        self.load_otu_table(user_id, pid, use_raw, use_np)
        # DataIO.tsv_to_table is memoized (has cache_info); log hit/miss stats.
        logger.info(DataIO.tsv_to_table.cache_info())

    def load_otu_table(self, user_id, pid, use_raw, use_np):
        """Load the raw gene table, its labels, sample metadata and taxonomy.

        NOTE(review): ``use_raw`` is accepted but never consulted — the raw
        table is loaded unconditionally.
        """
        self.user_id = user_id
        self.pid = pid
        logger.info("Before load")
        self.sample_metadata = Metadata(user_id, pid)
        logger.info("Finished metadata loading")
        self.otu_metadata = Taxonomy(user_id, pid)
        logger.info("Finished taxonomy loading")
        logger.info("Using raw data")
        if use_np:
            self.table = DataIO.tsv_to_np_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)
        else:
            self.table = DataIO.tsv_to_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)
        # Labels file: row 0 = column (OTU/gene) headers, row 1 = sample labels.
        labels = DataIO.tsv_to_table(self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
        self.headers = labels[0]
        self.sample_labels = labels[1]

    def load_phylogenetic_tree_if_exists(self):
        # No tree support in this variant: always resets to the empty string.
        self.phylogenetic_tree = ""

    def get_table(self):
        # Raw (unfiltered) table.
        return self.table

    def get_headers(self):
        # Column (OTU/gene) IDs, parallel to the table's columns.
        return self.headers

    def get_sample_labels(self):
        # Row (sample) IDs, parallel to the table's rows.
        return self.sample_labels

    def get_table_after_filtering(self, user_request):
        """Return (table, headers, sample_labels) after applying the
        user-requested sample-metadata filter and then the taxonomic filter."""
        t, h, s = self.filter_otu_table_by_metadata(self.table, self.headers,
                                                    self.sample_labels, user_request)
        t, h, s = self.filter_otu_table_by_taxonomic_items(t, h, s,
                                                           self.otu_metadata.get_taxonomy_map(),
                                                           user_request)
        return t, h, s

    def get_table_after_filtering_and_aggregation(self, user_request):
        """Same as get_table_after_filtering; the aggregation step is
        intentionally disabled for gene tables (see TODO below)."""
        logger.info("Starting filtering and aggregation")
        t, h, s = self.filter_otu_table_by_metadata(self.table, self.headers,
                                                    self.sample_labels, user_request)
        logger.info("Finished filtering by metadata")
        t, h, s = self.filter_otu_table_by_taxonomic_items(t, h, s,
                                                           self.otu_metadata.get_taxonomy_map(),
                                                           user_request)
        logger.info("Finished filtering by taxonomic items")
        # TODO: No aggregation needed in the gene world
        # t, h, s = self.aggregate_otu_table_at_taxonomic_level(t, h, s, user_request)
        logger.info("Finished aggregation")
        return t, h, s

    def get_table_after_filtering_and_aggregation_and_low_count_aggregation(self, user_request):
        """Same pipeline as above; both the taxonomic aggregation and the
        low-count aggregation steps are currently commented out, so this is
        effectively identical to get_table_after_filtering_and_aggregation."""
        logger.info("Starting filtering and aggregation")
        t, h, s = self.filter_otu_table_by_metadata(self.table, self.headers,
                                                    self.sample_labels, user_request)
        logger.info("Finished filtering by metadata")
        t, h, s = self.filter_otu_table_by_taxonomic_items(t, h, s,
                                                           self.otu_metadata.get_taxonomy_map(),
                                                           user_request)
        logger.info("Finished filtering by taxonomic items")
        # TODO: No aggregation needed in the gene world
        # t, h, s = self.aggregate_otu_table_at_taxonomic_level(t, h, s, user_request)
        logger.info("Finished filtering by low counts")
        # t, h, s = self.aggregate_low_count_np(t, h, s, user_request)
        logger.info("Finished aggregation")
        return t, h, s

    def get_otu_metadata(self):
        # Taxonomy object loaded in load_otu_table().
        return self.otu_metadata

    def get_sample_metadata(self):
        # Metadata object loaded in load_otu_table().
        return self.sample_metadata

    def get_phylogenetic_tree(self):
        # Always "" in this variant (see load_phylogenetic_tree_if_exists).
        return self.phylogenetic_tree

    def filter_otu_table_by_metadata(self, base, headers, sample_labels, user_request):
        """
        Filters an OTU table by a particular metadata category by identifying
        the samples that fall under the metadata category.

        :param base: table (row per sample)
        :param headers: column (OTU) IDs — returned unchanged
        :param sample_labels: row (sample) IDs, parallel to ``base``
        :param user_request: provides sample_filter (catvar),
            sample_filter_role ("Include"/exclude) and sample_filter_vals
        :return: (filtered table, headers, filtered sample labels)
        """
        catvar = user_request.sample_filter
        role = user_request.sample_filter_role
        values = user_request.sample_filter_vals
        if catvar == "none" or catvar == "" or (len(values) == 1 and values[0] == "mian-select-all"):
            # Filtering is not enabled or everything is selected
            logger.info("Sample filtering not enabled or all samples are selected")
            return base, headers, sample_labels

        metadata_map = self.sample_metadata.get_sample_id_to_metadata_map(catvar)

        # Pass 1: decide which sample IDs survive the Include/Exclude filter.
        samples = {}
        row = 0
        while row < len(base):
            sample_id = sample_labels[row]
            if sample_id in metadata_map:
                if role == "Include":
                    if metadata_map[sample_id] in values:
                        samples[sample_id] = 1
                else:
                    if metadata_map[sample_id] not in values:
                        samples[sample_id] = 1
            row += 1
        # NOTE(review): dead code — ``samples`` is always a dict here, never
        # None or "".
        if samples is None or samples == "":
            samples = []

        # Pass 2: keep only the rows whose sample ID survived.
        new_otu_table = []
        new_sample_labels = []
        num_filtered_samples = 0
        i = 0
        while i < len(base):
            sample_id = sample_labels[i]
            if sample_id in samples:
                new_otu_table.append(base[i])
                new_sample_labels.append(sample_id)
            else:
                num_filtered_samples += 1
            i += 1
        logger.info("Filtered out " + str(num_filtered_samples) + "/" + str(len(base)) + " samples")

        return new_otu_table, headers, new_sample_labels

    def filter_otu_table_by_taxonomic_items(self, base, headers, sample_labels, taxonomic_map, user_request):
        """
        Returns an OTU table that has been filtered by specific taxonomic
        items of interest (eg. if the user selected that they only wanted to
        see Staphylococcus genus, an OTU table will be returned that only
        contains Staphylococcus OTUs).

        :param base: table (row per sample, column per OTU)
        :param headers: column (OTU) IDs, parallel to the table's columns
        :param sample_labels: row (sample) IDs — returned unchanged
        :param taxonomic_map: OTU ID -> taxonomy classification list
        :param user_request: provides taxonomy_filter (level),
            taxonomy_filter_role and taxonomy_filter_vals
        :return: (filtered table, filtered headers, sample labels)
        """
        level = user_request.taxonomy_filter
        role = user_request.taxonomy_filter_role
        items_of_interest = user_request.taxonomy_filter_vals
        if int(level) == -2 or (len(items_of_interest) == 1 and items_of_interest[0] == "mian-select-all"):
            # -2 indicates that we should not filter by taxonomic items or everything is selected
            logger.info("OTU filtering not enabled or all OTUs are selected")
            return base, headers, sample_labels

        # Build the set of OTUs that survive the filter. Level >= 0 matches on
        # the classification at that level; level == -1 matches on the OTU ID
        # itself; any other level keeps everything.
        otus = {}
        for otu, classification in taxonomic_map.items():
            if 0 <= int(level) < len(classification):
                if role == "Include":
                    if classification[int(level)] in items_of_interest:
                        otus[otu] = 1
                else:
                    if classification[int(level)] not in items_of_interest:
                        otus[otu] = 1
            elif int(level) == -1:
                if role == "Include":
                    if otu in items_of_interest:
                        otus[otu] = 1
                else:
                    if otu not in items_of_interest:
                        otus[otu] = 1
            else:
                otus[otu] = 1

        # Copy only the surviving columns; headers/filter counts are collected
        # on the first row only since columns are the same for every row.
        new_otu_table = []
        new_headers = []
        num_filtered_otus = 0
        i = 0
        while i < len(base):
            new_row = []
            j = 0
            while j < len(base[i]):
                if headers[j] in otus:
                    new_row.append(base[i][j])
                    if i == 0:
                        new_headers.append(headers[j])
                else:
                    if i == 0:
                        num_filtered_otus += 1
                j += 1
            new_otu_table.append(new_row)
            i += 1
        logger.info("Table cols = " + str(len(new_otu_table[0])) + " header cols = " + str(len(new_headers)))
        logger.info("Filtered out " + str(num_filtered_otus) + "/" + str(len(base[0])) + " OTUs/taxas")

        return new_otu_table, new_headers, sample_labels

    def aggregate_otu_table_at_taxonomic_level(self, base, headers, sample_labels, user_request):
        """
        Returns an OTU table that has been transformed according to the
        functional annotations: columns sharing the same taxonomy prefix up
        to ``user_request.level`` are summed into one column.

        :param base: table (row per sample, column per OTU)
        :param headers: column (OTU) IDs
        :param sample_labels: row (sample) IDs — returned unchanged
        :param user_request: provides ``level``
        :return: (aggregated table, aggregated headers, sample labels)
        """
        level = user_request.level
        if int(level) < 0:
            # We want to aggregate at the OTU level, which is essentially not aggregating at all
            return base, headers, sample_labels

        taxonomy_map = self.otu_metadata.get_taxonomy_map()

        # Map each distinct taxonomy prefix to the list of column indices
        # that belong to it; ``taxonomies`` preserves first-seen order.
        taxonomies = []
        taxonomy_to_cols = {}
        j = 0
        while j < len(headers):
            otu = headers[j]
            if otu not in taxonomy_map:
                # TODO: This actually indicates bad input data
                j += 1
                continue
            taxonomy = "; ".join(taxonomy_map[otu][:int(level) + 1])
            if taxonomy in taxonomy_to_cols:
                taxonomy_to_cols[taxonomy].append(j)
            else:
                taxonomy_to_cols[taxonomy] = [j]
                taxonomies.append(taxonomy)
            j += 1

        # Sum each row's grouped columns in parallel, one worker task per row.
        with Pool() as pool:
            func = partial(process_row_aggregate_otu_table_at_taxonomic_level, taxonomies, taxonomy_to_cols)
            aggregated_base = pool.map(func, base)

        aggregated_headers = taxonomies
        logger.info("Agg Table cols = " + str(len(aggregated_base[0])) + " header cols = " + str(len(aggregated_headers)))

        return aggregated_base, aggregated_headers, sample_labels

    def aggregate_otu_table_at_taxonomic_level_np(self, base, headers, sample_labels, user_request):
        """
        Returns an OTU table that has been aggregated at a specific taxonomic
        level (eg. this could return a table that is grouped at the Family
        taxonomic level).

        Approx 5x slower than non-np version

        :param base: numpy table (row per sample, column per OTU)
        :param headers: column (OTU) IDs
        :param sample_labels: row (sample) IDs — returned unchanged
        :param user_request: provides ``level``
        :return: (aggregated numpy table, aggregated headers, sample labels)
        """
        level = user_request.level
        if int(level) < 0:
            # We want to aggregate at the OTU level, which is essentially not aggregating at all
            return base, headers, sample_labels

        taxonomy_map = self.otu_metadata.get_taxonomy_map()

        # Same grouping as the non-np version: taxonomy prefix -> column indices.
        taxonomies = []
        taxonomy_to_cols = {}
        i = 0
        while i < len(headers):
            otu = headers[i]
            if otu not in taxonomy_map:
                # TODO: This actually indicates bad input data
                i += 1
                continue
            taxonomy = "; ".join(taxonomy_map[otu][:int(level) + 1])
            if taxonomy in taxonomy_to_cols:
                taxonomy_to_cols[taxonomy].append(i)
            else:
                taxonomy_to_cols[taxonomy] = [i]
                taxonomies.append(taxonomy)
            i += 1

        # Sum the grouped columns into one output column per taxonomy.
        rows = len(base)
        cols = len(taxonomies)
        aggregated_base = np.zeros((rows, cols))
        i = 0
        for taxonomy in taxonomies:
            cols_to_aggregate = taxonomy_to_cols[taxonomy]
            aggregated_base[:, i] += np.sum(base[:, cols_to_aggregate], axis=1)
            i += 1

        aggregated_headers = taxonomies

        return aggregated_base, aggregated_headers, sample_labels

    def filter_out_low_count_np(self, base, headers, sample_labels, user_request):
        """Drop columns whose count exceeds ``taxonomy_filter_count`` in fewer
        than ``taxonomy_filter_prevalence`` percent of samples."""
        logger.info("Starting filtering by low count")
        count_threshold = user_request.taxonomy_filter_count
        min_prevalence = user_request.taxonomy_filter_prevalence

        base = np.array(base)
        headers = np.array(headers)

        num_samples = base.shape[0]
        min_prevalence_percentage = min_prevalence / float(100)
        # Per column: fraction of samples above the count threshold must meet
        # the prevalence cutoff for the column to be kept.
        otus_over_threshold = (base > count_threshold).astype(int)
        otus_to_keep = np.divide(np.sum(otus_over_threshold, axis=0), num_samples) >= min_prevalence_percentage
        logger.info("Done filtering by low count. Kept " + str(sum(otus_to_keep)) + " cols out of " + str(len(headers)))
        return base[:, otus_to_keep], headers[otus_to_keep], sample_labels

    def aggregate_low_count_np(self, base, headers, sample_labels, user_request):
        """Collapse low-prevalence columns (same cutoff as
        filter_out_low_count_np) into a single trailing "Other" column.

        NOTE(review): currently only referenced from a commented-out call
        site above; see the bug notes inline before enabling it.
        """
        count_threshold = user_request.taxonomy_filter_count
        min_prevalence = user_request.taxonomy_filter_prevalence

        base = np.array(base)
        headers = np.array(headers)

        num_samples = base.shape[0]
        min_prevalence_percentage = min_prevalence / float(100)
        otus_over_threshold = (base > count_threshold).astype(int)
        otus_to_keep = np.divide(np.sum(otus_over_threshold, axis=0), num_samples) >= min_prevalence_percentage
        otus_to_aggregate = np.divide(np.sum(otus_over_threshold, axis=0), num_samples) < min_prevalence_percentage
        aggregated_col = np.sum(base[:, otus_to_aggregate], axis=1)
        aggregated_base = np.c_[base[:, otus_to_keep], aggregated_col]
        # NOTE(review): BUG — ``headers`` is a numpy array here, and ndarray
        # has no ``.append``; this line raises AttributeError if ever called
        # (and even for a list, ``.append`` returns None). Likely intended:
        # np.append(headers[otus_to_keep], "Other").
        aggregated_headers = headers[otus_to_keep].append("Other")
        # NOTE(review): ``len(otus_to_aggregate)`` is the total column count
        # (it is a boolean mask), not the number of aggregated columns —
        # presumably ``sum(otus_to_aggregate)`` was intended; verify.
        logger.info("Aggregate low count cols = " + str(len(otus_to_aggregate)) + " header cols = " + str(len(aggregated_headers)))
        return aggregated_base, aggregated_headers, sample_labels

    @staticmethod
    def get_otu_table_headers_at_taxonomic_level(user_id, pid, level, use_raw=False):
        """Return the distinct taxonomy names found at ``level`` among the
        table's columns (or the raw OTU headers when level == -1), without
        loading the full table.

        NOTE(review): ``use_raw`` is accepted but never consulted here.
        """
        logger.info("Using raw data")
        labels = DataIO.tsv_to_table(user_id, pid, RAW_GENE_TABLE_LABELS_FILENAME)
        headers = labels[0]

        if int(level) == -1:
            # OTUs requested
            return headers

        taxonomy = Taxonomy(user_id, pid)
        taxonomy_map = taxonomy.get_taxonomy_map()

        # Collect each non-empty taxonomy name once, in first-seen order.
        taxonomies = []
        taxonomy_to_cols = {}
        j = 0
        while j < len(headers):
            otu = headers[j]
            if otu in taxonomy_map:
                # Uncomment below if we want to use the fully quantified taxonomy string
                # taxonomy = "; ".join(taxonomy_map[otu][:int(level) + 1])
                taxonomy = taxonomy_map[otu][int(level)]
                if taxonomy != "":
                    if taxonomy in taxonomy_to_cols:
                        taxonomy_to_cols[taxonomy].append(j)
                    else:
                        taxonomy_to_cols[taxonomy] = [j]
                        taxonomies.append(taxonomy)
            j += 1
        return taxonomies