def test_invalid_overlap_method(self):
    # An unrecognised overlap method name ('peanut') must be rejected with
    # a ValueError whose message mentions 'overlap method'.
    left = Table(
        np.array([[0, 1, 3], [1, 1, 2]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    right = Table(
        np.array([[0, 2, 6], [2, 2, 4]]),
        ['O1', 'O3'],
        ['S1', 'S5', 'S6'])
    tables = [left, right]

    with self.assertRaisesRegex(ValueError, 'overlap method'):
        merge(tables, 'peanut')
def test_invalid_overlapping_feature_ids(self):
    # Feature 'O1' appears in both tables while the sample ids are
    # disjoint; 'error_on_overlapping_feature' is expected to raise.
    left = Table(
        np.array([[0, 1, 3], [1, 1, 2]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    right = Table(
        np.array([[0, 2, 6], [2, 2, 4]]),
        ['O1', 'O3'],
        ['S4', 'S5', 'S6'])
    tables = [left, right]

    with self.assertRaisesRegex(ValueError, 'features are present'):
        merge(tables, 'error_on_overlapping_feature')
def test_invalid_overlapping_sample_ids(self):
    # Sample 'S1' appears in both tables; calling merge without an
    # explicit overlap method is expected to raise and name that sample.
    left = Table(
        np.array([[0, 1, 3], [1, 1, 2]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    right = Table(
        np.array([[0, 2, 6], [2, 2, 4]]),
        ['O1', 'O3'],
        ['S1', 'S5', 'S6'])
    tables = [left, right]

    with self.assertRaisesRegex(ValueError, 'samples.*S1'):
        merge(tables)
def test_average(self):
    # Averaging three references to the same all-ones table should give
    # back an all-ones table.
    feature_ids = ['O1', 'O2']
    sample_ids = ['S1', 'S2', 'S3']
    ones = Table(np.array([[1, 1, 1], [1, 1, 1]]), feature_ids, sample_ids)

    observed = merge([ones, ones, ones], 'average')

    expected = Table(
        np.array([[1, 1, 1], [1, 1, 1]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    self.assertEqual(observed, expected)
def test_sum_triple_overlap(self):
    # Summing three references to the same all-ones table should give a
    # table filled with threes.
    feature_ids = ['O1', 'O2']
    sample_ids = ['S1', 'S2', 'S3']
    ones = Table(np.array([[1, 1, 1], [1, 1, 1]]), feature_ids, sample_ids)

    observed = merge([ones, ones, ones], 'sum')

    expected = Table(
        np.array([[3, 3, 3], [3, 3, 3]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    self.assertEqual(observed, expected)
def test_single_table(self):
    # Merging a one-element list should give a table equal to the input.
    counts = np.array([[0, 1, 3], [1, 1, 2]])
    only_table = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])

    merged = merge([only_table])

    self.assertEqual(only_table, merged)
def make_hierarchy(csi_results: CSIDirFmt,
                   feature_tables: biom.Table,
                   qc_properties: bool = True) -> (TreeNode, biom.Table,
                                                   pd.DataFrame):
    '''
    Build a hierarchy of mass-spec features from predicted fingerprints.

    Each feature table is paired with the CSI:FingerID result at the same
    position. For every pair the fingerprints are collated and matched
    against the feature table; the per-pair outputs are then merged and a
    tree is built from the merged fingerprints. The feature table is
    filtered to retain only the features with fingerprints, and each
    feature is relabeled with a hash (MD5) of its binary fingerprint
    vector.

    Parameters
    ----------
    csi_results : CSIDirFmt
        one or more CSI:FingerID output folders
    feature_tables : biom.Table
        one or more feature tables with mass-spec feature intensity per
        sample; must be as many as ``csi_results``
    qc_properties : bool, default True
        flag to filter molecular properties to keep only PUBCHEM
        fingerprints (forwarded to ``collate_fingerprint``)

    Raises
    ------
    ValueError
        If the number of feature tables differs from the number of CSI
        results
        If any feature table is empty
        If collated fingerprint table is empty (presumably raised by a
        helper; not visible in this function)
    UserWarning
        If features in collated fingerprint table are not a subset of
        features in the feature table (presumably emitted by a helper;
        not visible in this function)

    Returns
    -------
    skbio.TreeNode
        a tree of relatedness of molecules
    biom.Table
        merged feature table that is filtered to contain only the features
        present in the tree
    pd.DataFrame
        merged feature data
    '''
    if len(feature_tables) != len(csi_results):
        raise ValueError("The feature tables and CSI results should have a "
                         "one-to-one correspondance.")

    relabeled_fingerprints = []
    matched_tables = []
    per_table_fdata = []
    for table, csi_dir in zip(feature_tables, csi_results):
        if table.is_empty():
            raise ValueError("Cannot have empty feature table")
        collated = collate_fingerprint(csi_dir, qc_properties)
        relabeled, matched, feature_data = match_label(collated, table)
        relabeled_fingerprints.append(relabeled)
        matched_tables.append(matched)
        per_table_fdata.append(feature_data)

    merged_fdata = merge_feature_data(per_table_fdata)

    # The same fingerprint label can come from several inputs; only the
    # first occurrence of each index label is kept.
    all_fps = pd.concat(relabeled_fingerprints)
    is_repeat = all_fps.index.duplicated(keep='first')
    unique_fps = all_fps[~is_repeat]

    merged_fts = merge(matched_tables,
                       overlap_method='error_on_overlapping_sample')
    tree = build_tree(unique_fps)
    return tree, merged_fts, merged_fdata
def test_average_relative_frequency(self):
    # Two fully overlapping tables holding 0.75 and 0.25 everywhere
    # should average to 0.5 everywhere.
    high = Table(
        np.array([[0.75, 0.75, 0.75], [0.75, 0.75, 0.75]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    low = Table(
        np.array([[0.25, 0.25, 0.25], [0.25, 0.25, 0.25]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])

    observed = merge([high, low], 'average')

    expected = Table(
        np.array([[0.5, 0.5, 0.5], [0.5, 0.5, 0.5]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    self.assertEqual(observed, expected)
def test_sum_full_overlap(self):
    # Both tables share every feature id and every sample id, so the
    # expected table is the element-wise sum.
    left = Table(
        np.array([[0, 1, 3], [1, 1, 2]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    right = Table(
        np.array([[0, 2, 6], [2, 2, 4]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])

    observed = merge([left, right], 'sum')

    expected = Table(
        np.array([[0, 3, 9], [3, 3, 6]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    self.assertEqual(observed, expected)
def test_sum_some_overlap(self):
    # Only feature 'O1' and sample 'S2' are shared between the tables;
    # that cell is summed (1 + 2 = 3) and every combination absent from
    # both inputs is expected to be zero.
    left = Table(
        np.array([[0, 1, 3], [1, 1, 2]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    right = Table(
        np.array([[0, 2, 6], [2, 2, 4]]),
        ['O1', 'O3'],
        ['S4', 'S2', 'S5'])

    observed = merge([left, right], 'sum')

    expected_counts = np.array([[0, 3, 3, 0, 6],
                                [1, 1, 2, 0, 0],
                                [0, 2, 0, 2, 4]])
    expected = Table(expected_counts,
                     ['O1', 'O2', 'O3'],
                     ['S1', 'S2', 'S3', 'S4', 'S5'])
    self.assertEqual(observed, expected)
def test_valid_overlapping_sample_ids(self):
    # Sample 'S1' is shared but the feature ids are disjoint, so
    # 'error_on_overlapping_feature' should merge without raising.
    left = Table(
        np.array([[0, 1, 3], [1, 1, 2]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    right = Table(
        np.array([[0, 2, 6], [2, 2, 4]]),
        ['O3', 'O4'],
        ['S1', 'S5', 'S6'])

    observed = merge([left, right], 'error_on_overlapping_feature')

    expected_counts = np.array([[0, 1, 3, 0, 0],
                                [1, 1, 2, 0, 0],
                                [0, 0, 0, 2, 6],
                                [2, 0, 0, 2, 4]])
    expected = Table(expected_counts,
                     ['O1', 'O2', 'O3', 'O4'],
                     ['S1', 'S2', 'S3', 'S5', 'S6'])
    self.assertEqual(observed, expected)
def test_sum_overlapping_feature_ids(self):
    # No sample id is shared here, so 'sum' should produce the same
    # result as `error_on_overlapping_sample`.
    left = Table(
        np.array([[0, 1, 3], [1, 1, 2]]),
        ['O1', 'O2'],
        ['S1', 'S2', 'S3'])
    right = Table(
        np.array([[0, 2, 6], [2, 2, 4]]),
        ['O1', 'O3'],
        ['S4', 'S5', 'S6'])

    observed = merge([left, right], 'sum')

    expected_counts = np.array([[0, 1, 3, 0, 2, 6],
                                [1, 1, 2, 0, 0, 0],
                                [0, 0, 0, 2, 2, 4]])
    expected = Table(expected_counts,
                     ['O1', 'O2', 'O3'],
                     ['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])
    self.assertEqual(observed, expected)
def test_valid_overlapping_sample_ids(self):
    # The tables share sample 'S1' but no feature ids; merging with
    # 'error_on_overlapping_feature' is therefore expected to succeed.
    first_counts = np.array([[0, 1, 3], [1, 1, 2]])
    second_counts = np.array([[0, 2, 6], [2, 2, 4]])
    first = Table(first_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    second = Table(second_counts, ['O3', 'O4'], ['S1', 'S5', 'S6'])

    merged = merge([first, second], 'error_on_overlapping_feature')

    wanted = Table(
        np.array([[0, 1, 3, 0, 0],
                  [1, 1, 2, 0, 0],
                  [0, 0, 0, 2, 6],
                  [2, 0, 0, 2, 4]]),
        ['O1', 'O2', 'O3', 'O4'],
        ['S1', 'S2', 'S3', 'S5', 'S6'])
    self.assertEqual(merged, wanted)
def test_sum_overlapping_feature_ids(self):
    # Feature 'O1' is shared but all sample ids differ; the result should
    # match what `error_on_overlapping_sample` would produce.
    first_counts = np.array([[0, 1, 3], [1, 1, 2]])
    second_counts = np.array([[0, 2, 6], [2, 2, 4]])
    first = Table(first_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    second = Table(second_counts, ['O1', 'O3'], ['S4', 'S5', 'S6'])

    merged = merge([first, second], 'sum')

    wanted = Table(
        np.array([[0, 1, 3, 0, 2, 6],
                  [1, 1, 2, 0, 0, 0],
                  [0, 0, 0, 2, 2, 4]]),
        ['O1', 'O2', 'O3'],
        ['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])
    self.assertEqual(merged, wanted)
def test_sum_some_overlap(self):
    # Partial overlap: the tables share feature 'O1' and sample 'S2' only.
    # The shared cell is summed; cells missing from both inputs are zero.
    first_counts = np.array([[0, 1, 3], [1, 1, 2]])
    second_counts = np.array([[0, 2, 6], [2, 2, 4]])
    first = Table(first_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
    second = Table(second_counts, ['O1', 'O3'], ['S4', 'S2', 'S5'])

    merged = merge([first, second], 'sum')

    wanted = Table(
        np.array([[0, 3, 3, 0, 6],
                  [1, 1, 2, 0, 0],
                  [0, 2, 0, 2, 4]]),
        ['O1', 'O2', 'O3'],
        ['S1', 'S2', 'S3', 'S4', 'S5'])
    self.assertEqual(merged, wanted)
def make_hierarchy(
        csi_results: CSIDirFmt,
        feature_tables: biom.Table,
        ms2_matches: pd.DataFrame = None,
        qc_properties: bool = False,
        metric: str = 'euclidean') -> (TreeNode, biom.Table, pd.DataFrame):
    '''
    Build a hierarchy of mass-spec features from predicted fingerprints.

    Each feature table is paired with the CSI:FingerID result (and, when
    given, the MS2 match table) at the same position. For every pair the
    CSI results are processed and matched against the feature table; the
    per-pair outputs are then merged and a tree is built from the merged
    fingerprints. The feature table is filtered to retain only the
    features with fingerprints, and each feature is relabeled with a hash
    (MD5) of its binary fingerprint vector.

    Parameters
    ----------
    csi_results : CSIDirFmt
        one or more CSI:FingerID output folders
    feature_tables : biom.Table
        one or more feature tables with mass-spec feature intensity per
        sample; must be as many as ``csi_results``
    ms2_matches : pd.DataFrame, optional
        one or more tables with MS/MS library match for mass-spec
        features; when given (and non-empty) there must be as many as
        ``feature_tables`` and each must have a ``Smiles`` column
    qc_properties : bool, default False
        flag to filter molecular properties to keep only PUBCHEM
        fingerprints
    metric : str, default `euclidean`
        metric for hierarchical clustering of fingerprints

    Raises
    ------
    ValueError
        If the number of feature tables differs from the number of CSI
        results
        If MS2 match tables are given and their number differs from the
        number of feature tables
        If any feature table is empty
        If an MS2 match table lacks the column `Smiles`
        If collated fingerprint table is empty (presumably raised by a
        helper; not visible in this function)

    Returns
    -------
    skbio.TreeNode
        a tree of relatedness of molecules
    biom.Table
        merged feature table that is filtered to contain only the features
        present in the tree; indexed by the MD5 hash of fingerprint
        vectors of mass-spec features
    pd.DataFrame
        merged feature data; indexed by the MD5 hash of the fingerprint
        vectors of mass-spec features
    '''
    if len(feature_tables) != len(csi_results):
        raise ValueError("The feature tables and CSI results should have a "
                         "one-to-one correspondance.")
    if ms2_matches and len(ms2_matches) != len(feature_tables):
        raise ValueError("The MS2 match tables should have a one-to-one "
                         "correspondance with feature tables and CSI results.")

    relabeled_fingerprints = []
    matched_tables = []
    per_table_fdata = []
    paired_inputs = zip(feature_tables, csi_results)
    for position, (table, csi_dir) in enumerate(paired_inputs):
        if table.is_empty():
            raise ValueError("Cannot have empty feature table")

        if not ms2_matches:
            # No library matches supplied: process the CSI result alone.
            collated_fps, smiles = process_csi_results(
                csi_dir, None, qc_properties, metric)
        else:
            match_table = ms2_matches[position]
            if 'Smiles' not in match_table.columns:
                raise ValueError("MS2 match tables must contain the "
                                 "column `Smiles`")
            collated_fps, smiles = process_csi_results(
                csi_dir, match_table, qc_properties, metric=metric)

        relabeled, matched, feature_data = get_matched_tables(
            collated_fps, smiles, table)
        relabeled_fingerprints.append(relabeled)
        matched_tables.append(matched)
        per_table_fdata.append(feature_data)

    merged_fdata = merge_feature_data(per_table_fdata)

    # The same fingerprint label can come from several inputs; only the
    # first occurrence of each index label is kept.
    all_fps = pd.concat(relabeled_fingerprints)
    is_repeat = all_fps.index.duplicated(keep='first')
    unique_fps = all_fps[~is_repeat]

    merged_fts = merge(matched_tables,
                       overlap_method='error_on_overlapping_sample')
    tree = build_tree(unique_fps, metric)
    return tree, merged_fts, merged_fdata