Ejemplo n.º 1
0
 def test_invalid_overlap_method(self):
     # An unknown overlap method name ('peanut') must raise a ValueError
     # whose message mentions 'overlap method'.
     first = Table(
         np.array([[0, 1, 3], [1, 1, 2]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     second = Table(
         np.array([[0, 2, 6], [2, 2, 4]]),
         ['O1', 'O3'],
         ['S1', 'S5', 'S6'])
     tables = [first, second]
     with self.assertRaisesRegex(ValueError, 'overlap method'):
         merge(tables, 'peanut')
Ejemplo n.º 2
0
 def test_invalid_overlapping_feature_ids(self):
     # Feature 'O1' is present in both tables (the sample ids are
     # disjoint), so 'error_on_overlapping_feature' must raise a ValueError.
     first = Table(
         np.array([[0, 1, 3], [1, 1, 2]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     second = Table(
         np.array([[0, 2, 6], [2, 2, 4]]),
         ['O1', 'O3'],
         ['S4', 'S5', 'S6'])
     tables = [first, second]
     with self.assertRaisesRegex(ValueError, 'features are present'):
         merge(tables, 'error_on_overlapping_feature')
Ejemplo n.º 3
0
 def test_invalid_overlapping_sample_ids(self):
     # Sample 'S1' is present in both tables; calling merge without an
     # explicit overlap method must raise a ValueError naming that sample.
     first = Table(
         np.array([[0, 1, 3], [1, 1, 2]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     second = Table(
         np.array([[0, 2, 6], [2, 2, 4]]),
         ['O1', 'O3'],
         ['S1', 'S5', 'S6'])
     tables = [first, second]
     with self.assertRaisesRegex(ValueError, 'samples.*S1'):
         merge(tables)
Ejemplo n.º 4
0
 def test_invalid_overlapping_feature_ids(self):
     # Both tables contain feature 'O1' while sharing no sample ids, so
     # merging with 'error_on_overlapping_feature' must fail.
     left_counts = np.array([[0, 1, 3], [1, 1, 2]])
     left = Table(left_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     right_counts = np.array([[0, 2, 6], [2, 2, 4]])
     right = Table(right_counts, ['O1', 'O3'], ['S4', 'S5', 'S6'])
     with self.assertRaisesRegex(ValueError, 'features are present'):
         merge([left, right], 'error_on_overlapping_feature')
Ejemplo n.º 5
0
 def test_invalid_overlapping_sample_ids(self):
     # Both tables contain sample 'S1'; merge called with its default
     # overlap method must raise a ValueError that mentions that sample.
     left_counts = np.array([[0, 1, 3], [1, 1, 2]])
     left = Table(left_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     right_counts = np.array([[0, 2, 6], [2, 2, 4]])
     right = Table(right_counts, ['O1', 'O3'], ['S1', 'S5', 'S6'])
     with self.assertRaisesRegex(ValueError, 'samples.*S1'):
         merge([left, right])
Ejemplo n.º 6
0
 def test_invalid_overlap_method(self):
     # 'peanut' is not a recognized overlap method; merge must reject it
     # with a ValueError mentioning 'overlap method'.
     left_counts = np.array([[0, 1, 3], [1, 1, 2]])
     left = Table(left_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     right_counts = np.array([[0, 2, 6], [2, 2, 4]])
     right = Table(right_counts, ['O1', 'O3'], ['S1', 'S5', 'S6'])
     with self.assertRaisesRegex(ValueError, 'overlap method'):
         merge([left, right], 'peanut')
Ejemplo n.º 7
0
 def test_average(self):
     # Averaging three references to the same all-ones table is expected
     # to yield an all-ones table with the same ids.
     ones = Table(
         np.array([[1, 1, 1], [1, 1, 1]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     observed = merge([ones, ones, ones], 'average')
     expected = Table(
         np.array([[1, 1, 1], [1, 1, 1]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     self.assertEqual(observed, expected)
Ejemplo n.º 8
0
 def test_sum_triple_overlap(self):
     # Summing three references to the same all-ones table is expected to
     # give a table where every cell equals 3.
     ones = Table(
         np.array([[1, 1, 1], [1, 1, 1]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     observed = merge([ones, ones, ones], 'sum')
     expected = Table(
         np.array([[3, 3, 3], [3, 3, 3]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     self.assertEqual(observed, expected)
Ejemplo n.º 9
0
    def test_single_table(self):
        # Merging a list holding a single table should produce a table
        # that compares equal to the input.
        counts = np.array([[0, 1, 3], [1, 1, 2]])
        only_table = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
        merged = merge([only_table])
        self.assertEqual(only_table, merged)
Ejemplo n.º 10
0
    def test_single_table(self):
        # With only one table in the input list, the merged result is
        # expected to equal that table.
        feature_ids = ['O1', 'O2']
        sample_ids = ['S1', 'S2', 'S3']
        source = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                       feature_ids, sample_ids)
        result = merge([source])
        self.assertEqual(source, result)
Ejemplo n.º 11
0
def make_hierarchy(csi_results: CSIDirFmt,
                   feature_tables: biom.Table,
                   qc_properties: bool = True) -> (TreeNode, biom.Table,
                                                   pd.DataFrame):
    '''
    Build a hierarchy of mass-spec features from predicted chemical
    fingerprints. Each feature table is filtered to retain only the
    features that have fingerprints, and each feature is relabeled with a
    hash (MD5) of its binary fingerprint vector.

    Parameters
    ----------
    csi_results : CSIDirFmt
        one or more CSI:FingerID output folders
    feature_tables : biom.Table
        one or more feature tables with mass-spec feature intensity per
        sample; must pair one-to-one, in order, with ``csi_results``
    qc_properties : bool, default True
        flag to filter molecular properties to keep only PUBCHEM fingerprints

    Raises
    ------
    ValueError
        If ``feature_tables`` and ``csi_results`` differ in length
        If any feature table is empty
        If collated fingerprint table is empty (not raised directly here;
        presumably comes from the fingerprint helpers)
    UserWarning
        If features in collated fingerprint table are not a subset of
        features in the feature table (not issued directly here;
        presumably comes from the matching helper)

    Returns
    -------
    skbio.TreeNode
        a tree of relatedness of molecules
    biom.Table
        merged feature table that is filtered to contain only the
        features present in the tree
    pd.DataFrame
        merged feature data
    '''
    if len(feature_tables) != len(csi_results):
        raise ValueError("The feature tables and CSI results should have a "
                         "one-to-one correspondance.")

    relabeled_fingerprints = []
    matched_tables = []
    per_table_fdata = []
    for table, csi_dir in zip(feature_tables, csi_results):
        if table.is_empty():
            raise ValueError("Cannot have empty feature table")
        collated = collate_fingerprint(csi_dir, qc_properties)
        relabeled, matched, table_fdata = match_label(collated, table)
        relabeled_fingerprints.append(relabeled)
        matched_tables.append(matched)
        per_table_fdata.append(table_fdata)

    merged_fdata = merge_feature_data(per_table_fdata)

    # The same fingerprint label can occur in several inputs; keep only the
    # first row seen for each label before building the tree.
    all_fingerprints = pd.concat(relabeled_fingerprints)
    is_repeat = all_fingerprints.index.duplicated(keep='first')
    unique_fingerprints = all_fingerprints[~is_repeat]

    merged_table = merge(matched_tables,
                         overlap_method='error_on_overlapping_sample')
    tree = build_tree(unique_fingerprints)

    return tree, merged_table, merged_fdata
Ejemplo n.º 12
0
 def test_average_relative_frequency(self):
     # Two tables with identical ids holding 0.75 and 0.25 everywhere are
     # expected to average to 0.5 in every cell.
     high = Table(
         np.array([[0.75, 0.75, 0.75], [0.75, 0.75, 0.75]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     low = Table(
         np.array([[0.25, 0.25, 0.25], [0.25, 0.25, 0.25]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     observed = merge([high, low], 'average')
     expected = Table(
         np.array([[0.5, 0.5, 0.5], [0.5, 0.5, 0.5]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     self.assertEqual(observed, expected)
Ejemplo n.º 13
0
 def test_sum_triple_overlap(self):
     # The same all-ones table is passed three times; with 'sum' every
     # cell of the result is expected to be 3.
     ones_counts = np.array([[1, 1, 1], [1, 1, 1]])
     ones = Table(ones_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     observed = merge([ones, ones, ones], 'sum')
     threes_counts = np.array([[3, 3, 3], [3, 3, 3]])
     expected = Table(threes_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     self.assertEqual(observed, expected)
Ejemplo n.º 14
0
 def test_sum_full_overlap(self):
     # Both tables share every feature id and sample id, so the expected
     # result of 'sum' is the element-wise sum of the two tables.
     first = Table(
         np.array([[0, 1, 3], [1, 1, 2]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     second = Table(
         np.array([[0, 2, 6], [2, 2, 4]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     observed = merge([first, second], 'sum')
     expected = Table(
         np.array([[0, 3, 9], [3, 3, 6]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     self.assertEqual(observed, expected)
Ejemplo n.º 15
0
 def test_sum_some_overlap(self):
     # The tables share only feature 'O1' and sample 'S2'. The expected
     # table sums the shared cell (1 + 2 = 3) and holds 0 wherever a
     # feature/sample pair is absent from both inputs.
     first = Table(
         np.array([[0, 1, 3], [1, 1, 2]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     second = Table(
         np.array([[0, 2, 6], [2, 2, 4]]),
         ['O1', 'O3'],
         ['S4', 'S2', 'S5'])
     observed = merge([first, second], 'sum')
     expected_counts = np.array([[0, 3, 3, 0, 6],
                                 [1, 1, 2, 0, 0],
                                 [0, 2, 0, 2, 4]])
     expected = Table(expected_counts,
                      ['O1', 'O2', 'O3'],
                      ['S1', 'S2', 'S3', 'S4', 'S5'])
     self.assertEqual(observed, expected)
Ejemplo n.º 16
0
 def test_valid_overlapping_sample_ids(self):
     # Sample 'S1' is shared but the feature ids are disjoint, so
     # 'error_on_overlapping_feature' is expected to succeed and return
     # the union of features and samples.
     first = Table(
         np.array([[0, 1, 3], [1, 1, 2]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     second = Table(
         np.array([[0, 2, 6], [2, 2, 4]]),
         ['O3', 'O4'],
         ['S1', 'S5', 'S6'])
     observed = merge([first, second], 'error_on_overlapping_feature')
     expected_counts = np.array([[0, 1, 3, 0, 0],
                                 [1, 1, 2, 0, 0],
                                 [0, 0, 0, 2, 6],
                                 [2, 0, 0, 2, 4]])
     expected = Table(expected_counts,
                      ['O1', 'O2', 'O3', 'O4'],
                      ['S1', 'S2', 'S3', 'S5', 'S6'])
     self.assertEqual(observed, expected)
Ejemplo n.º 17
0
 def test_sum_full_overlap(self):
     # Identical feature and sample ids in both inputs: the expected
     # 'sum' result adds the two count matrices cell by cell.
     left_counts = np.array([[0, 1, 3], [1, 1, 2]])
     left = Table(left_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     right_counts = np.array([[0, 2, 6], [2, 2, 4]])
     right = Table(right_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     observed = merge([left, right], 'sum')
     summed_counts = np.array([[0, 3, 9], [3, 3, 6]])
     expected = Table(summed_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     self.assertEqual(observed, expected)
Ejemplo n.º 18
0
 def test_sum_overlapping_feature_ids(self):
     # Feature 'O1' is shared while the sample ids are disjoint.
     # This should produce the same result as `error_on_overlapping_sample`
     first = Table(
         np.array([[0, 1, 3], [1, 1, 2]]),
         ['O1', 'O2'],
         ['S1', 'S2', 'S3'])
     second = Table(
         np.array([[0, 2, 6], [2, 2, 4]]),
         ['O1', 'O3'],
         ['S4', 'S5', 'S6'])
     observed = merge([first, second], 'sum')
     expected_counts = np.array([[0, 1, 3, 0, 2, 6],
                                 [1, 1, 2, 0, 0, 0],
                                 [0, 0, 0, 2, 2, 4]])
     expected = Table(expected_counts,
                      ['O1', 'O2', 'O3'],
                      ['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])
     self.assertEqual(observed, expected)
Ejemplo n.º 19
0
 def test_valid_overlapping_sample_ids(self):
     # The two tables have sample 'S1' in common but no common feature
     # id, so 'error_on_overlapping_feature' is expected to merge them.
     left_counts = np.array([[0, 1, 3], [1, 1, 2]])
     left = Table(left_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     right_counts = np.array([[0, 2, 6], [2, 2, 4]])
     right = Table(right_counts, ['O3', 'O4'], ['S1', 'S5', 'S6'])
     observed = merge([left, right], 'error_on_overlapping_feature')
     merged_counts = np.array([
         [0, 1, 3, 0, 0],
         [1, 1, 2, 0, 0],
         [0, 0, 0, 2, 6],
         [2, 0, 0, 2, 4]])
     merged_features = ['O1', 'O2', 'O3', 'O4']
     merged_samples = ['S1', 'S2', 'S3', 'S5', 'S6']
     expected = Table(merged_counts, merged_features, merged_samples)
     self.assertEqual(observed, expected)
Ejemplo n.º 20
0
 def test_sum_overlapping_feature_ids(self):
     # Only feature 'O1' is shared; no sample id appears in both tables.
     # This should produce the same result as `error_on_overlapping_sample`
     left_counts = np.array([[0, 1, 3], [1, 1, 2]])
     left = Table(left_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     right_counts = np.array([[0, 2, 6], [2, 2, 4]])
     right = Table(right_counts, ['O1', 'O3'], ['S4', 'S5', 'S6'])
     observed = merge([left, right], 'sum')
     merged_counts = np.array([
         [0, 1, 3, 0, 2, 6],
         [1, 1, 2, 0, 0, 0],
         [0, 0, 0, 2, 2, 4]])
     merged_features = ['O1', 'O2', 'O3']
     merged_samples = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6']
     expected = Table(merged_counts, merged_features, merged_samples)
     self.assertEqual(observed, expected)
Ejemplo n.º 21
0
 def test_sum_some_overlap(self):
     # Feature 'O1' and sample 'S2' are the only ids common to both
     # tables; the expected result sums the shared cell (1 + 2 = 3) and
     # holds 0 for feature/sample pairs absent from both inputs.
     left_counts = np.array([[0, 1, 3], [1, 1, 2]])
     left = Table(left_counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
     right_counts = np.array([[0, 2, 6], [2, 2, 4]])
     right = Table(right_counts, ['O1', 'O3'], ['S4', 'S2', 'S5'])
     observed = merge([left, right], 'sum')
     merged_counts = np.array([
         [0, 3, 3, 0, 6],
         [1, 1, 2, 0, 0],
         [0, 2, 0, 2, 4]])
     merged_features = ['O1', 'O2', 'O3']
     merged_samples = ['S1', 'S2', 'S3', 'S4', 'S5']
     expected = Table(merged_counts, merged_features, merged_samples)
     self.assertEqual(observed, expected)
Ejemplo n.º 22
0
def make_hierarchy(
        csi_results: CSIDirFmt,
        feature_tables: biom.Table,
        ms2_matches: pd.DataFrame = None,
        qc_properties: bool = False,
        metric: str = 'euclidean') -> (TreeNode, biom.Table, pd.DataFrame):
    '''
    Build a hierarchy of mass-spec features from predicted chemical
    fingerprints. Each feature table is filtered to retain only the
    features that have fingerprints, and each feature is relabeled with a
    hash (MD5) of its binary fingerprint vector.

    Parameters
    ----------
    csi_results : CSIDirFmt
        one or more CSI:FingerID output folders
    feature_tables : biom.Table
        one or more feature tables with mass-spec feature intensity per
        sample; must pair one-to-one, in order, with ``csi_results``
    ms2_matches : pd.DataFrame, optional
        one or more tables with MS/MS library match for mass-spec features;
        when given, there must be one per feature table and each must
        contain the column `Smiles`
    qc_properties : bool, default False
        flag to filter molecular properties to keep only PUBCHEM fingerprints
    metric : str, default `euclidean`
        metric for hierarchical clustering of fingerprints

    Raises
    ------
    ValueError
        If ``feature_tables`` and ``csi_results`` differ in length
        If ``ms2_matches`` is given and its length differs from that of
        ``feature_tables``
        If any feature table is empty
        If an MS2 match table lacks the column `Smiles`
        If collated fingerprint table is empty (not raised directly here;
        presumably comes from the fingerprint helpers)

    Returns
    -------
    skbio.TreeNode
        a tree of relatedness of molecules
    biom.Table
        merged feature table that is filtered to contain only the
        features present in the tree; indexed by the MD5 hash of
        fingerprint vectors of mass-spec features
    pd.DataFrame
        merged feature data; indexed by the MD5 hash of the fingerprint
        vectors of mass-spec features
    '''
    if len(feature_tables) != len(csi_results):
        raise ValueError("The feature tables and CSI results should have a "
                         "one-to-one correspondance.")
    if ms2_matches and len(ms2_matches) != len(feature_tables):
        raise ValueError("The MS2 match tables should have a one-to-one "
                         "correspondance with feature tables and CSI results.")

    relabeled_fingerprints = []
    matched_tables = []
    per_table_fdata = []
    paired_inputs = zip(feature_tables, csi_results)
    for position, (table, csi_dir) in enumerate(paired_inputs):
        if table.is_empty():
            raise ValueError("Cannot have empty feature table")
        if not ms2_matches:
            # No library matches supplied: process the CSI results alone.
            collated, smiles = process_csi_results(csi_dir, None,
                                                   qc_properties, metric)
        else:
            # Library matches are paired with the feature tables by position.
            library_match = ms2_matches[position]
            if 'Smiles' not in library_match.columns:
                raise ValueError("MS2 match tables must contain the "
                                 "column `Smiles`")
            collated, smiles = process_csi_results(csi_dir,
                                                   library_match,
                                                   qc_properties,
                                                   metric=metric)
        relabeled, matched, table_fdata = get_matched_tables(
            collated, smiles, table)
        relabeled_fingerprints.append(relabeled)
        matched_tables.append(matched)
        per_table_fdata.append(table_fdata)

    merged_fdata = merge_feature_data(per_table_fdata)

    # The same fingerprint label can occur in several inputs; keep only the
    # first row seen for each label before building the tree.
    all_fingerprints = pd.concat(relabeled_fingerprints)
    is_repeat = all_fingerprints.index.duplicated(keep='first')
    unique_fingerprints = all_fingerprints[~is_repeat]

    merged_table = merge(matched_tables,
                         overlap_method='error_on_overlapping_sample')
    tree = build_tree(unique_fingerprints, metric)
    return tree, merged_table, merged_fdata