def test_from_file_error(self):
        # Each group of malformed fixture files is paired with the exception
        # type that OrdinationResults.from_file has to raise for it. The
        # groups are checked in the order listed.
        expectations = (
            (self.fferror_test_paths, FileFormatError),
            (self.verror_test_paths, ValueError),
        )
        for bad_paths, error_type in expectations:
            for bad_path in bad_paths:
                with open(get_data_path(bad_path), 'U') as handle:
                    with npt.assert_raises(error_type):
                        OrdinationResults.from_file(handle)
    def test_from_file_error(self):
        # Each group of malformed fixture files is paired with the exception
        # type that OrdinationResults.from_file has to raise for it. The
        # groups are checked in the order listed.
        expectations = (
            (self.fferror_test_paths, FileFormatError),
            (self.verror_test_paths, ValueError),
        )
        for bad_paths, error_type in expectations:
            for bad_path in bad_paths:
                with open(get_data_path(bad_path), 'U') as handle:
                    with npt.assert_raises(error_type):
                        OrdinationResults.from_file(handle)
    def test_from_file(self):
        # Generator test: every fixture is parsed twice, first through an
        # open file handle and then straight from its path, and each parsed
        # result is yielded together with the expected scores for checking.
        for expected, path_name in zip(self.scores, self.test_paths):
            for use_handle in (True, False):
                source = get_data_path(path_name)
                if use_handle:
                    with open(source) as handle:
                        observed = OrdinationResults.from_file(handle)
                else:
                    observed = OrdinationResults.from_file(source)

                yield self.check_OrdinationResults_equal, observed, expected
    def test_from_file(self):
        # Generator test: every fixture is parsed twice, first through an
        # open file handle and then straight from its path, and each parsed
        # result is yielded together with the expected scores for checking.
        for expected, path_name in zip(self.scores, self.test_paths):
            for use_handle in (True, False):
                source = get_data_path(path_name)
                if use_handle:
                    with open(source) as handle:
                        observed = OrdinationResults.from_file(handle)
                else:
                    observed = OrdinationResults.from_file(source)

                yield self.check_OrdinationResults_equal, observed, expected
    def test_get_procrustes_results(self):
        # Regression-style sanity check: the individual components are tested
        # elsewhere, so the expected values below were taken from the output
        # of a run and only guard against that output changing.
        id_lookup = {
            'CP3A1': 'S1',
            'CC1A1': 'S2',
            'CC2A1': 'S3',
            'CP1A1': 'S4'
        }
        # The same PCoA input is supplied for both sides of the comparison.
        results = get_procrustes_results(self.pcoa1_f,
                                         self.pcoa1_f,
                                         sample_id_map=id_lookup,
                                         randomize=None,
                                         max_dimensions=None)

        reference = OrdinationResults(
            eigvals=array([
                8976580.24393, 6044862.67619, 4372581.39431,
                3161360.10319, 2583594.45275, 2407555.39787
            ]),
            proportion_explained=array([
                23.1764657118, 15.6071186064, 11.2894866423,
                8.16225689998, 6.67053450426, 6.21602253997
            ]),
            site=array([
                [-0.199225958574, -0.250846540029, -0.119813087305,
                 -0.155652031006, 0.18495315824, -0.160875399364],
                [-0.238263544222, -0.37724227779, -0.169458651217,
                 0.0305157004776, 0.112181007345, 0.0677415967093],
                [0.116737988534, 0.414627960015, 0.201315243115,
                 0.113769076804, -0.283025353088, -0.144278863311],
                [0.320751514262, 0.213460857804, 0.0879564954067,
                 0.0113672537238, -0.0141088124974, 0.237412665966]
            ]),
            site_ids=['S3', 'S2', 'S1', 'S4'])

        # First transformed ordination
        first = results[0]
        assert_almost_equal(first.eigvals, reference.eigvals)
        assert_almost_equal(first.proportion_explained,
                            reference.proportion_explained)
        self.assertEqual(first.site_ids, reference.site_ids)
        assert_almost_equal(first.site, reference.site)

        # Second transformed ordination
        second = results[1]
        assert_almost_equal(second.eigvals, reference.eigvals)
        assert_almost_equal(second.proportion_explained,
                            reference.proportion_explained)
        assert_almost_equal(second.site, reference.site)
        self.assertEqual(second.site_ids, reference.site_ids)

        # Third element of the result must stay below this tiny threshold
        self.assertTrue(results[2] < 6e-30)
# Example #6
# 0
    def setUp(self):
        # Ordination results parsed from the PCOA_STRING fixture through an
        # in-memory file object
        self.ord_res = OrdinationResults.from_file(StringIO(PCOA_STRING))

        # Tabular fixture: column names, then one row per sample
        self.headers = ['SampleID', 'Treatment', 'DOB', 'Description']
        self.data = [
            ['PC.354', 'Control', '20061218', 'Ctrol_mouse_I.D._354'],
            ['PC.355', 'Control', '20061218', 'Control_mouse_I.D._355'],
            ['PC.356', 'Control', '20061126', 'Control_mouse_I.D._356'],
            ['PC.481', 'Control', '20070314', 'Control_mouse_I.D._481'],
            ['PC.593', 'Control', '20071210', 'Control_mouse_I.D._593'],
            ['PC.607', 'Fast', '20071112', 'Fasting_mouse_I.D._607'],
            ['PC.634', 'Fast', '20080116', 'Fasting_mouse_I.D._634'],
            ['PC.635', 'Fast', '20080116', 'Fasting_mouse_I.D._635'],
            ['PC.636', 'Fast', '20080116', 'Fasting_mouse_I.D._636'],
        ]
# Example #7
# 0
def parse_coords(lines):
    """Read an skbio ordination results file and unpack its main fields.

    All parsing is delegated to skbio's own reader; see
    skbio.math.stats.ordination.OrdinationResults.from_file
    for a description of the file format.

    Parameters:
    - lines: the input handed unchanged to OrdinationResults.from_file

    Returns a 4-tuple, in this order:
    - sample labels (the parsed ``site_ids``)
    - coordinates (the parsed ``site``; rows = samples, cols = axes in
      descending order)
    - eigenvalues (the parsed ``eigvals``)
    - percent variance explained (the parsed ``proportion_explained``)
    """
    ordination = OrdinationResults.from_file(lines)

    sample_labels = ordination.site_ids
    coordinates = ordination.site
    eigenvalues = ordination.eigvals
    percent_explained = ordination.proportion_explained

    return sample_labels, coordinates, eigenvalues, percent_explained
# Example #8
# 0
def parse_coords(lines):
    """Read an skbio ordination results file and unpack its main fields.

    All parsing is delegated to skbio's own reader; see
    skbio.math.stats.ordination.OrdinationResults.from_file
    for a description of the file format.

    Parameters:
    - lines: the input handed unchanged to OrdinationResults.from_file

    Returns a 4-tuple, in this order:
    - sample labels (the parsed ``site_ids``)
    - coordinates (the parsed ``site``; rows = samples, cols = axes in
      descending order)
    - eigenvalues (the parsed ``eigvals``)
    - percent variance explained (the parsed ``proportion_explained``)
    """
    ordination = OrdinationResults.from_file(lines)

    sample_labels = ordination.site_ids
    coordinates = ordination.site
    eigenvalues = ordination.eigvals
    percent_explained = ordination.proportion_explained

    return sample_labels, coordinates, eigenvalues, percent_explained
if __name__ == '__main__':
    # Script entry point: read the command-line options, load the ordination
    # results and the mapping file, then validate the requested categories.
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ord_fp = opts.input_fp
    mapping_fp = opts.map_fp
    # Categories arrive as a single comma-separated string
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results
    # NOTE(review): the 'U' open mode was removed in Python 3.11 -- confirm
    # which interpreter versions this script has to support.
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.from_file(f)

    # Parse the mapping file; only the first element of the parser's return
    # value is used, and it becomes a DataFrame with one row per outer key
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')

    # Every requested category has to be present in the mapping file
    for category in categories:
        if category not in metamap.keys():
            # Report the single offending category rather than the whole
            # list of requested categories
            option_parser.error("Category %s does not exist in the mapping "
                                "file" % category)

    # Sorting by 'SampleID' leaves sort_category as None
    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            sort_category = None
if __name__ == '__main__':
    # Script entry point: read the command-line options, load the ordination
    # results and the mapping file, then validate the requested categories.
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ord_fp = opts.input_fp
    mapping_fp = opts.map_fp
    # Categories arrive as a single comma-separated string
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results
    # NOTE(review): the 'U' open mode was removed in Python 3.11 -- confirm
    # which interpreter versions this script has to support.
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.from_file(f)

    # Parse the mapping file; only the first element of the parser's return
    # value is used, and it becomes a DataFrame with one row per outer key
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')

    # Every requested category has to be present in the mapping file
    for category in categories:
        if category not in metamap.keys():
            # Report the single offending category rather than the whole
            # list of requested categories
            option_parser.error("Category %s does not exist in the mapping "
                                "file" % category)

    # Sorting by 'SampleID' leaves sort_category as None
    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            sort_category = None
    def setup_class(cls):
        # Builds the expected OrdinationResults object for each fixture file
        # and stores them, together with the fixture file names, on the
        # class. Larger matrices are loaded from data files with np.loadtxt.

        # CA results
        ca_expected = OrdinationResults(
            eigvals=np.array([0.0961330159181, 0.0409418140138]),
            species=np.array([[0.408869425742, 0.0695518116298],
                              [-0.1153860437, -0.299767683538],
                              [-0.309967102571, 0.187391917117]]),
            site=np.array([[-0.848956053187, 0.882764759014],
                           [-0.220458650578, -1.34482000302],
                           [1.66697179591, 0.470324389808]]),
            biplot=None,
            site_constraints=None,
            proportion_explained=None,
            species_ids=['Species1', 'Species2', 'Species3'],
            site_ids=['Site1', 'Site2', 'Site3'])

        # CCA results
        cca_expected = OrdinationResults(
            eigvals=np.array([
                0.366135830393, 0.186887643052, 0.0788466514249,
                0.082287840501, 0.0351348475787, 0.0233265839374,
                0.0099048981912, 0.00122461669234, 0.000417454724117
            ]),
            species=np.loadtxt(get_data_path('exp_OrdRes_CCA_species')),
            site=np.loadtxt(get_data_path('exp_OrdRes_CCA_site')),
            biplot=np.array(
                [[-0.169746767979, 0.63069090084, 0.760769036049],
                 [-0.994016563505, 0.0609533148724, -0.0449369418179],
                 [0.184352565909, -0.974867543612, 0.0309865007541]]),
            site_constraints=np.loadtxt(
                get_data_path('exp_OrdRes_CCA_site_constraints')),
            proportion_explained=None,
            species_ids=[
                'Species0', 'Species1', 'Species2', 'Species3', 'Species4',
                'Species5', 'Species6', 'Species7', 'Species8'
            ],
            site_ids=[
                'Site0', 'Site1', 'Site2', 'Site3', 'Site4', 'Site5',
                'Site6', 'Site7', 'Site8', 'Site9'
            ])

        # PCoA results
        pcoa_expected = OrdinationResults(
            eigvals=np.array([
                0.512367260461, 0.300719094427, 0.267912066004,
                0.208988681078, 0.19169895326, 0.16054234528,
                0.15017695712, 0.122457748167, 0.0
            ]),
            species=None,
            site=np.loadtxt(get_data_path('exp_OrdRes_PCoA_site')),
            biplot=None,
            site_constraints=None,
            proportion_explained=np.array([
                0.267573832777, 0.15704469605, 0.139911863774,
                0.109140272454, 0.100111048503, 0.0838401161912,
                0.0784269939011, 0.0639511763509, 0.0
            ]),
            species_ids=None,
            site_ids=[
                'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
                'PC.355', 'PC.607', 'PC.634'
            ])

        # RDA results
        rda_expected = OrdinationResults(
            eigvals=np.array([
                25.8979540892, 14.9825779819, 8.93784077262, 6.13995623072,
                1.68070536498, 0.57735026919, 0.275983624351
            ]),
            species=np.loadtxt(get_data_path('exp_OrdRes_RDA_species')),
            site=np.loadtxt(get_data_path('exp_OrdRes_RDA_site')),
            biplot=np.array(
                [[0.422650019179, -0.559142585857, -0.713250678211],
                 [0.988495963777, 0.150787422017, -0.0117848614073],
                 [-0.556516618887, 0.817599992718, 0.147714267459],
                 [-0.404079676685, -0.9058434809, -0.127150316558]]),
            site_constraints=np.loadtxt(
                get_data_path('exp_OrdRes_RDA_site_constraints')),
            proportion_explained=None,
            species_ids=[
                'Species0', 'Species1', 'Species2', 'Species3', 'Species4',
                'Species5'
            ],
            site_ids=[
                'Site0', 'Site1', 'Site2', 'Site3', 'Site4', 'Site5',
                'Site6', 'Site7', 'Site8', 'Site9'
            ])

        # Expected objects and fixture names are index-aligned: the tests
        # walk the two lists in lockstep.
        cls.scores = [ca_expected, cca_expected, pcoa_expected, rda_expected]
        cls.test_paths = [
            'L&L_CA_data_scores', 'example3_scores',
            'PCoA_sample_data_3_scores', 'example2_scores'
        ]

        # Fixtures expected to raise FileFormatError when parsed
        cls.fferror_test_paths = [
            'error1', 'error2', 'error3', 'error4', 'error5', 'error6'
        ]
        # Fixtures expected to raise ValueError when parsed
        cls.verror_test_paths = [
            'v_error1', 'v_error2', 'v_error3', 'v_error4', 'v_error5',
            'v_error6', 'v_error7', 'v_error8', 'v_error9', 'v_error10'
        ]
def get_procrustes_results(coords_f1, coords_f2, sample_id_map=None,
                           randomize=None, max_dimensions=None,
                           get_eigenvalues=get_mean_eigenvalues,
                           get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two ordination results files.

    Parameters:
    - coords_f1, coords_f2: inputs handed to OrdinationResults.from_file
    - sample_id_map: if given (and non-empty), the sample ids of both files
      are translated with map_sample_ids before they are compared
    - randomize: optional callable applied to the reordered coordinates of
      the second file before the analysis is run
    - max_dimensions: if given (and non-zero), both coordinate matrices are
      passed through filter_coords_matrix and the eigenvalues and
      proportions explained are truncated to that many entries; otherwise
      the shorter eigenvalue / proportion arrays are zero-padded to the
      length of the longer ones
    - get_eigenvalues: callable combining the two eigenvalue arrays into the
      eigenvalues stored in both results
    - get_percent_variation_explained: callable combining the two
      proportion-explained arrays in the same way

    Returns a 4-tuple:
    - OrdinationResults holding the transformed coordinates of the first file
    - OrdinationResults holding the transformed coordinates of the second file
    - the third value returned by procrustes (m_squared)
    - OrdinationResults holding the randomized coordinates of the second
      file, or None when no randomize callable was supplied

    Raises:
    - ValueError if the two files have no sample ids in common
    """
    # Parse the PCoA files
    ord_res_1 = OrdinationResults.from_file(coords_f1)
    ord_res_2 = OrdinationResults.from_file(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # Samples shared by both files. NOTE: this comes from a set
    # intersection, so the resulting order is whatever set iteration yields
    # rather than the order of either input.
    order = list(set(sample_ids1) & set(sample_ids2))
    # Fail fast, before any reordering work is done on the coordinates
    if not order:
        raise ValueError('No overlapping samples in the two files')

    # Rearrange both coordinate matrices to the common sample order
    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(eigvals=eigvals2,
                                               proportion_explained=pct_var2,
                                               site=coords2,
                                               site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        # Zero-pad the shorter side so both files contribute arrays of the
        # same length to the combining callables below
        if len(pct_var1) > len(pct_var2):
            pct_var2 = append(pct_var2, zeros(len(pct_var1) - len(pct_var2)))
            eigvals2 = append(eigvals2, zeros(len(eigvals1) - len(eigvals2)))
        elif len(pct_var1) < len(pct_var2):
            pct_var1 = append(pct_var1, zeros(len(pct_var2) - len(pct_var1)))
            eigvals1 = append(eigvals1, zeros(len(eigvals2) - len(eigvals1)))

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared =\
        procrustes(coords1, coords2)

    # Both results share the same combined eigenvalues and proportions
    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m1),
                                            site_ids=order)
    transformed_coords2 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m2),
                                            site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2,
            m_squared, randomized_coords2)
 def setUp(self):
     # Fixtures for the tests: an OrdinationResults object built from
     # literal values, a metadata table indexed by sample id, and the
     # category / sort settings used with them.
     self.ord_res = OrdinationResults(
         eigvals=np.array([
             0.512367260461, 0.300719094427, 0.267912066004,
             0.208988681078, 0.19169895326, 0.16054234528, 0.15017695712,
             0.122457748167, 0.0
         ]),
         site=np.array([
             [-0.212230626531, 0.216034194368, 0.03532727349,
              -0.254450494129, -0.0687468542543, 0.231895596562,
              0.00496549154314, -0.0026246871695, 9.73837390723e-10],
             [-0.277487312135, -0.0295483215975, -0.0744173437992,
              0.0957182357964, 0.204714844022, -0.0055407341857,
              -0.190287966833, 0.16307126638, 9.73837390723e-10],
             [0.220886492631, 0.0874848360559, -0.351990132198,
              -0.00316535032886, 0.114635191853, -0.00019194106125,
              0.188557853937, 0.030002427212, 9.73837390723e-10],
             [0.0308923744062, -0.0446295973489, 0.133996451689,
              0.29318228566, -0.167812539312, 0.130996149793,
              0.113551017379, 0.109987942454, 9.73837390723e-10],
             [0.27616778138, -0.0341866951102, 0.0633000238256,
              0.100446653327, 0.123802521199, 0.1285839664,
              -0.132852841046, -0.217514322505, 9.73837390723e-10],
             [0.202458130052, -0.115216120518, 0.301820871723,
              -0.18300251046, 0.136208248567, -0.0989435556722,
              0.0927738484879, 0.0909429797672, 9.73837390723e-10],
             [0.236467470907, 0.21863434374, -0.0301637746424,
              -0.0225473129718, -0.205287183891, -0.180224615141,
              -0.165277751908, 0.0411933458557, 9.73837390723e-10],
             [-0.105517545144, -0.41405687433, -0.150073017617,
              -0.116066751485, -0.158763393475, -0.0223918378516,
              -0.0263068046112, -0.0501209518091, 9.73837390723e-10],
             [-0.371636765565, 0.115484234741, 0.0721996475289,
              0.0898852445906, 0.0212491652909, -0.184183028843,
              0.114877153051, -0.164938000185, 9.73837390723e-10],
         ]),
         proportion_explained=np.array([
             25.6216900347, 15.7715955926, 14.1215046787, 11.6913885817,
             9.83044890697, 8.51253468595, 7.88775505332, 6.56308246609,
             4.42499350906e-16
         ]),
         site_ids=[
             'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
             'PC.355', 'PC.607', 'PC.634'
         ])

     # One entry per sample id; all values are kept as strings
     sample_metadata = {
         'PC.354': {'Treatment': 'Control', 'DOB': '20061218',
                    'Weight': '60',
                    'Description': 'Control_mouse_I.D._354'},
         'PC.355': {'Treatment': 'Control', 'DOB': '20061218',
                    'Weight': '55',
                    'Description': 'Control_mouse_I.D._355'},
         'PC.356': {'Treatment': 'Control', 'DOB': '20061126',
                    'Weight': '50',
                    'Description': 'Control_mouse_I.D._356'},
         'PC.481': {'Treatment': 'Control', 'DOB': '20070314',
                    'Weight': '52',
                    'Description': 'Control_mouse_I.D._481'},
         'PC.593': {'Treatment': 'Control', 'DOB': '20071210',
                    'Weight': '57',
                    'Description': 'Control_mouse_I.D._593'},
         'PC.607': {'Treatment': 'Fast', 'DOB': '20071112',
                    'Weight': '65',
                    'Description': 'Fasting_mouse_I.D._607'},
         'PC.634': {'Treatment': 'Fast', 'DOB': '20080116',
                    'Weight': '68',
                    'Description': 'Fasting_mouse_I.D._634'},
         'PC.635': {'Treatment': 'Fast', 'DOB': '20080116',
                    'Weight': '70',
                    'Description': 'Fasting_mouse_I.D._635'},
         'PC.636': {'Treatment': 'Fast', 'DOB': '20080116',
                    'Weight': '72',
                    'Description': 'Fasting_mouse_I.D._636'},
     }
     # orient='index' turns each outer key into a row of the table
     self.metadata_map = pd.DataFrame.from_dict(sample_metadata,
                                                orient='index')
     self.categories = ['Treatment']
     self.sort_by = 'Weight'
def get_procrustes_results(
        coords_f1,
        coords_f2,
        sample_id_map=None,
        randomize=None,
        max_dimensions=None,
        get_eigenvalues=get_mean_eigenvalues,
        get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two ordination results files.

    Parameters:
    - coords_f1, coords_f2: inputs handed to OrdinationResults.from_file
    - sample_id_map: if given (and non-empty), the sample ids of both files
      are translated with map_sample_ids before they are compared
    - randomize: optional callable applied to the reordered coordinates of
      the second file before the analysis is run
    - max_dimensions: if given (and non-zero), both coordinate matrices are
      passed through filter_coords_matrix and the eigenvalues and
      proportions explained are truncated to that many entries; otherwise
      the shorter eigenvalue / proportion arrays are zero-padded to the
      length of the longer ones
    - get_eigenvalues: callable combining the two eigenvalue arrays into the
      eigenvalues stored in both results
    - get_percent_variation_explained: callable combining the two
      proportion-explained arrays in the same way

    Returns a 4-tuple:
    - OrdinationResults holding the transformed coordinates of the first file
    - OrdinationResults holding the transformed coordinates of the second file
    - the third value returned by procrustes (m_squared)
    - OrdinationResults holding the randomized coordinates of the second
      file, or None when no randomize callable was supplied

    Raises:
    - ValueError if the two files have no sample ids in common
    """
    # Parse the PCoA files
    ord_res_1 = OrdinationResults.from_file(coords_f1)
    ord_res_2 = OrdinationResults.from_file(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # Samples shared by both files. NOTE: this comes from a set
    # intersection, so the resulting order is whatever set iteration yields
    # rather than the order of either input.
    order = list(set(sample_ids1) & set(sample_ids2))
    # Fail fast, before any reordering work is done on the coordinates
    if not order:
        raise ValueError('No overlapping samples in the two files')

    # Rearrange both coordinate matrices to the common sample order
    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(eigvals=eigvals2,
                                               proportion_explained=pct_var2,
                                               site=coords2,
                                               site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        # Zero-pad the shorter side so both files contribute arrays of the
        # same length to the combining callables below
        if len(pct_var1) > len(pct_var2):
            pct_var2 = append(pct_var2, zeros(len(pct_var1) - len(pct_var2)))
            eigvals2 = append(eigvals2, zeros(len(eigvals1) - len(eigvals2)))
        elif len(pct_var1) < len(pct_var2):
            pct_var1 = append(pct_var1, zeros(len(pct_var2) - len(pct_var1)))
            eigvals1 = append(eigvals1, zeros(len(eigvals2) - len(eigvals1)))

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared =\
        procrustes(coords1, coords2)

    # Both results share the same combined eigenvalues and proportions
    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = OrdinationResults(
        eigvals=asarray(eigvals),
        proportion_explained=asarray(pct_var),
        site=asarray(transformed_coords_m1),
        site_ids=order)
    transformed_coords2 = OrdinationResults(
        eigvals=asarray(eigvals),
        proportion_explained=asarray(pct_var),
        site=asarray(transformed_coords_m2),
        site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2, m_squared,
            randomized_coords2)