def test_get_procrustes_results(self):
        # Regression check: fit pcoa1_f against itself and compare both
        # transformed results with values recorded from an earlier run.
        # The individual components are already tested elsewhere, so this
        # only guards against the combined output changing.
        id_map = {"CP3A1": "S1", "CC1A1": "S2", "CC2A1": "S3", "CP1A1": "S4"}
        results = get_procrustes_results(
            StringIO(pcoa1_f),
            StringIO(pcoa1_f),
            sample_id_map=id_map,
            randomize=None,
            max_dimensions=None,
        )

        expected = OrdinationResults(
            eigvals=array(
                [8976580.24393, 6044862.67619, 4372581.39431, 3161360.10319, 2583594.45275, 2407555.39787]
            ),
            proportion_explained=array(
                [23.1764657118, 15.6071186064, 11.2894866423, 8.16225689998, 6.67053450426, 6.21602253997]
            ),
            site=array(
                [
                    [-0.199225958574, -0.250846540029, -0.119813087305, -0.155652031006, 0.18495315824, -0.160875399364],
                    [-0.238263544222, -0.37724227779, -0.169458651217, 0.0305157004776, 0.112181007345, 0.0677415967093],
                    [0.116737988534, 0.414627960015, 0.201315243115, 0.113769076804, -0.283025353088, -0.144278863311],
                    [0.320751514262, 0.213460857804, 0.0879564954067, 0.0113672537238, -0.0141088124974, 0.237412665966],
                ]
            ),
            site_ids=["S3", "S2", "S1", "S4"],
        )

        # The same file is used for both inputs, so both transformed results
        # are held to the same expectation.
        for observed in (results[0], results[1]):
            assert_almost_equal(observed.eigvals, expected.eigvals)
            assert_almost_equal(observed.proportion_explained, expected.proportion_explained)
            self.assertEqual(observed.site_ids, expected.site_ids)
            assert_almost_equal(observed.site, expected.site)

        # The third element of the result must be effectively zero.
        self.assertTrue(results[2] < 6e-30)
Esempio n. 2
0
    def test_get_procrustes_results(self):
        # Sanity check only: the individual components are already tested,
        # so the expected numbers below were taken from the output of a
        # run and exist to detect that the output has changed.
        id_map = {'CP3A1': 'S1', 'CC1A1': 'S2', 'CC2A1': 'S3', 'CP1A1': 'S4'}
        results = get_procrustes_results(
            StringIO(pcoa1_f),
            StringIO(pcoa1_f),
            sample_id_map=id_map,
            randomize=None,
            max_dimensions=None)

        # Expected site coordinates, one row per entry of expected_ids.
        row_s3 = [-0.199225958574, -0.250846540029, -0.119813087305,
                  -0.155652031006, 0.18495315824, -0.160875399364]
        row_s2 = [-0.238263544222, -0.37724227779, -0.169458651217,
                  0.0305157004776, 0.112181007345, 0.0677415967093]
        row_s1 = [0.116737988534, 0.414627960015, 0.201315243115,
                  0.113769076804, -0.283025353088, -0.144278863311]
        row_s4 = [0.320751514262, 0.213460857804, 0.0879564954067,
                  0.0113672537238, -0.0141088124974, 0.237412665966]
        expected_ids = ['S3', 'S2', 'S1', 'S4']

        expected = OrdinationResults(
            eigvals=array([8976580.24393, 6044862.67619, 4372581.39431,
                           3161360.10319, 2583594.45275, 2407555.39787]),
            proportion_explained=array([23.1764657118, 15.6071186064,
                                        11.2894866423, 8.16225689998,
                                        6.67053450426, 6.21602253997]),
            site=array([row_s3, row_s2, row_s1, row_s4]),
            site_ids=expected_ids)

        # Both inputs are the same file, so each transformed result is
        # compared against the same expected ordination.
        for observed in (results[0], results[1]):
            assert_almost_equal(observed.eigvals, expected.eigvals)
            assert_almost_equal(observed.proportion_explained,
                                expected.proportion_explained)
            self.assertEqual(observed.site_ids, expected.site_ids)
            assert_almost_equal(observed.site, expected.site)

        # The third element of the result must be effectively zero.
        self.assertTrue(results[2] < 6e-30)
 def test_get_procrustes_results(self):
     # Sanity check only: the individual components are already tested,
     # so the expected text below was taken from the output of a run and
     # exists to detect that the output has changed.
     id_map = {'CP3A1': 'S1', 'CC1A1': 'S2', 'CC2A1': 'S3', 'CP1A1': 'S4'}
     results = get_procrustes_results(
         self.pcoa1_f,
         self.pcoa1_f,
         sample_id_map=id_map,
         randomize=None,
         max_dimensions=None)

     # Expected output text, written one output line per literal.
     expected_text = (
         'pc vector number\t1\t2\t3\t4\t5\t6\n'
         'S1\t0.116737988534\t0.414627960015\t0.201315243115\t0.113769076804\t-0.283025353088\t-0.144278863311\n'
         'S2\t-0.238263544222\t-0.37724227779\t-0.169458651217\t0.0305157004776\t0.112181007345\t0.0677415967093\n'
         'S3\t-0.199225958574\t-0.250846540029\t-0.119813087305\t-0.155652031006\t0.18495315824\t-0.160875399364\n'
         'S4\t0.320751514262\t0.213460857804\t0.0879564954067\t0.0113672537238\t-0.0141088124974\t0.237412665966\n'
         '\n'
         '\n'
         'eigvals\t8976580.24393\t6044862.67619\t4372581.39431\t3161360.10319\t2583594.45275\t2407555.39787\n'
         '% variation explained\t23.1764657118\t15.6071186064\t11.2894866423\t8.16225689998\t6.67053450426\t6.21602253997'
     )
     expected_lines = set(expected_text.split('\n'))

     # Lines are compared as sets, so their order in the output is ignored.
     # Both inputs are the same data, so both outputs share one expectation.
     for observed_text in (results[0], results[1]):
         self.assertEqual(set(observed_text.split('\n')), expected_lines)

     # The third element of the result must be effectively zero.
     self.assertTrue(results[2] < 6e-30)
Esempio n. 4
0
 def test_get_procrustes_results(self):
     # Regression check: fit self.pcoa1_f against itself and compare the
     # two text outputs with text recorded from an earlier run. The
     # individual components are already tested elsewhere.
     id_map = {'CP3A1': 'S1', 'CC1A1': 'S2', 'CC2A1': 'S3', 'CP1A1': 'S4'}
     results = get_procrustes_results(
         self.pcoa1_f,
         self.pcoa1_f,
         sample_id_map=id_map,
         randomize=None,
         max_dimensions=None)

     # Expected output text: header, one row per sample, two blank lines,
     # then the eigenvalue and percent-variation rows.
     header = 'pc vector number\t1\t2\t3\t4\t5\t6\n'
     sample_rows = (
         'S1\t0.116737988534\t0.414627960015\t0.201315243115\t0.113769076804\t-0.283025353088\t-0.144278863311\n'
         'S2\t-0.238263544222\t-0.37724227779\t-0.169458651217\t0.0305157004776\t0.112181007345\t0.0677415967093\n'
         'S3\t-0.199225958574\t-0.250846540029\t-0.119813087305\t-0.155652031006\t0.18495315824\t-0.160875399364\n'
         'S4\t0.320751514262\t0.213460857804\t0.0879564954067\t0.0113672537238\t-0.0141088124974\t0.237412665966\n'
     )
     footer = (
         '\n\n'
         'eigvals\t8976580.24393\t6044862.67619\t4372581.39431\t3161360.10319\t2583594.45275\t2407555.39787\n'
         '% variation explained\t23.1764657118\t15.6071186064\t11.2894866423\t8.16225689998\t6.67053450426\t6.21602253997'
     )
     expected_lines = set((header + sample_rows + footer).split('\n'))

     # Lines are compared as sets, so their order in the output is ignored.
     first_lines = set(results[0].split('\n'))
     second_lines = set(results[1].split('\n'))
     self.assertEqual(first_lines, expected_lines)
     self.assertEqual(second_lines, expected_lines)

     # The third element of the result must be effectively zero.
     self.assertTrue(results[2] < 6e-30)
 def test_get_procrustes_results_imprefect_sample_overlap(self):
     # Only the sample ids present in both inputs should show up in the
     # two transformed outputs.
     id_map = {'aaa': 'S0', 'bbb': 'S1', 'ccc': 'S2', 'ddd': 'S3', 'eee': 'S4'}
     results = get_procrustes_results(
         self.pcoa3_f,
         self.pcoa4_f,
         sample_id_map=id_map,
         randomize=None,
         max_dimensions=None)

     ids_expected_present = ('S1', 'S2', 'S3')
     ids_expected_absent = ('S0', 'S4')
     for output in results[:2]:
         for sample_id in ids_expected_present:
             self.assertTrue(sample_id in output)
         for sample_id in ids_expected_absent:
             self.assertTrue(sample_id not in output)
Esempio n. 6
0
 def test_get_procrustes_results_imprefect_sample_overlap(self):
     # The two inputs are fitted with a map covering S0-S4; the checks
     # below confirm that S1-S3 appear in both transformed outputs and
     # that S0 and S4 appear in neither.
     id_map = {'aaa': 'S0', 'bbb': 'S1', 'ccc': 'S2', 'ddd': 'S3', 'eee': 'S4'}
     results = get_procrustes_results(
         self.pcoa3_f,
         self.pcoa4_f,
         sample_id_map=id_map,
         randomize=None,
         max_dimensions=None)

     # (sample id, whether it must be present), in the order checked.
     expectations = (
         ('S1', True),
         ('S2', True),
         ('S3', True),
         ('S0', False),
         ('S4', False),
     )
     for output in results[:2]:
         for sample_id, should_be_present in expectations:
             if should_be_present:
                 self.assertTrue(sample_id in output)
             else:
                 self.assertTrue(sample_id not in output)
 def test_get_procrustes_results_imprefect_sample_overlap(self):
     # Only the sample ids present in both inputs should show up in the
     # site_ids of the two transformed results.
     id_map = {"aaa": "S0", "bbb": "S1", "ccc": "S2", "ddd": "S3", "eee": "S4"}
     results = get_procrustes_results(
         StringIO(pcoa3_f),
         StringIO(pcoa4_f),
         sample_id_map=id_map,
         randomize=None,
         max_dimensions=None,
     )

     ids_expected_present = ("S1", "S2", "S3")
     ids_expected_absent = ("S0", "S4")
     for ordination in results[:2]:
         for sample_id in ids_expected_present:
             self.assertTrue(sample_id in ordination.site_ids)
         for sample_id in ids_expected_absent:
             self.assertTrue(sample_id not in ordination.site_ids)
Esempio n. 8
0
def main():
    """Run Procrustes analysis of each query matrix against the reference.

    The first path in ``opts.input_fps`` is the reference coordinate matrix
    and every later path is a query. For each query, the transformed
    reference and query coordinates are written into ``opts.output_dir``
    and one line is added to ``procrustes_results.txt`` in that directory.
    When ``opts.random_trials`` is set, a Monte Carlo p-value is computed
    for the pair as well; otherwise those summary columns are ``NA``.

    Invalid option combinations are reported via ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error(
            'Must perform >= 10 trials for Monte Carlo analysis.')

    # Each query matrix needs its own sample id map.
    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(input_fps):
        option_parser.error('If providing sample id maps, there must be '
                            'exactly one fewer sample id maps than input '
                            'coordinate matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_basename = splitext(split(reference_input_fp)[1])[0]
    # The reference output path does not depend on the query, so it is
    # computed once; the file is rewritten for every query.
    output_matrix1_fp = join(
        output_dir,
        '%s_transformed_reference.txt' % reference_input_fp_basename)
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo '
        'p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently '
        'adjusted for multiple comparisons.'
    ]

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_basename = splitext(split(query_input_fp)[1])[0]
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            with open(sample_id_map_fps[i], 'U') as f:
                # Keep only the first field recorded for each key.
                sample_id_map = dict(
                    (k, v[0]) for k, v in fields_to_dict(f).items())
        else:
            sample_id_map = None

        with open(reference_input_fp, 'U') as ref_in_f:
            with open(query_input_fp, 'U') as query_in_f:
                transf_coords1, transf_coords2, m_squared, _rand_coords2 =\
                    get_procrustes_results(ref_in_f, query_in_f,
                                           sample_id_map=sample_id_map,
                                           randomize=False,
                                           max_dimensions=num_dimensions)

        transf_coords1.write(output_matrix1_fp)
        transf_coords2.write(output_matrix2_fp)

        if random_trials:
            if opts.store_trial_details:
                # Parenthesised on purpose: '%' binds tighter than '+', so
                # the unparenthesised form tried to add an int to a str.
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None
            # Open the inputs in 'with' blocks so they are closed even if
            # the Monte Carlo run raises.
            with open(reference_input_fp, 'U') as coords_f1:
                with open(query_input_fp, 'U') as coords_f2:
                    (actual_m_squared, _trial_m_squareds, count_better,
                     mc_p_value) = procrustes_monte_carlo(
                        coords_f1,
                        coords_f2,
                        trials=random_trials,
                        max_dimensions=num_dimensions,
                        sample_id_map=sample_id_map,
                        trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(
                mc_p_value, random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str, m_squared))

    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
def main():
    """Run Procrustes analysis of each query matrix against the reference.

    The first path in ``opts.input_fps`` is the reference coordinate matrix
    and every later path is a query. For each query, the transformed
    reference and query coordinates are written into ``opts.output_dir``
    and one line is added to ``procrustes_results.txt`` in that directory.
    When ``opts.random_trials`` is set, a Monte Carlo p-value is computed
    for the pair as well; otherwise those summary columns are ``NA``.

    Invalid option combinations are reported via ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error('Must perform >= 10 trials for Monte Carlo analysis.')

    # Each query matrix needs its own sample id map.
    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(input_fps):
        option_parser.error('If providing sample id maps, there must be exactly'
                            ' one fewer sample id maps than input coordinate'
                            ' matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_basename = splitext(split(reference_input_fp)[1])[0]
    # The reference output path does not depend on the query, so it is
    # computed once; the file is rewritten for every query.
    output_matrix1_fp = join(
        output_dir,
        '%s_transformed_reference.txt' % reference_input_fp_basename)
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently adjusted for multiple comparisons.',
    ]

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_basename = splitext(split(query_input_fp)[1])[0]
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            with open(sample_id_map_fps[i], 'U') as map_f:
                # Keep only the first field recorded for each key.
                sample_id_map = dict(
                    (k, v[0]) for k, v in fields_to_dict(map_f).items())
        else:
            sample_id_map = None

        # 'with' blocks guarantee the input files are closed, including
        # when get_procrustes_results raises.
        with open(reference_input_fp, 'U') as ref_in_f:
            with open(query_input_fp, 'U') as query_in_f:
                (transformed_coords1, transformed_coords2, m_squared,
                 _randomized_coords2) = get_procrustes_results(
                    ref_in_f,
                    query_in_f,
                    sample_id_map=sample_id_map,
                    randomize=False,
                    max_dimensions=num_dimensions)

        with open(output_matrix1_fp, 'w') as output_matrix1_f:
            output_matrix1_f.write(transformed_coords1)
        with open(output_matrix2_fp, 'w') as output_matrix2_f:
            output_matrix2_f.write(transformed_coords2)

        if random_trials:
            if opts.store_trial_details:
                # Parenthesised on purpose: '%' binds tighter than '+', so
                # the unparenthesised form tried to add an int to a str.
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None
            # The Monte Carlo routine is handed the file contents as lists
            # of lines, so each file can be closed right after reading.
            with open(reference_input_fp, 'U') as ref_in_f:
                coords_f1 = list(ref_in_f)
            with open(query_input_fp, 'U') as query_in_f:
                coords_f2 = list(query_in_f)
            (actual_m_squared, _trial_m_squareds, count_better,
             mc_p_value) = procrustes_monte_carlo(
                coords_f1,
                coords_f2,
                trials=random_trials,
                max_dimensions=num_dimensions,
                sample_id_map=sample_id_map,
                trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(mc_p_value, random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str, m_squared))

    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
def main():
    """Run Procrustes analysis on the first two input coordinate matrices.

    The transformed coordinates for ``opts.input_fps[0]`` and
    ``opts.input_fps[1]`` are written to ``pc1_transformed.txt`` and
    ``pc2_transformed.txt`` in ``opts.output_dir``. When
    ``opts.random_trials`` is set, a Monte Carlo p-value is computed as well
    and a two-line summary file is written to the same directory; without
    random trials no summary file is produced.

    Fewer than 10 random trials is reported via ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    random_trials = opts.random_trials
    if random_trials is not None and random_trials < 10:
        option_parser.error('Must perform >= 10 trials for Monte Carlo analysis.')

    output_dir = opts.output_dir
    sample_id_map_fp = opts.sample_id_map_fp
    num_dimensions = opts.num_dimensions

    if not exists(output_dir):
        makedirs(output_dir)

    if opts.store_trial_details:
        trial_output_dir = '%s/trial_details/' % output_dir
    else:
        trial_output_dir = None

    input_fp1 = opts.input_fps[0]
    input_fp2 = opts.input_fps[1]
    input_fp1_basename = splitext(split(input_fp1)[1])[0]
    input_fp2_basename = splitext(split(input_fp2)[1])[0]
    output_summary_fp = '%s/%s_%s_procrustes_results.txt' %\
        (output_dir, input_fp1_basename, input_fp2_basename)
    output_matrix1_fp = '%s/pc1_transformed.txt' % output_dir
    output_matrix2_fp = '%s/pc2_transformed.txt' % output_dir

    if sample_id_map_fp:
        with open(sample_id_map_fp, 'U') as map_f:
            # Keep only the first field recorded for each key.
            sample_id_map = dict(
                (k, v[0]) for k, v in fields_to_dict(map_f).items())
    else:
        sample_id_map = None

    # 'with' blocks guarantee the input files are closed, including when
    # get_procrustes_results raises.
    with open(input_fp1, 'U') as in_f1:
        with open(input_fp2, 'U') as in_f2:
            (transformed_coords1, transformed_coords2, _m_squared,
             _randomized_coords2) = get_procrustes_results(
                in_f1,
                in_f2,
                sample_id_map=sample_id_map,
                randomize=False,
                max_dimensions=num_dimensions)

    with open(output_matrix1_fp, 'w') as output_matrix1_f:
        output_matrix1_f.write(transformed_coords1)
    with open(output_matrix2_fp, 'w') as output_matrix2_f:
        output_matrix2_f.write(transformed_coords2)

    if random_trials:
        summary_file_lines = ['FP1 FP2 Included_dimensions MC_p_value Count_better M^2']
        # The Monte Carlo routine is handed the file contents as lists of
        # lines, so each file can be closed right after reading.
        with open(input_fp1, 'U') as in_f1:
            coords_f1 = list(in_f1)
        with open(input_fp2, 'U') as in_f2:
            coords_f2 = list(in_f2)
        (actual_m_squared, _trial_m_squareds, count_better,
         mc_p_value) = procrustes_monte_carlo(
            coords_f1,
            coords_f2,
            trials=random_trials,
            max_dimensions=num_dimensions,
            sample_id_map=sample_id_map,
            trial_output_dir=trial_output_dir)
        # truncate the p-value to the correct number of significant
        # digits
        mc_p_value_str = format_p_value_for_num_iters(mc_p_value, random_trials)
        max_dims_str = str(num_dimensions or 'alldim')
        summary_file_lines.append(
            '%s %s %s %s %d %1.3f' %
            (input_fp1, input_fp2, max_dims_str, mc_p_value_str,
             count_better, actual_m_squared))
        with open(output_summary_fp, 'w') as f:
            f.write('\n'.join(summary_file_lines))
            f.write('\n')