def get_procrustes_results(coords_f1, coords_f2, sample_id_map=None,
                           randomize=None, max_dimensions=None,
                           get_eigenvalues=get_mean_eigenvalues,
                           get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two coordinate files.

    coords_f1, coords_f2: inputs handed to parse_coords.
    sample_id_map: when truthy, both lists of sample ids are passed
     through map_sample_ids before the two sets are matched up.
    randomize: when truthy, a callable applied to the second coordinate
     matrix (after reordering) to produce a random trial.
    max_dimensions: when truthy, both matrices are passed through
     filter_coords_matrix and the eigenvalues / percent variation
     vectors are cut to that many leading entries. Otherwise the shorter
     pair of eigenvalue / percent variation vectors is zero-padded to
     the length of the longer one.
    get_eigenvalues, get_percent_variation_explained: callables that
     combine the two eigenvalue vectors and the two percent variation
     vectors into the single vectors written with both results.

    Returns (formatted coords 1, formatted coords 2, m_squared), where
    the formatted values come from format_coords and m_squared is the
    third value returned by procrustes.
    """
    ids_a, matrix_a, eigvals_a, pct_var_a = parse_coords(coords_f1)
    ids_b, matrix_b, eigvals_b, pct_var_b = parse_coords(coords_f2)

    if sample_id_map:
        ids_a = map_sample_ids(ids_a, sample_id_map)
        ids_b = map_sample_ids(ids_b, sample_id_map)

    # Only the samples present in both inputs are kept, and both matrices
    # are put in that same shared order.
    order = list(set(ids_a) & set(ids_b))
    matrix_a = reorder_coords(matrix_a, ids_a, order)
    matrix_b = reorder_coords(matrix_b, ids_b, order)

    # A random trial shuffles the second matrix with the caller's function
    if randomize:
        matrix_b = randomize(matrix_b)

    matrix_a, matrix_b = pad_coords_matrices(matrix_a, matrix_b)
    if max_dimensions:
        matrix_a = filter_coords_matrix(matrix_a, max_dimensions)
        matrix_b = filter_coords_matrix(matrix_b, max_dimensions)
        pct_var_a = pct_var_a[:max_dimensions]
        pct_var_b = pct_var_b[:max_dimensions]
        eigvals_a = eigvals_a[:max_dimensions]
        eigvals_b = eigvals_b[:max_dimensions]
    else:
        # Which side gets padded is decided by the percent variation
        # vectors alone; the eigenvalues of that side are padded with it.
        n_pct_a = len(pct_var_a)
        n_pct_b = len(pct_var_b)
        if n_pct_a > n_pct_b:
            pct_var_b = append(pct_var_b, zeros(n_pct_a - n_pct_b))
            eigvals_b = append(eigvals_b,
                               zeros(len(eigvals_a) - len(eigvals_b)))
        elif n_pct_a < n_pct_b:
            pct_var_a = append(pct_var_a, zeros(n_pct_b - n_pct_a))
            eigvals_a = append(eigvals_a,
                               zeros(len(eigvals_b) - len(eigvals_a)))

    procrustes_result = procrustes(matrix_a, matrix_b)
    transformed_coords_m1, transformed_coords_m2, m_squared = \
        procrustes_result

    # Both outputs carry the same combined eigenvalues / percent variation
    eigvals = get_eigenvalues(eigvals_a, eigvals_b)
    pct_var = get_percent_variation_explained(pct_var_a, pct_var_b)

    formatted = [format_coords(coord_header=order,
                               coords=transformed,
                               eigvals=eigvals,
                               pct_var=pct_var)
                 for transformed in (transformed_coords_m1,
                                     transformed_coords_m2)]

    return formatted[0], formatted[1], m_squared
Example #2
0
    def test_get_ordered_coordinates(self):
        """get_ordered_coordinates functions as expected

        The fixture is re-parsed before every call. Three orderings are
        checked for their returned coordinates and sample ids (all five
        sites, a two-site subset, and an ordering containing the unknown
        id 's6'), then the same unknown-id ordering is required to raise
        ValueError when strict=True.
        """
        pc_lines = ["Eigvals\t4",
                    "191.54\t169.99\t30.45\t19.19",
                    "",
                    "Proportion explained\t4",
                    "18.13\t16.09\t2.88\t1.66",
                    "",
                    "Species\t0\t0",
                    "",
                    "Site\t5\t4",
                    "s1\t-0.049\t0.245\t0.146\t-0.036",
                    "s5\t-0.267\t-0.228\t-0.024\t-0.095",
                    "s3\t-0.285\t-0.260\t-0.017\t-0.070",
                    "s2\t-0.002\t0.216\t-0.052\t-0.085",
                    "s4\t-0.328\t-0.299\t-0.025\t0.051",
                    "",
                    "Biplot\t0\t0",
                    "",
                    "Site constraints\t0\t0",
                    ""]

        # expected coordinate row of every site in the fixture
        row_s1 = [-0.049, 0.245, 0.146, -0.036]
        row_s2 = [-0.002, 0.216, -0.052, -0.085]
        row_s3 = [-0.285, -0.260, -0.017, -0.070]
        row_s4 = [-0.328, -0.299, -0.025, 0.051]
        row_s5 = [-0.267, -0.228, -0.024, -0.095]

        # (requested order, expected coordinates, expected sample ids)
        scenarios = [
            (['s1', 's2', 's3', 's4', 's5'],
             [row_s1, row_s2, row_s3, row_s4, row_s5],
             ['s1', 's2', 's3', 's4', 's5']),
            (['s1', 's5'],
             [row_s1, row_s5],
             ['s1', 's5']),
            (['s1', 's6', 's5'],
             [row_s1, row_s5],
             ['s1', 's5']),
        ]
        for requested, expected_coords, expected_sids in scenarios:
            pc = parse_coords(StringIO('\n'.join(pc_lines)))
            actual_coords, actual_sids = get_ordered_coordinates(
                pc[0], pc[1], requested)
            assert_almost_equal(actual_coords, expected_coords)
            self.assertEqual(actual_sids, expected_sids)

        # in strict mode the unknown sample id is an error
        pc = parse_coords(StringIO('\n'.join(pc_lines)))
        self.assertRaises(ValueError, get_ordered_coordinates,
                          pc[0], pc[1], ['s1', 's6', 's5'], strict=True)
Example #3
0
    def test_get_ordered_coordinates(self):
        """get_ordered_coordinates functions as expected

        Covers a full ordering, a subset, an ordering with an id that is
        not in the coordinates ('s6'), and the strict=True variant of the
        latter, which has to raise ValueError.
        """
        pc_lines = [
            "Eigvals\t4", "191.54\t169.99\t30.45\t19.19", "",
            "Proportion explained\t4", "18.13\t16.09\t2.88\t1.66", "",
            "Species\t0\t0", "", "Site\t5\t4",
            "s1\t-0.049\t0.245\t0.146\t-0.036",
            "s5\t-0.267\t-0.228\t-0.024\t-0.095",
            "s3\t-0.285\t-0.260\t-0.017\t-0.070",
            "s2\t-0.002\t0.216\t-0.052\t-0.085",
            "s4\t-0.328\t-0.299\t-0.025\t0.051", "", "Biplot\t0\t0", "",
            "Site constraints\t0\t0", ""
        ]

        def fresh_parse():
            # every scenario starts from a newly parsed copy of the fixture
            return parse_coords(StringIO('\n'.join(pc_lines)))

        def check(requested, expected_coords, expected_sids):
            pc = fresh_parse()
            actual_coords, actual_sids = get_ordered_coordinates(
                pc[0], pc[1], requested)
            assert_almost_equal(actual_coords, expected_coords)
            self.assertEqual(actual_sids, expected_sids)

        first_and_last = [[-0.049, 0.245, 0.146, -0.036],
                          [-0.267, -0.228, -0.024, -0.095]]

        # all five sites, in sorted order
        check(['s1', 's2', 's3', 's4', 's5'],
              [[-0.049, 0.245, 0.146, -0.036],
               [-0.002, 0.216, -0.052, -0.085],
               [-0.285, -0.260, -0.017, -0.070],
               [-0.328, -0.299, -0.025, 0.051],
               [-0.267, -0.228, -0.024, -0.095]],
              ['s1', 's2', 's3', 's4', 's5'])

        # a subset of the sites
        check(['s1', 's5'], first_and_last, ['s1', 's5'])

        # the unknown id is left out of the result by default
        check(['s1', 's6', 's5'], first_and_last, ['s1', 's5'])

        # ... and is an error in strict mode
        pc = fresh_parse()
        self.assertRaises(ValueError,
                          get_ordered_coordinates,
                          pc[0],
                          pc[1], ['s1', 's6', 's5'],
                          strict=True)
 def setUp(self):
     """Split and parse the three PCoA fixtures for use by the tests.

     For fixture N the attributes pcoaN_f (the list of lines),
     sample_idsN, coordsN, eigvalsN and pct_varN are set, followed by
     sample_id_map1.
     """
     fixtures = ((1, pcoa1_f), (2, pcoa2_f), (3, pcoa3_f))
     for number, text in fixtures:
         lines = text.split('\n')
         setattr(self, 'pcoa%d_f' % number, lines)
         sample_ids, coords, eigvals, pct_var = parse_coords(lines)
         setattr(self, 'sample_ids%d' % number, sample_ids)
         setattr(self, 'coords%d' % number, coords)
         setattr(self, 'eigvals%d' % number, eigvals)
         setattr(self, 'pct_var%d' % number, pct_var)
     self.sample_id_map1 = sample_id_map1
    def setUp(self):
        """Wrap the four PCoA fixtures in StringIO objects and parse them.

        Each fixture N yields pcoaN_f (the StringIO object) and the four
        values unpacked from parse_coords: sample_idsN, coordsN,
        eigvalsN and pct_varN.
        """
        self.pcoa1_f = StringIO(pcoa1_f)
        parsed = parse_coords(self.pcoa1_f)
        (self.sample_ids1, self.coords1,
         self.eigvals1, self.pct_var1) = parsed

        self.pcoa2_f = StringIO(pcoa2_f)
        parsed = parse_coords(self.pcoa2_f)
        (self.sample_ids2, self.coords2,
         self.eigvals2, self.pct_var2) = parsed

        self.pcoa3_f = StringIO(pcoa3_f)
        parsed = parse_coords(self.pcoa3_f)
        (self.sample_ids3, self.coords3,
         self.eigvals3, self.pct_var3) = parsed

        self.pcoa4_f = StringIO(pcoa4_f)
        parsed = parse_coords(self.pcoa4_f)
        (self.sample_ids4, self.coords4,
         self.eigvals4, self.pct_var4) = parsed

        self.sample_id_map1 = sample_id_map1
 def setUp(self):
     """Split and parse the three PCoA fixtures for use by the tests.

     For fixture N the attributes pcoaN_f (the list of lines),
     sample_idsN, coordsN, eigvalsN and pct_varN are set, followed by
     sample_id_map1.
     """
     fixtures = ((1, pcoa1_f), (2, pcoa2_f), (3, pcoa3_f))
     for number, text in fixtures:
         lines = text.split('\n')
         setattr(self, 'pcoa%d_f' % number, lines)
         sample_ids, coords, eigvals, pct_var = parse_coords(lines)
         setattr(self, 'sample_ids%d' % number, sample_ids)
         setattr(self, 'coords%d' % number, coords)
         setattr(self, 'eigvals%d' % number, eigvals)
         setattr(self, 'pct_var%d' % number, pct_var)
     self.sample_id_map1 = sample_id_map1
Example #7
0
def get_procrustes_results(coords_f1, coords_f2, sample_id_map=None,
                           randomize=None, max_dimensions=None,
                           get_eigenvalues=get_mean_eigenvalues,
                           get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two coordinate files.

    coords_f1, coords_f2: inputs handed to parse_coords.
    sample_id_map: when truthy, both lists of sample ids are passed
     through map_sample_ids before the two sets are matched up.
    randomize: when truthy, a callable applied to the second coordinate
     matrix (after reordering) to produce a random trial.
    max_dimensions: when truthy, both matrices are passed through
     filter_coords_matrix and the eigenvalues / percent variation
     vectors are cut to that many leading entries. When falsy the
     vectors are used as parsed.
    get_eigenvalues, get_percent_variation_explained: callables that
     combine the two eigenvalue vectors and the two percent variation
     vectors into the single vectors written with both results.

    Returns (formatted coords 1, formatted coords 2, m_squared), where
    the formatted values come from format_coords and m_squared is the
    third value returned by procrustes.
    """
    ref_ids, ref_coords, ref_eigvals, ref_pct_var = parse_coords(coords_f1)
    other_ids, other_coords, other_eigvals, other_pct_var = \
        parse_coords(coords_f2)

    if sample_id_map:
        ref_ids = map_sample_ids(ref_ids, sample_id_map)
        other_ids = map_sample_ids(other_ids, sample_id_map)

    # Keep the samples found in both inputs and give both matrices that
    # same row order.
    order = list(set(ref_ids) & set(other_ids))
    ref_coords = reorder_coords(ref_coords, ref_ids, order)
    other_coords = reorder_coords(other_coords, other_ids, order)

    # A random trial shuffles the second matrix with the caller's function
    if randomize:
        other_coords = randomize(other_coords)

    ref_coords, other_coords = pad_coords_matrices(ref_coords, other_coords)
    if max_dimensions:
        keep = slice(None, max_dimensions)
        ref_coords = filter_coords_matrix(ref_coords, max_dimensions)
        other_coords = filter_coords_matrix(other_coords, max_dimensions)
        ref_pct_var = ref_pct_var[keep]
        other_pct_var = other_pct_var[keep]
        ref_eigvals = ref_eigvals[keep]
        other_eigvals = other_eigvals[keep]

    fitted = procrustes(ref_coords, other_coords)
    transformed_coords_m1, transformed_coords_m2, m_squared = fitted

    # Both outputs carry the same combined eigenvalues / percent variation
    shared_fields = {
        'coord_header': order,
        'eigvals': get_eigenvalues(ref_eigvals, other_eigvals),
        'pct_var': get_percent_variation_explained(ref_pct_var,
                                                   other_pct_var),
    }
    transformed_coords1 = format_coords(coords=transformed_coords_m1,
                                        **shared_fields)
    transformed_coords2 = format_coords(coords=transformed_coords_m2,
                                        **shared_fields)

    return transformed_coords1, transformed_coords2, m_squared
Example #8
0
    def test_get_ordered_coordinates(self):
        """get_ordered_coordinates functions as expected

        The list of fixture lines is re-parsed before every call. Three
        orderings are checked for their coordinates and sample ids (all
        five samples, a two-sample subset, an ordering containing the
        unknown id 's6'); with strict=True the unknown id must raise
        ValueError.
        """
        pc_lines = [
            "pc vector number\t1\t2\t3\t4", "s1\t-0.049\t0.245\t0.146\t-0.036",
            "s5\t-0.267\t-0.228\t-0.024\t-0.095",
            "s3\t-0.285\t-0.260\t-0.017\t-0.070",
            "s2\t-0.002\t0.216\t-0.052\t-0.085",
            "s4\t-0.328\t-0.299\t-0.025\t0.051", "", "",
            "eigvals\t191.54\t169.99\t30.45\t19.19",
            "% variation explained\t18.13\t16.09\t2.88\t1.66"
        ]

        # expected coordinate row of every sample in the fixture
        row_s1 = [-0.049, 0.245, 0.146, -0.036]
        row_s2 = [-0.002, 0.216, -0.052, -0.085]
        row_s3 = [-0.285, -0.260, -0.017, -0.070]
        row_s4 = [-0.328, -0.299, -0.025, 0.051]
        row_s5 = [-0.267, -0.228, -0.024, -0.095]

        # (requested order, expected coordinates, expected sample ids)
        scenarios = [
            (['s1', 's2', 's3', 's4', 's5'],
             [row_s1, row_s2, row_s3, row_s4, row_s5],
             ['s1', 's2', 's3', 's4', 's5']),
            (['s1', 's5'],
             [row_s1, row_s5],
             ['s1', 's5']),
            (['s1', 's6', 's5'],
             [row_s1, row_s5],
             ['s1', 's5']),
        ]
        for requested, expected_coords, expected_sids in scenarios:
            pc = parse_coords(pc_lines)
            actual_coords, actual_sids = get_ordered_coordinates(
                pc[0], pc[1], requested)
            self.assertEqual(actual_coords, expected_coords)
            self.assertEqual(actual_sids, expected_sids)

        # in strict mode the unknown sample id is an error
        pc = parse_coords(pc_lines)
        self.assertRaises(ValueError,
                          get_ordered_coordinates,
                          pc[0],
                          pc[1], ['s1', 's6', 's5'],
                          strict=True)
Example #9
0
    def test_get_ordered_coordinates(self):
        """get_ordered_coordinates functions as expected

        Covers a full ordering, a subset, an ordering with an id that is
        not in the coordinates ('s6'), and the strict=True variant of the
        latter, which has to raise ValueError.
        """
        pc_lines = ["pc vector number\t1\t2\t3\t4",
                    "s1\t-0.049\t0.245\t0.146\t-0.036",
                    "s5\t-0.267\t-0.228\t-0.024\t-0.095",
                    "s3\t-0.285\t-0.260\t-0.017\t-0.070",
                    "s2\t-0.002\t0.216\t-0.052\t-0.085",
                    "s4\t-0.328\t-0.299\t-0.025\t0.051",
                    "",
                    "",
                    "eigvals\t191.54\t169.99\t30.45\t19.19",
                    "%% variation explained\t18.13\t16.09\t2.88\t1.66"]

        def check(requested, expected_coords, expected_sids):
            # every scenario starts from a newly parsed copy of the fixture
            pc = parse_coords(pc_lines)
            actual_coords, actual_sids = get_ordered_coordinates(
                pc[0], pc[1], requested)
            self.assertEqual(actual_coords, expected_coords)
            self.assertEqual(actual_sids, expected_sids)

        first_and_last = [[-0.049, 0.245, 0.146, -0.036],
                          [-0.267, -0.228, -0.024, -0.095]]

        # all five samples, in sorted order
        check(['s1', 's2', 's3', 's4', 's5'],
              [[-0.049, 0.245, 0.146, -0.036],
               [-0.002, 0.216, -0.052, -0.085],
               [-0.285, -0.260, -0.017, -0.070],
               [-0.328, -0.299, -0.025, 0.051],
               [-0.267, -0.228, -0.024, -0.095]],
              ['s1', 's2', 's3', 's4', 's5'])

        # a subset of the samples
        check(['s1', 's5'], first_and_last, ['s1', 's5'])

        # the unknown id is left out of the result by default
        check(['s1', 's6', 's5'], first_and_last, ['s1', 's5'])

        # ... and is an error in strict mode
        pc = parse_coords(pc_lines)
        self.assertRaises(ValueError, get_ordered_coordinates,
                          pc[0], pc[1], ['s1', 's6', 's5'], strict=True)
Example #10
0
def get_pcoa_ellipsoid_coords(sampled_pcoa_strings, number_of_axes, sampleIds):
    """Summarize, per sample and axis, the spread over several PCoA results

    Inputs:
    sampled_pcoa_strings: list of PCoA strings; each one is split on
    newlines and handed to parse_coords
    number_of_axes: number of leading axes to summarize
    sampleIds: not used by this function

    Outputs:
    A tuple of two dictionaries.

    The first one maps every sample id found in the PCoA strings to
    {'center': [...], 'axes_radii': [...]}, both lists holding one value
    per axis (index 0 is the first axis). 'center' is the mean of the
    sample's coordinate on that axis over all the PCoA strings, and
    'axes_radii' is the mean of the absolute distances of those
    coordinates from that center.

    The second one maps every sample id to {'coords': [...]}, a list with
    one list per axis containing the raw coordinates that were collected,
    plus the key 'variation explained', which holds the mean over all the
    PCoA strings of the first number_of_axes percent variation values.

    NOTE: the radii are average distances from the center, so the
    resulting ellipsoids are NOT minimum spanning ellipsoids: there is no
    guarantee that every one of the collected points lies inside them.
    """
    sampleId_to_coords = {}
    sampleId_center_and_axes = {}

    for pcoa_string in sampled_pcoa_strings:
        parsed = parse_coords(pcoa_string.split('\n'))
        coord_header, coords, eigvals, pct_var = parsed

        pct_var_rows = sampleId_to_coords.setdefault('variation explained',
                                                     [])
        pct_var_rows.append(pct_var[:number_of_axes])

        for sample_name, row in zip(coord_header, coords):
            if sample_name not in sampleId_to_coords:
                # first time this sample is seen: one empty list per axis
                sampleId_to_coords[sample_name] = {
                    'coords': [[] for _ in range(number_of_axes)]}
                sampleId_center_and_axes[sample_name] = {'center': [],
                                                         'axes_radii': []}
            per_axis = sampleId_to_coords[sample_name]['coords']
            for axis in range(number_of_axes):
                per_axis[axis].append(row[axis])

    # collapse the per-input percent variation rows into their mean
    sampleId_to_coords['variation explained'] = array(
        sampleId_to_coords['variation explained']).mean(0)

    for sample_name, summary in sampleId_center_and_axes.items():
        per_axis = sampleId_to_coords[sample_name]['coords']
        for axis in range(number_of_axes):
            axis_values = array(per_axis[axis])
            center = axis_values.mean()
            summary['center'].append(center)
            # radius: mean absolute distance from the center on this axis
            summary['axes_radii'].append(abs(axis_values - center).mean())

    return sampleId_center_and_axes, sampleId_to_coords
Example #11
0
def _collate_cluster_pcoa_plot_data(coords_f, map_f, category):
    """Group the first two coordinates of every sample by category state.

    coords_f is handed to parse_coords and map_f to parse_mapping_file.
    The samples are grouped with group_by_field on the given category.

    Returns a list with one (xs, ys, color, state) tuple per state, in
    sorted state order, where xs / ys are the first / second coordinates
    of the samples in that state and color is taken from get_color_pool.

    Raises ValueError when there are more states than colors.
    """
    parsed_coords = parse_coords(coords_f)
    sample_to_point = dict(zip(parsed_coords[0], parsed_coords[1]))

    parsed_map = parse_mapping_file(map_f)
    # group_by_field gets the header row followed by the data rows
    table = [parsed_map[1]]
    table += parsed_map[0]

    state_to_sids = group_by_field(table, category)
    states = sorted(state_to_sids.keys())
    n_states = len(states)

    palette = get_color_pool()
    if n_states > len(palette):
        raise ValueError("Not enough colors to uniquely color sample "
                         "groups.")

    collated = []
    for state, color in zip(states, palette[:n_states]):
        points = [sample_to_point[sid] for sid in state_to_sids[state]]
        xs = [point[0] for point in points]
        ys = [point[1] for point in points]
        collated.append((xs, ys, color, state))

    return collated
Example #12
0
    def setUp(self):
        """Wrap the four PCoA fixtures in StringIO objects and parse them.

        Each fixture N yields pcoaN_f (the StringIO object) and the four
        values unpacked from parse_coords: sample_idsN, coordsN,
        eigvalsN and pct_varN.
        """
        # fixture 1
        self.pcoa1_f = StringIO(pcoa1_f)
        ids, coords, eigvals, pct_var = parse_coords(self.pcoa1_f)
        self.sample_ids1, self.coords1 = ids, coords
        self.eigvals1, self.pct_var1 = eigvals, pct_var

        # fixture 2
        self.pcoa2_f = StringIO(pcoa2_f)
        ids, coords, eigvals, pct_var = parse_coords(self.pcoa2_f)
        self.sample_ids2, self.coords2 = ids, coords
        self.eigvals2, self.pct_var2 = eigvals, pct_var

        # fixture 3
        self.pcoa3_f = StringIO(pcoa3_f)
        ids, coords, eigvals, pct_var = parse_coords(self.pcoa3_f)
        self.sample_ids3, self.coords3 = ids, coords
        self.eigvals3, self.pct_var3 = eigvals, pct_var

        # fixture 4
        self.pcoa4_f = StringIO(pcoa4_f)
        ids, coords, eigvals, pct_var = parse_coords(self.pcoa4_f)
        self.sample_ids4, self.coords4 = ids, coords
        self.eigvals4, self.pct_var4 = eigvals, pct_var

        self.sample_id_map1 = sample_id_map1
Example #13
0
def get_coord(coord_fname, method="IQR"):
    """Opens and returns coords location matrix and metadata.
       Also two spread matrices (+/-) if passed a dir of coord files.
       If only a single coord file, spread matrices are returned as None.

       coord_fname: path of a coords file, or of a directory that is
        handed to load_pcoa_files
       method: passed through to summarize_pcoas (directory case only)

       Returns the list
        [coord_header, coords, eigvals, pct_var, coords_low, coords_high]

       Raises MissingFileError if the single coords file cannot be opened.
    """
    if not os.path.isdir(coord_fname):
        try:
            coord_f = open(coord_fname, 'U')
        except (TypeError, IOError):
            raise MissingFileError('Coord file required for this analysis')
        # close the handle once parsing is over instead of leaking it
        try:
            coord_header, coords, eigvals, pct_var = parse_coords(coord_f)
        finally:
            coord_f.close()
        return [coord_header, coords, eigvals, pct_var, None, None]
    else:
        master_pcoa, support_pcoas = load_pcoa_files(coord_fname)

        # get Summary statistics
        coords, coords_low, coords_high, eigval_average, coord_header = \
            summarize_pcoas(master_pcoa, support_pcoas, method=method)
        pct_var = master_pcoa[3]  # should be getting this from an average

        # make_3d_plots expects coord_header to be a python list, so the
        # header returned by summarize_pcoas is replaced by the master's
        coord_header = list(master_pcoa[0])
        return (
            [coord_header,
             coords,
             eigval_average,
             pct_var,
             coords_low,
             coords_high]
        )
Example #14
0
def get_coord(coord_fname, method="IQR"):
    """Opens and returns coords location matrix and metadata.
       Also two spread matrices (+/-) if passed a dir of coord files.
       If only a single coord file, spread matrices are returned as None.

       coord_fname: path of a coords file, or of a directory that is
        handed to load_pcoa_files
       method: passed through to summarize_pcoas (directory case only)

       Returns the list
        [coord_header, coords, eigvals, pct_var, coords_low, coords_high]

       Raises MissingFileError if the single coords file cannot be opened.
    """
    if not os.path.isdir(coord_fname):
        try:
            coord_f = open(coord_fname, 'U')
        except (TypeError, IOError):
            raise MissingFileError('Coord file required for this analysis')
        # the with block closes the handle even when parsing fails
        with coord_f:
            coord_header, coords, eigvals, pct_var = parse_coords(coord_f)
        return [coord_header, coords, eigvals, pct_var, None, None]
    else:
        master_pcoa, support_pcoas = load_pcoa_files(coord_fname)

        # get Summary statistics
        coords, coords_low, coords_high, eigval_average, coord_header = \
            summarize_pcoas(master_pcoa, support_pcoas, method=method)
        pct_var = master_pcoa[3]  # should be getting this from an average

        # make_3d_plots expects coord_header to be a python list, so the
        # header returned by summarize_pcoas is replaced by the master's
        coord_header = list(master_pcoa[0])
        return ([
            coord_header, coords, eigval_average, pct_var, coords_low,
            coords_high
        ])
Example #15
0
def main():
    """Plot PC1 against PC2 with a convex hull around every category state.

    The coordinates file, mapping file, category name and output path are
    read from the command line options. For every state of the category
    (in natsort order) that has at least three samples, the samples are
    plotted on the first two axes together with the outline of their
    convex hull, and the figure is saved to the output path.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    coordinates_fp = opts.coordinates_fp
    mapping_file_fp = opts.mapping_file_fp
    category_header_name = opts.category
    output_fp = opts.output_fp

    # both inputs are closed as soon as they have been parsed
    with open(coordinates_fp, 'U') as coords_f:
        coords_headers, coords_data, coords_eigenvalues, coords_percents = \
            parse_coords(coords_f)
    with open(mapping_file_fp, 'U') as mapping_f:
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    category_header_index = mapping_headers.index(category_header_name)
    category_names = list(
        set([line[category_header_index] for line in mapping_data]))

    xtitle = 'PC1 (%.0f%%)' % round(coords_percents[0])
    ytitle = 'PC2 (%.0f%%)' % round(coords_percents[1])
    main_figure = plt.figure()
    main_axes = main_figure.add_subplot(1, 1, 1, axisbg='white')
    plt.xlabel(xtitle)
    plt.ylabel(ytitle)
    main_axes.tick_params(axis='y')
    main_axes.tick_params(axis='x')

    # sort the data!!! that way you can match make_3d_plots.py
    for index, category in enumerate(natsort(category_names)):
        sample_ids_list = [
            line[0] for line in mapping_data
            if line[category_header_index] == category
        ]

        # the color is looked up by the position in the sorted list, so
        # skipped categories still use up their color
        qiime_color = get_qiime_hex_string_color(index)

        # categories with fewer than three samples are not drawn
        if len(sample_ids_list) < 3:
            continue

        indices = [
            coords_headers.index(sample_id) for sample_id in sample_ids_list
        ]
        # only the first two axes are plotted
        points = coords_data[indices, :2]

        hull = ConvexHull(points)
        main_axes.plot(points[:, 0], points[:, 1], 'o', color=qiime_color)
        for simplex in hull.simplices:
            main_axes.plot(points[simplex, 0], points[simplex, 1], 'k-')
        main_axes.plot(points[hull.vertices, 0],
                       points[hull.vertices, 1],
                       '--',
                       lw=2,
                       color=qiime_color)

    main_figure.savefig(output_fp)
Example #16
0
def load_pcoa_files(pcoa_dir):
    """loads PCoA files from filepaths

    pcoa_dir: directory containing the PCoA files; entries whose name
     starts with '.' are ignored

    Returns (master_pcoa, support_pcoas): the parse_coords result of the
    first listed file, and a list with the parse_coords result of every
    listed file (the first one included).

    If one of the files cannot be read, a message is written to stderr
    and the program exits with status 1.
    """
    support_pcoas = []
    # ignore invisible files like .DS_Store
    pcoa_filenames = [fname for fname in os.listdir(pcoa_dir)
                      if not fname.startswith('.')]

    # the with blocks close every handle, also when parsing fails
    with open(os.path.join(pcoa_dir, pcoa_filenames[0]), 'U') as master_f:
        master_pcoa = parse_coords(master_f)

    for fname in pcoa_filenames:
        try:
            with open(os.path.join(pcoa_dir, fname), 'U') as f:
                support_pcoas.append(parse_coords(f))
        except IOError:
            sys.stderr.write('error loading support pcoa ' + fname + '\n')
            sys.exit(1)

    # callers unpack the master and the support results from this function
    return master_pcoa, support_pcoas
Example #17
0
def _collate_gradient_pcoa_plot_data(coords_f, map_f, category):
    """Order the first two coordinates of the samples along a gradient.

    coords_f is handed to parse_coords and map_f to
    parse_mapping_file_to_dict. The value of the category is converted to
    float for every sample of the mapping file, and the samples are
    sorted by (value, sample id).

    Returns (xs, ys, gradient): the first coordinates, the second
    coordinates and the gradient values, all in that sorted order.
    """
    parsed_coords = parse_coords(coords_f)
    sample_to_point = dict(zip(parsed_coords[0], parsed_coords[1]))

    # Build list of (gradient value, sid) tuples.
    sample_metadata = parse_mapping_file_to_dict(map_f)[0]
    value_and_sid = []
    for sid, metadata in sample_metadata.items():
        value_and_sid.append((float(metadata[category]), sid))
    value_and_sid.sort()

    xs, ys, gradient = [], [], []
    for value, sid in value_and_sid:
        point = sample_to_point[sid]
        xs.append(point[0])
        ys.append(point[1])
        gradient.append(value)

    return xs, ys, gradient
Example #18
0
def generate_pcoa_cloud_from_point_in_omega(map_headers, map_data, biom_object, metric, 
        sequences, iterations, axes, tree_object=None):
    """run the randomisations and get the result of make_pcoa_plot for them

    Input:
    map_headers, map_data: headers and data of the mapping file, passed on
    to preprocess_coords_file and make_pcoa_plot
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample, passed to get_rare_data
    iterations: number of times the rarefaction / beta diversity / PCoA
    chain is run
    axes: not used by this function
    tree_object: tree to perform the beta diversity calculation

    Output:
    whatever make_pcoa_plot returns for the collected coordinates. With a
    single iteration the coordinates of that iteration are plotted as they
    are, without low / high coordinates; otherwise all the iterations are
    first combined by preprocess_coords_file using the 'IQR' method.
    """
    all_headers, all_values, all_eigenvalues, all_pcts = [], [], [], []

    for _ in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_results = pcoa(beta_dm)

        # parse_coords reads from a file-like object, so the PCoA output
        # goes through an in-memory file
        pcoa_file = StringIO()
        pcoa_file.write(pcoa_results)
        pcoa_file.seek(0)
        parsed = parse_coords(pcoa_file)
        pcoa_file.close()

        pcoa_headers, pcoa_values, eigenvalues, coords_pct = parsed
        all_headers.append(pcoa_headers)
        all_values.append(pcoa_values)
        all_eigenvalues.append(eigenvalues)
        all_pcts.append(coords_pct)

    if iterations != 1:
        (coords_headers, coords_data, coords_eigenvalues, coords_pct,
         coords_low, coords_high, clones) = preprocess_coords_file(
            all_headers, all_values, all_eigenvalues, all_pcts,
            map_headers, map_data, custom_axes=None,
            jackknifing_method='IQR', is_comparison=False)
    else:
        coords_headers = all_headers[0]
        coords_data = all_values[0]
        coords_eigenvalues = all_eigenvalues[0]
        coords_pct = all_pcts[0]
        coords_low = coords_high = None

    return make_pcoa_plot(coords_headers, coords_data, coords_eigenvalues,
                          coords_pct, map_headers, map_data, coords_low,
                          coords_high, True)
def compute_ordination_correlation(map_f, coord_f, category, axis=1,
                                   correlation_type='pearson',
                                   num_permutations=999):
    """Correlate one ordination axis with a numeric mapping file category.

    map_f is handed to parse_mapping_file_to_dict and coord_f to
    parse_coords. axis is 1-based. For every sample of the coordinates
    the value of the category is converted to float, and the values are
    tested against the coordinates of the chosen axis with
    correlation_test.

    Returns (correlation coefficient, parametric p-value, nonparametric
    p-value). The nonparametric p-value is formatted with
    format_p_value_for_num_iters, or is 'N/A' when num_permutations is 0.

    Raises ValueError for an unknown correlation type, a negative number
    of permutations, an axis outside of the coordinates, a category that
    a sample does not have, or a category value that is not numeric.
    """
    if correlation_type not in CORRELATION_TYPES:
        raise ValueError("Invalid correlation type '%s'. Must be one of %r." %
                         (correlation_type, CORRELATION_TYPES))
    if num_permutations < 0:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than or equal to zero." % num_permutations)

    parsed_coords = parse_coords(coord_f)
    coords_samp_ids, coords = parsed_coords[0], parsed_coords[1]
    if len(parsed_coords) != 4:
        # same failure as unpacking the result into exactly four names
        coords_samp_ids, coords, _, _ = parsed_coords

    num_axes = len(coords[0])
    if axis < 1 or axis > num_axes:
        raise ValueError("Invalid axis number %d. Must be greater than zero "
                         "and less than or equal to the number of axes in the "
                         "input coordinates file (found %d axes)." %
                         (axis, num_axes))
    # axis is 1-based, the columns are 0-based
    axis_data = coords[:, axis - 1]

    sample_metadata, _ = parse_mapping_file_to_dict(map_f)
    gradient_data = []
    for samp_id in coords_samp_ids:
        metadata = sample_metadata[samp_id]
        if category not in metadata:
            raise ValueError("Category '%s' does not exist in the input "
                             "mapping file." % category)

        raw_value = metadata[category]
        try:
            numeric_value = float(raw_value)
        except ValueError:
            raise ValueError("The category state '%s' could not be converted "
                             "to a number. All states in the '%s' category "
                             "must be numeric." % (raw_value, category))
        gradient_data.append(numeric_value)

    test_results = correlation_test(axis_data, gradient_data,
                                    method=correlation_type,
                                    permutations=num_permutations)
    corr_coeff, param_p_val, _, nonparam_p_val, _ = test_results

    if num_permutations > 0:
        nonparam_p_val = format_p_value_for_num_iters(nonparam_p_val,
                                                      num_permutations)
    else:
        # without permutations there is no nonparametric p-value to report
        nonparam_p_val = 'N/A'

    return corr_coeff, param_p_val, nonparam_p_val
Example #20
0
    def test_parse_coords(self):
        """parse_coords should handle coords file"""
        coords = """pc vector number\t1\t2\t3
A\t0.11\t0.09\t0.23
B\t0.03\t0.07\t-0.26
C\t0.12\t0.06\t-0.32


eigvals\t4.94\t1.79\t1.50
% variation explained\t14.3\t5.2\t4.3


""".splitlines()
        obs = parse_coords(coords)
        exp = (['A', 'B', 'C'],
               array([[.11, .09, .23], [.03, .07, -.26], [.12, .06, -.32]]),
               array([4.94, 1.79, 1.50]), array([14.3, 5.2, 4.3]))
        self.assertEqual(obs, exp)
Example #21
0
    def test_parse_coords(self):
        """parse_coords should handle coords file"""
        coords = """pc vector number\t1\t2\t3
A\t0.11\t0.09\t0.23
B\t0.03\t0.07\t-0.26
C\t0.12\t0.06\t-0.32


eigvals\t4.94\t1.79\t1.50
% variation explained\t14.3\t5.2\t4.3


""".splitlines()
        obs = parse_coords(coords)
        exp = (['A','B','C'], 
            array([[.11,.09,.23],[.03,.07,-.26],[.12,.06,-.32]]),
            array([4.94,1.79,1.50]),
            array([14.3,5.2,4.3]))
        self.assertEqual(obs, exp)
Example #22
0
def get_multiple_coords(coord_fnames, edges_file=None, serial=False):
    """Opens and returns coords data and edges from multiple coords files.

       Params:
        coord_fnames, the names of the coordinate files (at least one)
        edges_file, optional path to a file holding one predetermined edge
            per line as whitespace-separated fields. When it is given the
            sample IDs are left untouched and no edges are generated.
        serial, selects how the generated edges connect the files (see
            below); ignored when edges_file is given.

       Returns:
        edges, a list of pairs of sample IDs, (from, to). Edges read from
            edges_file are lists of fields, generated edges are tuples.
        coords
            a list of [coord_header, coords, eigvals, pct_var, None, None]
            all coords are put in a single data matrix.
            Sample IDs from ith file have _i appended to them.
            eigvals, pct_var are taken from first coords file

       Raises:
        MissingFileError if coord_fnames is empty or if one of the coords
        files cannot be opened.

       If "serial" is True, connects points ending with _0 to those with _1,
       those with _1 to those with _2, etc. Otherwise all sets are connected
       back to those ending with _0.
    """
    # without at least one file there is nothing to return; fail with the
    # same error used for an unreadable file instead of an unbound local
    if not coord_fnames:
        raise MissingFileError('Coord file required for this analysis')

    edges = []

    # load predetermined edges if they were passed to us
    if edges_file is not None:
        with open(edges_file, 'U') as edges_f:
            edges = [ln.strip().split() for ln in edges_f]

    # load all coords files into same data matrix
    for i, fname in enumerate(coord_fnames):
        try:
            with open(fname, 'U') as coord_f:
                coord_lines = coord_f.readlines()
        except (TypeError, IOError):
            raise MissingFileError('Coord file required for this analysis')
        coord_header_i, coords_i, eigvals_i, pct_var_i = \
            parse_coords(coord_lines)

        # keep the unsuffixed IDs; after the loop these are the IDs of the
        # last file, which are the ones used to generate the edges
        sample_ids = coord_header_i

        # append _i to this file's sampleIDs unless we have predetermined edges
        if edges_file is None:
            coord_header_i = ['%s_%d' % (h, i) for h in coord_header_i]

        if i == 0:
            # eigvals and pct_var are only taken from the first coords file
            eigvals = eigvals_i
            pct_var = pct_var_i
            coord_header = coord_header_i
            coords = coords_i
        else:
            # second, third, etc. coords files are appended to the first one
            coord_header.extend(coord_header_i)
            coords = vstack((coords, coords_i))

    # add all edges unless we have predetermined edges
    if edges_file is None:
        n_files = len(coord_fnames)
        if serial:
            # edges go from one set to the next
            suffix_pairs = [(i, i + 1) for i in range(n_files - 1)]
        else:
            # edges go from first file's points to other files' points
            suffix_pairs = [(0, i) for i in range(1, n_files)]

        for _id in sample_ids:
            for src, dst in suffix_pairs:
                edges.append(('%s_%d' % (_id, src), '%s_%d' % (_id, dst)))

    return edges, [coord_header, coords, eigvals, pct_var, None, None]
Example #23
0
def main():
    """Parse the command line options and validate the input files.

    Reads the options described by ``script_info`` and then checks, in this
    order: the numeric option ranges, the option combinations, the mapping
    file, the coordinates file (or every usable coordinates file when the
    input path is a directory) and, when ``--taxa_fp`` was given, the taxa
    table. Detected problems are reported through ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is at least 3
    if number_of_axes < 3:
        option_parser.error('You need to plot at least 3 axes.')

    # verifying that the number of segments is between the desired range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error('number_of_segments should be between 4 and 14.')

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and len(custom_axes.split(',')) > 1 and \
            isdir(input_coords):
        option_parser.error(('Jackknifed plots are limited to one custom axis, '
            'currently trying to use: %s. Make sure you use only one.' %
            custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error("Cannot use the '--compare_plots' flag unless the"
            " input path is a directory.")

    # before creating any output, check correct parsing of the main input files
    try:
        # NOTE(review): closing the handle right after the call assumes
        # parse_mapping_file reads everything before returning -- confirm
        with open(map_fp, 'U') as map_f:
            mapping_data, header, comments = parse_mapping_file(map_f)

        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
    except Exception:
        # Exception instead of a bare except so that KeyboardInterrupt and
        # SystemExit are not reported as a formatting problem
        option_parser.error(("The metadata mapping file '%s' does not seem "
            "to be formatted correctly, verify the formatting is QIIME "
            "compliant by using check_id_map.py") % map_fp)

    # sample ids listed in the mapping file (first column of every row);
    # built row by row so an empty mapping yields an empty set
    mapped_sids = set(row[0] for row in mapping_data)

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct = \
            [], [], [], []

        # iterate only over the non-hidden files and not folders and if anything
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py suffixed in procrustes_results.txt
        coord_fps = [join(input_coords, f) for f in listdir(input_coords)
                     if not f.startswith('.')
                     and not isdir(join(abspath(input_coords), f))
                     and not f.endswith('procrustes_results.txt')]

        # this could happen and we rather avoid this problem
        if not coord_fps:
            option_parser.error('Could not use any of the files in the input '
                'directory.')

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        reference_fps = [f for f in coord_fps
                         if f.endswith('_transformed_reference.txt')]

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored.
        # `== False` is kept (instead of `not`) so that a flag value of None
        # does not select this branch.
        if master_pcoa and compare_plots == False:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)
        elif master_pcoa is None and reference_fps:
            master_pcoa = reference_fps[0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                with open(fp, 'U') as coords_f:
                    _coords_headers, _coords_data, _coords_eigenvalues, \
                        _coords_pct = parse_coords(coords_f)
            except ValueError:
                offending_coords_fp.append(fp)
            else:
                # pack all the data correspondingly
                coords_headers.append(_coords_headers)
                coords_data.append(_coords_data)
                coords_eigenvalues.append(_coords_eigenvalues)
                coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(("The following file(s): '%s' could not be "
                "parsed properly. Make sure the input folder only contains "
                "coordinates files.") % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers: collect the ids
        # that are in the union of all files but missing from (or extra in)
        # any single file. The union is computed once, not once per file.
        all_sample_ids = set(sum(coords_headers, []))
        non_shared_ids = set()
        for e in coords_headers:
            non_shared_ids.update(all_sample_ids ^ set(e))
        if non_shared_ids and len(coords_headers) > 1:
            # sorted so that the message is the same on every run
            option_parser.error(("The following sample identifier(s): '%s' "
                "are not shared between all the files. The files used to "
                "make a jackknifed PCoA plot or coordinate comparison plot ("
                "procrustes plot) must share all the same sample identifiers "
                "between each other.") % ', '.join(sorted(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(all_sample_ids)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(mapped_sids & set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(_coords_headers) - mapped_sids)

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            with open(input_coords, 'U') as coords_f:
                coords_headers, coords_data, coords_eigenvalues, coords_pct = \
                    parse_coords(coords_f)
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except ValueError:
            option_parser.error(("The PCoA file '%s' does not seem to be a "
                "coordinates formatted file, verify by manually inspecting "
                "the contents.") % input_coords)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(mapped_sids & set(coords_headers))
        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(coords_headers) - mapped_sids)
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            with open(taxa_fp, 'U') as taxa_f:
                otu_sample_ids, lineages, otu_table, _ = parse_otu_table(
                    taxa_f, count_map_f=float, remove_empty_rows=True)
        except ValueError as e:
            option_parser.error('There was a problem parsing the --taxa_fp: %s'
                % str(e))

        # make sure there are matching sample ids with the otu table
        if not set(sids_intersection) & set(otu_sample_ids):
            option_parser.error('The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error('Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')
Example #24
0
def get_multiple_coords(coord_fnames, edges_file=None, serial=False):
    """Opens and returns coords data and edges from multiple coords files.

       Params:
        coord_fnames, the names of the coordinate files (at least one)
        edges_file, optional path to a file holding one predetermined edge
            per line as whitespace-separated fields. When it is given the
            sample IDs are left untouched and no edges are generated.
        serial, selects how the generated edges connect the files (see
            below); ignored when edges_file is given.

       Returns:
        edges, a list of pairs of sample IDs, (from, to). Edges read from
            edges_file are lists of fields, generated edges are tuples.
        coords
            a list of [coord_header, coords, eigvals, pct_var, None, None]
            all coords are put in a single data matrix.
            Sample IDs from ith file have _i appended to them.
            eigvals, pct_var are taken from first coords file

       Raises:
        MissingFileError if coord_fnames is empty or if one of the coords
        files cannot be opened.

       If "serial" is True, connects points ending with _0 to those with _1,
       those with _1 to those with _2, etc. Otherwise all sets are connected
       back to those ending with _0.
    """
    # an empty list of files used to fail later with an unbound local; report
    # it with the same error that is used for a file that cannot be opened
    if not coord_fnames:
        raise MissingFileError('Coord file required for this analysis')

    edges = []

    # load predetermined edges if they were passed to us
    if edges_file is not None:
        with open(edges_file, 'U') as edges_f:
            edges = [ln.strip().split() for ln in edges_f]

    # load all coords files into same data matrix
    for i, fname in enumerate(coord_fnames):
        try:
            with open(fname, 'U') as coord_f:
                coord_lines = coord_f.readlines()
        except (TypeError, IOError):
            raise MissingFileError('Coord file required for this analysis')
        coord_header_i, coords_i, eigvals_i, pct_var_i = \
            parse_coords(coord_lines)

        # unsuffixed IDs of the file just read; once the loop ends this holds
        # the IDs of the last file, which drive the edge generation below
        sample_ids = coord_header_i

        # append _i to this file's sampleIDs unless we have predetermined edges
        if edges_file is None:
            coord_header_i = ['%s_%d' % (h, i) for h in coord_header_i]

        if i == 0:
            # get eigvals, pct_var from first coords file
            eigvals = eigvals_i
            pct_var = pct_var_i
            coord_header = coord_header_i
            coords = coords_i
        else:
            # for second, third, etc coords files, just append to first file
            coord_header.extend(coord_header_i)
            coords = vstack((coords, coords_i))

    # add all edges unless we have predetermined edges
    if edges_file is None:
        n_files = len(coord_fnames)
        if serial:
            # edges go from one set to the next
            suffix_pairs = [(i, i + 1) for i in range(n_files - 1)]
        else:
            # edges go from first file's points to other files' points
            suffix_pairs = [(0, i) for i in range(1, n_files)]

        for _id in sample_ids:
            for src, dst in suffix_pairs:
                edges.append(('%s_%d' % (_id, src), '%s_%d' % (_id, dst)))

    return edges, [coord_header, coords, eigvals, pct_var, None, None]
def get_procrustes_results(
        coords_f1,
        coords_f2,
        sample_id_map=None,
        randomize=None,
        max_dimensions=None,
        get_eigenvalues=get_mean_eigenvalues,
        get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two coordinates files.

    Params:
     coords_f1, coords_f2: the two inputs, each handed to parse_coords
     sample_id_map: if given, the sample ids of both files are translated
      with map_sample_ids before they are matched
     randomize: if given, a callable applied to the reordered second
      coordinate matrix before the analysis (random trial)
     max_dimensions: if given (and non-zero), the coordinate matrices,
      eigenvalues and percents are cut down to this many dimensions;
      otherwise the shorter eigenvalue/percent vectors are zero padded
     get_eigenvalues: callable combining both eigenvalue vectors
     get_percent_variation_explained: callable combining both percent
      variation vectors

    Returns:
     (transformed_coords1, transformed_coords2, m_squared,
      randomized_coords2). The first two are the format_coords output for
      the transformed matrices, m_squared is the third value returned by
      procrustes and randomized_coords2 is the format_coords output for
      the shuffled second matrix, or None when randomize was not given.

    Raises:
     ValueError if the two files do not share any sample id.
    """
    # Parse the PCoA files
    sample_ids1, coords1, eigvals1, pct_var1 = parse_coords(coords_f1)
    sample_ids2, coords2, eigvals2, pct_var2 = parse_coords(coords_f2)
    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # only the samples present in both files can be compared; check this
    # before doing any work on the matrices
    order = list(set(sample_ids1) & set(sample_ids2))
    if not order:
        raise ValueError('No overlapping samples in the two files')

    # rearrange the rows of both matrices so that they follow the same
    # sample order
    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = format_coords(coord_header=order,
                                           coords=coords2,
                                           eigvals=eigvals2,
                                           pct_var=pct_var2)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        # zero pad the shorter percent/eigenvalue vectors so that both
        # files end up with vectors of the same length
        if len(pct_var1) > len(pct_var2):
            pct_var2 = append(pct_var2, zeros(len(pct_var1) - len(pct_var2)))
            eigvals2 = append(eigvals2, zeros(len(eigvals1) - len(eigvals2)))
        elif len(pct_var1) < len(pct_var2):
            pct_var1 = append(pct_var1, zeros(len(pct_var2) - len(pct_var1)))
            eigvals1 = append(eigvals1, zeros(len(eigvals2) - len(eigvals1)))

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared = \
        procrustes(coords1, coords2)

    # both outputs carry the same combined eigenvalues and percents
    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = format_coords(coord_header=order,
                                        coords=transformed_coords_m1,
                                        eigvals=eigvals,
                                        pct_var=pct_var)
    transformed_coords2 = format_coords(coord_header=order,
                                        coords=transformed_coords_m2,
                                        eigvals=eigvals,
                                        pct_var=pct_var)

    # Return the results
    return (transformed_coords1, transformed_coords2, m_squared,
            randomized_coords2)
def main():
    """Plot the first two coordinates with one convex hull per category.

    Reads the coordinates and mapping files named on the command line,
    groups the samples by the value they have in the requested mapping
    category and, for every value with at least three samples, draws the
    points and the outline of their convex hull on PC1 vs. PC2. The figure
    is saved to the output path and a legend is created with make_legend.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    chuck_norris_joke = opts.chuck_norris_joke
    coordinates_fp = opts.coordinates_fp
    mapping_file_fp = opts.mapping_file_fp
    category_header_name = opts.category
    output_fp = opts.output_fp

    # have a swell day Yoshiki from the future
    if chuck_norris_joke:
        import json

        o, e, _ = qiime_system_call('curl http://api.icndb.com/jokes/random')

        # the response comes from the network: decode it as data, never
        # execute it as code
        joke = json.loads(o)
        print(joke['value']['joke'])
        raise SystemExit(0)

    # NOTE(review): closing the handles right after the calls assumes both
    # parsers read everything before returning -- confirm
    with open(coordinates_fp, 'U') as coords_f:
        coords_headers, coords_data, coords_eigenvalues, coords_percents = \
            parse_coords(coords_f)
    with open(mapping_file_fp, 'U') as mapping_f:
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    category_header_index = mapping_headers.index(category_header_name)
    category_names = list(set([line[category_header_index]
                               for line in mapping_data]))

    main_figure = plt.figure()
    main_axes = main_figure.add_subplot(1, 1, 1, axisbg='black')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    main_axes.tick_params(axis='y', colors='none')
    main_axes.tick_params(axis='x', colors='none')

    # sort the data!!! that way you can match make_3d_plots.py
    sorted_categories = natsort(category_names)

    # labels and colors of the categories that really get plotted; the two
    # lists are kept in step so the legend pairs each label with its color
    plotted_categories = []
    colors_used = []

    for index, category in enumerate(sorted_categories):
        sample_ids_list = [line[0] for line in mapping_data
                           if line[category_header_index] == category]

        # the color depends on the position among all the categories, not
        # only among the plotted ones
        qiime_color = get_qiime_hex_string_color(index)

        # categories with fewer than three samples are skipped
        if len(sample_ids_list) < 3:
            continue

        plotted_categories.append(category)
        colors_used.append(qiime_color)

        indices = [coords_headers.index(sample_id)
                   for sample_id in sample_ids_list]
        points = coords_data[indices, :2]

        hull = ConvexHull(points)
        main_axes.plot(points[:, 0], points[:, 1], 'o', color=qiime_color)
        for simplex in hull.simplices:
            main_axes.plot(points[simplex, 0], points[simplex, 1], 'w-')
        main_axes.plot(points[hull.vertices, 0], points[hull.vertices, 1],
                       '--', lw=2, color=qiime_color)

    main_figure.savefig(output_fp)

    name, extension = splitext(output_fp)
    extension = extension.replace('.', '')

    make_legend(plotted_categories, colors_used, 0, 0, 'black', 'white', name,
                extension, 80)
Example #27
0
def plot_pcoa(analysis_type, fig, in_dir, workflow, category, metric, num_rows,
              num_cols, num_methods):
    """Add one PCoA scatter plot per dissimilarity value to ``fig``.

    For every value ``d`` in ``workflow['pcoa_dissim']`` the files ``pc.txt``
    and ``map.txt`` are read from
    ``<in_dir>/0/<pcoa_sample_size>/<repr(d)>/<metric[0]>/`` and plotted in
    the last column of the subplot grid, starting at the second row.

    Params:
     analysis_type: 'gradient' (points colored by a gradient, with a
      colorbar) or 'cluster' (one color per state, plus a shared legend)
     fig: the figure the subplots are added to
     in_dir: directory holding the per-trial results
     workflow: mapping with the keys 'pcoa_sample_size' and 'pcoa_dissim'
     category: sequence; item 0 is passed to the collate helpers, item 1
      labels the colorbar and item 2 maps states to legend labels
     metric: sequence; item 0 names the metric directory
     num_rows, num_cols: dimensions of the subplot grid
     num_methods: used to compute the panel labels

    Raises:
     ValueError if analysis_type is not one of the supported values.
    """
    trial_num = 0
    samp_size = workflow['pcoa_sample_size']

    trial_num_dir = join(in_dir, '%d' % trial_num)
    samp_size_dir = join(trial_num_dir, '%d' % samp_size)

    legend_symbols = []
    legend_labels = []
    for d_idx, d in enumerate(workflow['pcoa_dissim']):
        dissim_dir = join(samp_size_dir, repr(d))
        metric_dir = join(dissim_dir, metric[0])

        pc_fp = join(metric_dir, 'pc.txt')
        map_fp = join(metric_dir, 'map.txt')

        # both files are closed as soon as this panel has been plotted
        with open(pc_fp, 'U') as pc_f, open(map_fp, 'U') as map_f:
            pc_data = parse_coords(pc_f)
            # rewind so the collate helper below gets the handle positioned
            # at the start of the file
            pc_f.seek(0)
            assert len(pc_data[0]) == samp_size

            # Skip the first row (the legend is already at that cell).
            plot_num = (d_idx + 2) * num_cols
            ax = fig.add_subplot(num_rows, num_cols, plot_num)

            if analysis_type == 'gradient':
                # Build list of (gradient value, sid) tuples.
                xs, ys, gradient = _collate_gradient_pcoa_plot_data(
                    pc_f, map_f, category[0])
                scatter_colorbar_data = ax.scatter(xs, ys, s=80, c=gradient,
                                                   cmap='RdYlBu')
                # We have to use gridspec to get this to work with
                # tight_layout.
                cb = fig.colorbar(scatter_colorbar_data, use_gridspec=True)
                cb.set_label(category[1])
            elif analysis_type == 'cluster':
                plot_data = _collate_cluster_pcoa_plot_data(pc_f, map_f,
                                                            category[0])
                for xs, ys, color, state in plot_data:
                    ax.scatter(xs, ys, color=color, label=state)

                    # the legend entries are only collected once, from the
                    # first panel
                    if d_idx == 0:
                        legend_symbols.append(Line2D(range(1), range(1),
                                              color='white', marker='o',
                                              markeredgecolor=color,
                                              markerfacecolor=color))
                        legend_labels.append(category[2].get(state, state))
            else:
                raise ValueError("Unrecognized simulated data type '%s'." %
                                 analysis_type)

        plot_title = 'd=%r' % d
        if d == 0.0:
            plot_title += ' (actual data)'
        ax.set_title(plot_title)

        ax.set_xlabel('PC1 (%1.2f%%)' % pc_data[3][0])
        ax.set_ylabel('PC2 (%1.2f%%)' % pc_data[3][1])
        ax.set_xticks([])
        ax.set_yticks([])

        # place the panel label slightly above the top left corner
        panel_idx = num_methods * 2 + d_idx
        panel_label = get_panel_label(panel_idx)
        xmin = ax.get_xlim()[0]
        ymin, ymax = ax.get_ylim()
        yrange = ymax - ymin
        ax.text(xmin, ymax + (0.04 * yrange), '(%s)' % panel_label)

    if analysis_type == 'cluster':
        # Plot our new legend and add the existing one back.
        legend_ax = fig.add_subplot(num_rows, num_cols, 3, frame_on=False)
        existing_legend = legend_ax.get_legend()
        existing_legend.set_bbox_to_anchor((-0.05, 0.5))

        start_panel_label = get_panel_label(num_methods * 2)
        end_panel_label = get_panel_label(num_methods * 2 +
                                          len(workflow['pcoa_dissim']) - 1)

        assert len(legend_symbols) == len(legend_labels)
        legend_ax.legend(legend_symbols, legend_labels, ncol=1,
                   title='Legend (Panels %s-%s)' % (start_panel_label,
                                                    end_panel_label),
                   loc='center right', fancybox=True, shadow=True, numpoints=1,
                   bbox_to_anchor=(1.05, 0.5))

        legend_ax.add_artist(existing_legend)

    # Draw box around PCoA plots. Do the math in figure coordinates.
    # 1.0 forces true division: with integer grid sizes `1 / num_cols` is 0
    # under Python 2 integer division, which collapses the rectangle.
    col_width = 1.0 / num_cols
    row_height = 1.0 / num_rows
    top_ax = fig.add_subplot(num_rows, num_cols, 6)
    rec = Rectangle((1 - col_width + 0.005, 0),
                    col_width - 0.005,
                    1 - row_height - 0.005,
                    fill=False, lw=2, clip_on=False,
                    transform=top_ax.figure.transFigure)
    top_ax.add_patch(rec)
Example #28
0
def main():
    """Run the PCoA plots and, when enabled, the adjacent UniFrac analyses.

    The two analyses are switched with the hard-coded flags below. The PCoA
    step draws one plot for the weighted and one for the unweighted
    coordinates and also loads both distance matrices. The adjacent UniFrac
    step plots both matrices and then prints one score line per body site
    for the unweighted and the weighted matrix.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    m = parse_mapping_file_to_dict(open(opts.mapping_fp, "U"))[0]

    adjacent_unifrac_analyses = False
    pcoa_analyses = True

    # mapping fields shared by all the calls below; every call receives its
    # own copy of these lists
    timeseries_fields = ["GutTimeseries", "TongueTimeseries",
                         "ForeheadTimeseries", "PalmTimeseries"]
    disturbance_fields = ["SampleAntibioticDisturbance",
                          "SampleMenstruationDisturbance",
                          "SampleSicknessDisturbance"]

    if pcoa_analyses:
        wpc_h, wpc, _, _ = parse_coords(qiime_open(wpc_fp))
        upc_h, upc, _, _ = parse_coords(qiime_open(upc_fp))

        # weighted first, then unweighted
        pc_jobs = ((wpc_h, wpc, "pc-weighted.pdf"),
                   (upc_h, upc, "pc-unweighted.pdf"))
        for pc_header, pc_matrix, pdf_name in pc_jobs:
            ugly_pc_function(
                m,
                pc_header,
                pc_matrix,
                list(timeseries_fields),
                list(disturbance_fields),
                "CUB000",
                pdf_name,
            )

        wh, wdm = parse_distmat(qiime_open(wdm_fp, "U"))
        uh, udm = parse_distmat(qiime_open(udm_fp, "U"))

    if adjacent_unifrac_analyses:
        # unweighted first, then weighted
        dm_plots = ((uh, udm, "unweighted-unifrac.pdf"),
                    (wh, wdm, "weighted-unifrac.pdf"))
        for dm_header, dm, pdf_name in dm_plots:
            plot_adjacent_unifracs(
                dm_header,
                dm,
                m,
                list(timeseries_fields),
                "Yes",
                list(disturbance_fields),
                "Yes",
                output_fp=pdf_name,
            )

        # one pair of lines per body site, in this order, with the
        # unweighted score printed before the weighted one
        body_sites = (("Gut", "GutTimeseries"),
                      ("Tongue", "TongueTimeseries"),
                      ("Palm", "PalmTimeseries"),
                      ("Forehead", "ForeheadTimeseries"))
        matrices = (("Unweighted", udm, uh),
                    ("Weighted", wdm, wh))
        for site_label, site_field in body_sites:
            for weighting, dm, dm_header in matrices:
                r = score_ranked_adjacent_unifracs(
                    m,
                    dm,
                    dm_header,
                    inclusion_field=site_field,
                    inclusion_value="Yes",
                    personal_id_field="PersonalID",
                    disturbed_field="SampleAntibioticDisturbance",
                    disturbed_value="Yes",
                )
                print("%s %s UniFrac: %1.3f (n-disturbed samples: %d, "
                      "n-undisturbed samples: %d) " % (
                          site_label,
                          weighting,
                          r[1],
                          len(r[2]),
                          len(r[3]),
                      ))