Example #1
def generate_pcoa_cloud_from_point_in_omega(map_headers, map_data, biom_object,
        metric, sequences, iterations, axes, tree_object=None):
    """Run the randomisations and get a WebGL PCoA plot string representation.

    Input:
    map_headers: list of column headers from the mapping file
    map_data: list of data rows from the mapping file
    biom_object: OTU table biom object
    metric: string with the name of the beta diversity metric, e.g. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations used to generate the PCoA plot
    axes: number of axes to account for
    tree_object: tree to perform the beta diversity calculation with

    Output:
    WebGL string representing the PCoA plot
    """
    
    pcoa_input = {'pcoa_headers':[], 'pcoa_values':[], 'eigenvalues':[], 'coords_pct':[]}
    for i in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_results = pcoa(beta_dm)

        pcoa_file = StringIO()
        pcoa_file.write(pcoa_results)
        pcoa_file.seek(0)
        pcoa_headers, pcoa_values, eigenvalues, coords_pct = parse_coords(pcoa_file)
        pcoa_file.close()
        pcoa_input['pcoa_headers'].append(pcoa_headers)
        pcoa_input['pcoa_values'].append(pcoa_values)
        pcoa_input['eigenvalues'].append(eigenvalues)
        pcoa_input['coords_pct'].append(coords_pct)
    
    if iterations == 1:
        coords_headers = pcoa_input['pcoa_headers'][0]
        coords_data = pcoa_input['pcoa_values'][0]
        coords_eigenvalues = pcoa_input['eigenvalues'][0]
        coords_pct = pcoa_input['coords_pct'][0]
        coords_low, coords_high = None, None
    else:
        coords_headers, coords_data, coords_eigenvalues, coords_pct, coords_low,\
            coords_high, clones = preprocess_coords_file(pcoa_input['pcoa_headers'],
            pcoa_input['pcoa_values'], pcoa_input['eigenvalues'], 
            pcoa_input['coords_pct'], map_headers, map_data, custom_axes=None, 
            jackknifing_method='IQR', is_comparison=False)
    
    return make_pcoa_plot(coords_headers, coords_data, coords_eigenvalues, coords_pct, \
        map_headers, map_data, coords_low, coords_high, True)
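The parse step inside the loop above relies on an in-memory file: the PCoA result text is written to a StringIO object, rewound, and handed to a parser that expects a file-like object. Below is a small, self-contained illustration of that round-trip; parse_first_column is a stand-in written for this sketch, not QIIME's parse_coords, and the import path is the Python 3 location (the Python 2 original presumably used the StringIO/cStringIO module).

# Self-contained illustration of the StringIO round-trip used in the loop above.
# parse_first_column is a made-up stand-in for a parser such as parse_coords
# that consumes a file-like object rather than a string.
from io import StringIO

def parse_first_column(lines):
    return [line.split('\t')[0] for line in lines if line.strip()]

pcoa_text = 'pc vector number\t1\t2\nPC.354\t0.28\t-0.01\nPC.607\t-0.04\t0.22\n'

pcoa_file = StringIO()
pcoa_file.write(pcoa_text)               # dump the in-memory string...
pcoa_file.seek(0)                        # ...rewind to the beginning...
labels = parse_first_column(pcoa_file)   # ...and parse it like a file
pcoa_file.close()

assert labels == ['pc vector number', 'PC.354', 'PC.607']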
def compare_treatment_dists(chosen_samples, category, mf, bt, m, tr):
    """Calculate avg between, within, and to-all distances for chosen_samples.
    Notes: 
     chosen_samples is a list of lists of ids that collectively have some amount
     of different values under category in the mapping file. these samples will
     br grouped by the value they have and then these groupings will be 
     compared. the between distance is the all the pairwise distances between 
     the groupings. the within distance is the distance between the samples in a 
     single group. the to-all distance is the distance from the group to all
     other samples in the distmat.
    Inputs:
     chosen_samples - list of ids. e.g. [sam1,sam7,sam3,sam6,..]
     category - str, field in mf.
     mf - parsed mapping file, dict of sample_id:metadata.
     bt - biom table containing at least all samples contained in the mf.
     m - str, metric to used for beta diversity calculation. 
     tr - tree object, containing at least all nodes in bt.
    Output:
     A list of marginals that are the treatments of the groups, i.e. ['HF','LF']
     bt_wi_m - a 2d upper triangular array that has the average distances
     between treatment groups (or in the case of the main diagonal, the average
     within treatment/group distance).
     bt_wi_se - the standard errors for bt_wi_m.
     ta_m_se - 2d array with number of treatments/groups rows, and 2 cols where 
     the first col is the average distance between that treatment and all others
     and the second col is the se. 
    """
    dm = single_object_beta(bt, m, tr) #make the sample-sample distance matrix
    samples, data = parse_distmat(dm) #parse dm which is list of strs
    tc = treatment_covering(chosen_samples, category, mf)
    output_marginals = list(tc.keys())  # list so .index() works below
    # make 3 arrays for output, between-within means, between-within ses, 
    # to-all means and ses,
    bt_wi_m = zeros((len(output_marginals),len(output_marginals)))
    bt_wi_se = zeros((len(output_marginals),len(output_marginals)))
    ta_m_se = zeros((len(output_marginals),2))
    for i,t in enumerate(output_marginals): # calculate within and to-all
        ta_m_se[i][0], ta_m_se[i][1] = treatment_dist(tc[t], samples, data)
        bt_wi_m[i][i], bt_wi_se[i][i] = within_treatment_dist(tc[t], samples,
            data)
    for t1, t2 in combinations(output_marginals, 2): #calculate between dists
        t1_ind = output_marginals.index(t1)
        t2_ind = output_marginals.index(t2)
        bt_wi_m[t1_ind][t2_ind], bt_wi_se[t1_ind][t2_ind] = \
            between_treatments_dist(tc[t1], tc[t2], samples, data)
    return output_marginals, bt_wi_m, bt_wi_se, ta_m_se
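The averages described in the docstring come from helpers that are not part of this listing (within_treatment_dist, between_treatments_dist, treatment_dist). The following is only a rough, self-contained sketch of what those between- and within-group averages mean, assuming samples and data have the shape parse_distmat returns (sample id list plus square distance matrix); the group ids and the matrix itself are made up.

# Rough sketch of the between/within averages described above; the helper
# names and data are illustrative, not the project's actual implementation.
from itertools import combinations, product
import numpy as np

def _mean_se(dists):
    d = np.asarray(dists, dtype=float)
    return d.mean(), d.std(ddof=1) / np.sqrt(len(d))

def within_group(ids, samples, data):
    """Average (and SE) over each unordered pair of samples inside one group."""
    idx = {s: i for i, s in enumerate(samples)}
    return _mean_se([data[idx[a], idx[b]] for a, b in combinations(ids, 2)])

def between_groups(ids_a, ids_b, samples, data):
    """Average (and SE) over every pair with one sample from each group."""
    idx = {s: i for i, s in enumerate(samples)}
    return _mean_se([data[idx[a], idx[b]] for a, b in product(ids_a, ids_b)])

samples = ['s1', 's2', 's3', 's4', 's5', 's6']   # as parse_distmat would return
hf, lf = ['s1', 's2', 's3'], ['s4', 's5', 's6']  # hypothetical treatment groups
m = np.random.default_rng(0).random((6, 6))
data = (m + m.T) / 2.0                           # symmetric, made-up distances
np.fill_diagonal(data, 0.0)

print(within_group(hf, samples, data))           # mean/SE inside one group
print(between_groups(hf, lf, samples, data))     # mean/SE across the two groups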
Example #3
def generate_pcoa_cloud_from_point_in_omega(mapping_file_tuple, biom_object,
                                            metric, sequences, iterations, axes,
                                            tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    mapping_file_tuple: data and headers tuple for representing the mapping file
    biom_object: otu table biom object
    metric: string with the name of the beta diversity metric, e.g. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for
    tree_object: tree to perform the beta diversity calculation

    Output:
    WebGL string representing the PCoA plot
    """

    # get a list of the SampleIds
    full_id_list = mapping_file_to_dict(mapping_file_tuple[0], mapping_file_tuple[1]).keys()

    pcoa_list = []
    for i in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_results = pcoa(beta_dm)

        pcoa_list.append(pcoa_results)

    # convert the list of pcoa lines into ellipsoid coords
    ellipse_coords_by_sampleId, sampleId_to_coords = get_pcoa_ellipsoid_coords(
        pcoa_list, axes, full_id_list)
        
    # check that the ellipses were created correctly; an error message string
    # is returned on failure
    if isinstance(ellipse_coords_by_sampleId, str):
        raise ValueError('Could not create PCoA plot')

    webgl_string = make_pcoa_plot(
        ellipse_coords_by_sampleId, mapping_file_tuple,
        sampleId_to_coords['variation explained'])
    return webgl_string
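This variant pulls the SampleIDs out of mapping_file_tuple via mapping_file_to_dict, which is not shown in these examples. The stand-in below only sketches the assumed behaviour: the tuple holds the data rows first and the headers second (as Example #1's docstring suggests), and the helper returns a dict of dicts keyed by SampleID; the rows here are made up.

# Sketch of the mapping-file handling assumed by full_id_list above.
# mapping_file_to_dict_sketch is a stand-in, not the project's own helper.
def mapping_file_to_dict_sketch(map_data, map_headers):
    """Key each row by its SampleID and map the remaining headers to values."""
    return {row[0]: dict(zip(map_headers[1:], row[1:])) for row in map_data}

headers = ['#SampleID', 'Treatment', 'DOB']          # made-up mapping file
data = [['sample.a', 'Control', '20061218'],
        ['sample.b', 'Fast', '20071112']]

full_id_list = list(mapping_file_to_dict_sketch(data, headers).keys())
assert sorted(full_id_list) == ['sample.a', 'sample.b']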
Example #4
    def single_object_beta(self,
                           otu_table,
                           metric,
                           tree_string,
                           missing_sams=None):
        """ running single_file_beta should give same result using --rows"""
        if missing_sams is None:
            missing_sams = []

        metrics = list_known_nonphylogenetic_metrics()
        metrics.extend(list_known_phylogenetic_metrics())

        # new metrics that don't trivially parallelize must be dealt with
        # carefully
        warnings.filterwarnings(
            'ignore', 'dissimilarity binary_dist_chisq is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_chisq is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_gower is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_hellinger is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore', 'unifrac had no information for\
 sample M*')

        # self.files_to_remove.extend([input_path,tree_path])
        # self.folders_to_remove.append(output_dir)
        # os.mkdir(output_dir+'/ft/')

        for metric in metrics:
            # do it
            beta_out = single_object_beta(otu_table,
                                          metric,
                                          tree_string,
                                          rowids=None,
                                          full_tree=False)

            sams, dmtx = parse_distmat(beta_out)

            # do it by rows
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]
                # row_outname = output_dir + '/' + metric + '_' +\
                # in_fname
                r_out = single_object_beta(otu_table,
                                           metric,
                                           tree_string,
                                           rowids=rows,
                                           full_tree=False)
                col_sams, row_sams, row_dmtx = parse_matrix(r_out)

                self.assertEqual(row_dmtx.shape,
                                 (len(rows.split(',')), len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        npt.assert_almost_equal(row_v1, full_v1)

            # full tree run:
            if 'full_tree' in str(metric).lower():
                continue
            # do it by rows with full tree
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]

                #~ row_outname = output_dir + '/ft/' + metric + '_' +\
                #~ in_fname
                r_out = single_object_beta(otu_table,
                                           metric,
                                           tree_string,
                                           rowids=None,
                                           full_tree=True)
                col_sams, row_sams, row_dmtx = parse_matrix(r_out)

                self.assertEqual(row_dmtx.shape,
                                 (len(rows.split(',')), len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        npt.assert_almost_equal(row_v1, full_v1)

            # do it with full tree
            r_out = single_object_beta(otu_table,
                                       metric,
                                       tree_string,
                                       rowids=None,
                                       full_tree=True)
            sams_ft, dmtx_ft = parse_distmat(r_out)
            self.assertEqual(sams_ft, sams)
            npt.assert_almost_equal(dmtx_ft, dmtx)
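The filterwarnings calls in this example work because the message argument is a regular expression matched against the start of each warning's text, so individual metrics can be silenced by name. A small standalone illustration of that behaviour (the warning texts are just examples):

# Standalone illustration of silencing warnings by message pattern, as done in
# the test above.
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # inserted at the front of the filter list, so it is consulted first
    warnings.filterwarnings('ignore', 'dissimilarity dist_gower is not')

    warnings.warn('dissimilarity dist_gower is not parallelized, '
                  'calculating the whole matrix...')   # matches -> suppressed
    warnings.warn('some unrelated warning')            # no match -> recorded

assert len(caught) == 1
assert 'unrelated' in str(caught[0].message)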
Example #5
    def single_object_beta(self, otu_table, metric, tree_string,
                            missing_sams=None):
        """ running single_file_beta should give same result using --rows"""
        if missing_sams is None:
            missing_sams = []
        # setup
        #input_path = get_tmp_filename()
        #in_fname = os.path.split(input_path)[1]
        #f = open(input_path,'w')
        #f.write(otu_table_string)
        #f.close()
        #tree_path = get_tmp_filename()
        #f = open(tree_path,'w')
        #f.write(tree_string)
        #f.close()
        metrics = list_known_nonphylogenetic_metrics()
        metrics.extend(list_known_phylogenetic_metrics())
        #output_dir = get_tmp_filename(suffix = '')
        #os.mkdir(output_dir)

        # new metrics that don't trivially parallelize must be dealt with
        # carefully
        warnings.filterwarnings('ignore','dissimilarity binary_dist_chisq is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore','dissimilarity dist_chisq is not\
 parallelized, calculating the whole matrix...')  
        warnings.filterwarnings('ignore','dissimilarity dist_gower is not\
 parallelized, calculating the whole matrix...')     
        warnings.filterwarnings('ignore','dissimilarity dist_hellinger is\
 not parallelized, calculating the whole matrix...')  
        warnings.filterwarnings('ignore','unifrac had no information for\
 sample M*')

        #self.files_to_remove.extend([input_path,tree_path])
        #self.folders_to_remove.append(output_dir)
        #os.mkdir(output_dir+'/ft/')

        for metric in metrics:
            # do it
            beta_out = single_object_beta(otu_table, metric, 
                                          tree_string,rowids=None,
                                          full_tree=False)
                                          
            sams, dmtx = parse_distmat(beta_out)

            # do it by rows
            for i in range(len(sams)):
                if sams[i] in missing_sams: continue
                rows = sams[i]
                #row_outname = output_dir + '/' + metric + '_' +\
                    #in_fname
                r_out = single_object_beta(otu_table, metric, 
                                          tree_string,rowids=rows,
                                          full_tree=False)
                col_sams, row_sams, row_dmtx = parse_matrix(r_out)

                self.assertEqual(row_dmtx.shape, (len(rows.split(',')),
                    len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j,k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                sams.index(col_sams[k])]
                        self.assertFloatEqual(row_v1, full_v1)


            ### full tree run:
            if 'full_tree' in str(metric).lower(): continue
            # do it by rows with full tree
            for i in range(len(sams)):
                if sams[i] in missing_sams: continue
                rows = sams[i]
                
                #~ row_outname = output_dir + '/ft/' + metric + '_' +\
                    #~ in_fname
                r_out = single_object_beta(otu_table, metric, 
                                          tree_string,rowids=None,
                                          full_tree=True)
                col_sams, row_sams, row_dmtx = parse_matrix(r_out)

                self.assertEqual(row_dmtx.shape, (len(rows.split(',')),
                    len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j,k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                sams.index(col_sams[k])]
                        self.assertFloatEqual(row_v1, full_v1)

            # do it with full tree
            r_out = single_object_beta(otu_table, metric, 
                                          tree_string,rowids=None,
                                          full_tree=True)
            sams_ft, dmtx_ft = parse_distmat(r_out)
            self.assertEqual(sams_ft, sams)
            self.assertFloatEqual(dmtx_ft, dmtx)