Example #1
def select_samples(map_data, headers, biom_table, depth, unique_id_column,
                   subjects, samples_per_subject):
    """
    Randomly select a list of sample IDs with enough sequences per subject

    Input:
    map_data: rows of the mapping file without the headers and the comments
    headers: headers of the mapping file
    biom_table: BIOM table object
    depth: number of sequences per sample
    unique_id_column: column header used to identify unique subjects in the
    mapping file, e.g. HOST_SUBJECT_ID
    subjects: number of subjects to include in the resulting samples
    samples_per_subject: number of samples per subject to include in the
    resulting samples

    Output:
    chosen_samples: a flat list of sample IDs with samples_per_subject samples
    for each of the randomly chosen subjects
    final_biom_table: a biom table object containing only the 'chosen_samples'
    """

    unique_id_column_index = headers.index(unique_id_column)
    rare_biom_table = get_rare_data(biom_table, depth)

    # make a dictionary of each subject with its corresponding list of SampleIds
    per_subject_sample_ids = {}
    for row in map_data:
        if row[0] not in rare_biom_table.SampleIds:
            continue

        if row[unique_id_column_index] not in per_subject_sample_ids:
            per_subject_sample_ids[row[unique_id_column_index]] = []
        per_subject_sample_ids[row[unique_id_column_index]].append(row[0])

    # subsampling samples per individual
    subsampled_ids = {}
    subject_keys = []
    for k, v in per_subject_sample_ids.items():
        if len(v) >= samples_per_subject:
            subsampled_ids[k] = v[:samples_per_subject]
            subject_keys.append(k)

    # subsampling subjects
    shuffle(subject_keys)
    chosen_samples = []
    for k in subject_keys[:subjects]:
        chosen_samples.extend(subsampled_ids[k])

    # creating new biom file with only the good samples
    try:
        final_biom_table = biom_table.filterSamples(
            lambda v, id, md: id in chosen_samples)
    except TableException:
        raise TableException, "Using those parameters there are no subjects "+\
            "available in this study, make the selectors files are correct"

    return chosen_samples, final_biom_table
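A minimal usage sketch for the function above, assuming QIIME-style inputs and the biom-format 1.x API (matching the filterSamples call); the mapping rows, the HOST_SUBJECT_ID column, the file name, and the parameter values are all hypothetical:

from biom.parse import parse_biom_table

# Hypothetical mapping data: SampleID plus the column that identifies subjects
headers = ['SampleID', 'HOST_SUBJECT_ID']
map_data = [['S1', 'subjectA'], ['S2', 'subjectA'],
            ['S3', 'subjectB'], ['S4', 'subjectB']]

# Load an existing BIOM table from disk (biom-format 1.x parser)
biom_table = parse_biom_table(open('otu_table.biom', 'U'))

# Keep 2 subjects with 2 samples each, rarefying at 1000 sequences per sample
chosen_samples, final_biom_table = select_samples(
    map_data, headers, biom_table, depth=1000,
    unique_id_column='HOST_SUBJECT_ID', subjects=2, samples_per_subject=2)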
Example #2
    def test_get_empty_rare(self):
        """get_rare_data should be empty when depth > # seqs in any sample"""
        rare_sample_ids, rare_otu_table = get_rare_data(
            self.sample_names, self.otu_table,
            50, include_small_samples=False)
        self.assertEqual(len(rare_sample_ids), 0)
        self.assertEqual(rare_otu_table.size, 0)
Example #3
def select_samples(map_data, headers, biom_table, depth, unique_id_column,
                   subjects, samples_per_subject):
    """
    Randomly select a list of sample IDs with enough sequences per subject

    Input:
    map_data: rows of the mapping file without the headers and the comments
    headers: headers of the mapping file
    biom_table: BIOM table object
    depth: number of sequences per sample
    unique_id_column: column header used to identify unique subjects in the
    mapping file, e.g. HOST_SUBJECT_ID
    subjects: number of subjects to include in the resulting samples
    samples_per_subject: number of samples per subject to include in the
    resulting samples

    Output:
    chosen_samples: a flat list of sample IDs with samples_per_subject samples
    for each of the randomly chosen subjects
    final_biom_table: a biom table object containing only the 'chosen_samples'
    """

    unique_id_column_index = headers.index(unique_id_column)
    rare_biom_table = get_rare_data(biom_table, depth)

    # make a dictionary of each subject with its corresponding list of SampleIds
    per_subject_sample_ids = {}
    for row in map_data:
        if row[0] not in rare_biom_table.SampleIds:
            continue

        if row[unique_id_column_index] not in per_subject_sample_ids:
            per_subject_sample_ids[row[unique_id_column_index]] = []
        per_subject_sample_ids[row[unique_id_column_index]].append(row[0])

    # subsampling samples per individual
    subsampled_ids = {}
    subject_keys = []
    for k, v in per_subject_sample_ids.items():
        if len(v) >= samples_per_subject:
            subsampled_ids[k] = v[:samples_per_subject]
            subject_keys.append(k)

    # subsampling subjects
    shuffle(subject_keys)
    chosen_samples = []
    for k in subject_keys[:subjects]:
        chosen_samples.extend(subsampled_ids[k])
    
    # creating new biom file with only the good samples
    try:
        final_biom_table = biom_table.filterSamples(
            lambda v, id, md: id in chosen_samples)
    except TableException:
        raise TableException("Using these parameters there are no subjects "
                             "available in this study; make sure the selector "
                             "files are correct")

    return chosen_samples, final_biom_table
Example #4
    def test_get_11depth_rare(self):
        """get_rare_data should get only sample X

        """
        rare_otu_table = get_rare_data(self.otu_table, 11, include_small_samples=False)
        self.assertEqual(rare_otu_table.SampleIds, ("X",))

        # a very complicated way to test things
        rare_values = [val[0] for (val, otu_id, meta) in rare_otu_table.iterObservations()]
        self.assertEqual(rare_values, [1.0, 5.0, 3.0, 2.0])
Example #5
    def test_get_11depth_rare(self):
        """get_rare_data should get only sample X

        """
        rare_sample_ids, rare_otu_table = get_rare_data(
            self.sample_names, self.otu_table,
            11, include_small_samples=False)
        self.assertEqual(rare_sample_ids, ['X'])
        self.assertEqual(rare_otu_table[numpy.argsort(self.taxon_names)][:,0], 
            numpy.array([5,1,3,2]))
Example #6
    def test_get_11depth_rare(self):
        """get_rare_data should get only sample X

        """
        rare_otu_table = get_rare_data(self.otu_table,
                                       11, include_small_samples=False)
        self.assertEqual(rare_otu_table.ids(), ('X',))

        # a very complicated way to test things
        rare_values = [val[0]
                       for (val, otu_id, meta) in rare_otu_table.iter(axis='observation')]
        self.assertEqual(rare_values, [1.0, 5.0, 3.0, 2.0])
Example #7
    def test_get_overfull_rare(self):
        """get_rare_data should be identical to given in this case

        here, rare depth > any sample, and include_small... = True"""
        rare_otu_table = get_rare_data(self.otu_table, 50, include_small_samples=True)
        self.assertEqual(len(rare_otu_table.SampleIds), 3)
        # 4 observations times 3 samples = size 12 before
        self.assertEqual(len(rare_otu_table.ObservationIds), 4)
        for sam in self.otu_table.SampleIds:
            for otu in self.otu_table.ObservationIds:
                rare_val = rare_otu_table.getValueByIds(otu, sam)
                self.assertEqual(rare_val, self.otu_table.getValueByIds(otu, sam))
Example #8
    def test_get_overfull_rare(self):
        """get_rare_data should be identical to given in this case

        here, rare depth > any sample, and include_small... = True"""
        rare_sample_ids, rare_otu_table = get_rare_data(
            self.sample_names, self.otu_table,
            50, include_small_samples=True)
        self.assertEqual(len(rare_sample_ids), 3)
        self.assertEqual(rare_otu_table.size, 12)
        for i, sam in enumerate(self.sample_names):
            for j, otu in enumerate(self.taxon_names):
                rare_val = rare_otu_table[self.taxon_names.index(otu),
                                          rare_sample_ids.index(sam)]
                self.assertEqual(rare_val, self.otu_table[j, i])
Example #9
    def test_get_overfull_rare(self):
        """get_rare_data should be identical to given in this case

        here, rare depth > any sample, and include_small... = True"""
        rare_otu_table = get_rare_data(self.otu_table,
                                       50, include_small_samples=True)
        self.assertEqual(len(rare_otu_table.ids()), 3)
        # 4 observations times 3 samples = size 12 before
        self.assertEqual(len(rare_otu_table.ids(axis='observation')), 4)
        for sam in self.otu_table.ids():
            for otu in self.otu_table.ids(axis='observation'):
                rare_val = rare_otu_table.get_value_by_ids(otu, sam)
                self.assertEqual(rare_otu_table.get_value_by_ids(otu, sam),
                                 self.otu_table.get_value_by_ids(otu, sam))
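The two versions of this test exercise the same check against different biom-format APIs. As a rough reference, the attribute-style calls (biom-format 1.x) used in the earlier snippets correspond to the method-style calls (biom-format 2.x) used in the later ones, assuming a loaded table object named table:

# biom-format 1.x (attribute style)        biom-format 2.x (method style)
# table.SampleIds                      ->  table.ids()
# table.ObservationIds                 ->  table.ids(axis='observation')
# table.getValueByIds(otu, sam)        ->  table.get_value_by_ids(otu, sam)
# table.iterObservations()             ->  table.iter(axis='observation')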
Example #10
def generate_pcoa_cloud_from_point_in_omega(map_headers, map_data, biom_object,
                                            metric, sequences, iterations, axes,
                                            tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    map_headers: headers of the mapping file
    map_data: rows of the mapping file without the headers and the comments
    biom_object: OTU table biom object
    metric: name of the beta diversity metric as a string, e.g. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for
    tree_object: tree to perform the beta diversity calculation

    Output:
    WebGL string representing the PCoA plot
    """
    
    pcoa_input = {'pcoa_headers':[], 'pcoa_values':[], 'eigenvalues':[], 'coords_pct':[]}
    for i in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_results = pcoa(beta_dm)

        pcoa_file = StringIO()
        pcoa_file.write(pcoa_results)
        pcoa_file.seek(0)
        pcoa_headers, pcoa_values, eigenvalues, coords_pct = parse_coords(pcoa_file)
        pcoa_file.close()
        pcoa_input['pcoa_headers'].append(pcoa_headers)
        pcoa_input['pcoa_values'].append(pcoa_values)
        pcoa_input['eigenvalues'].append(eigenvalues)
        pcoa_input['coords_pct'].append(coords_pct)
    
    if iterations == 1:
        coords_headers = pcoa_input['pcoa_headers'][0]
        coords_data = pcoa_input['pcoa_values'][0]
        coords_eigenvalues = pcoa_input['eigenvalues'][0]
        coords_pct = pcoa_input['coords_pct'][0]
        coords_low, coords_high = None, None
    else:
        (coords_headers, coords_data, coords_eigenvalues, coords_pct, coords_low,
         coords_high, clones) = preprocess_coords_file(
            pcoa_input['pcoa_headers'], pcoa_input['pcoa_values'],
            pcoa_input['eigenvalues'], pcoa_input['coords_pct'],
            map_headers, map_data, custom_axes=None,
            jackknifing_method='IQR', is_comparison=False)
    
    return make_pcoa_plot(coords_headers, coords_data, coords_eigenvalues,
                          coords_pct, map_headers, map_data, coords_low,
                          coords_high, True)
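A hypothetical call sketch for the signature in this example; map_headers and map_data would come from a parsed QIIME mapping file, biom_object from a loaded BIOM table, and tree_object from a parsed phylogenetic tree (needed for 'unifrac'), with all names and values below chosen only for illustration:

# All inputs are assumed to be prepared elsewhere; values are illustrative only.
webgl_string = generate_pcoa_cloud_from_point_in_omega(
    map_headers, map_data, biom_object, metric='unifrac',
    sequences=1000, iterations=10, axes=3, tree_object=tree_object)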
Example #11
def generate_pcoa_cloud_from_point_in_omega(mapping_file_tuple, biom_object,
                                            metric, sequences, iterations, axes,
                                            tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    mapping_file_tuple: data and headers tuple for representing the mapping file
    biom_object: otu table biom object
    metric: name of the beta diversity metric as a string, e.g. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for
    tree_object: tree to perform the beta diversity calculation

    Output:
    WebGL string representing the PCoA plot
    """

    # get a list of the SampleIds
    full_id_list = mapping_file_to_dict(mapping_file_tuple[0],
                                        mapping_file_tuple[1]).keys()

    pcoa_list = []
    for i in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_results = pcoa(beta_dm)

        pcoa_list.append(pcoa_results)

    # convert the list of pcoa lines into ellipsoid coords
    ellipse_coords_by_sampleId, sampleId_to_coords = get_pcoa_ellipsoid_coords(
        pcoa_list, axes, full_id_list)
        
    # check the ellipses are created correctly
    if isinstance(ellipse_coords_by_sampleId, str):
        raise ValueError('Could not create PCoA plot')

    webgl_string = make_pcoa_plot(ellipse_coords_by_sampleId, mapping_file_tuple,
                                  sampleId_to_coords['variation explained'])
    return webgl_string