def select_samples(map_data, headers, biom_table, depth, unique_id_column,
                   subjects, samples_per_subject):
    """ Select the samples of randomly chosen subjects with enough sequences

    Input:
    map_data: rows of the mapping file without the headers and the comments,
    the first column of each row is used as the SampleId
    headers: headers of the mapping file
    biom_table: table object
    depth: number of sequences per sample
    unique_id_column: column header to identify unique subjects in the mapping
    file i.e. HOST_SUBJECT_ID
    subjects: number of subjects to include in the resulting samples, if fewer
    subjects qualify only those are used
    samples_per_subject: number of samples per subject to include in the
    resulting samples

    Output:
    chosen_samples: a flat list of SampleIds, samples_per_subject consecutive
    entries for each selected subject
    final_biom_table: a biom table object containing only the 'chosen_samples'

    Raises:
    TableException: if filtering biom_table down to the chosen samples fails

    Only the subjects are picked at random (via shuffle); within a subject the
    first samples_per_subject SampleIds, in mapping file order, are kept.
    """
    unique_id_column_index = headers.index(unique_id_column)

    rare_biom_table = get_rare_data(biom_table, depth)
    # build the set once so each mapping row costs a single hash lookup
    rare_sample_ids = set(rare_biom_table.SampleIds)

    # make a dictionary of each subject with its corresponding list of
    # SampleIds, ignoring the samples that are not in the rarefied table
    per_subject_sample_ids = {}
    for row in map_data:
        sample_id = row[0]
        if sample_id not in rare_sample_ids:
            continue
        subject = row[unique_id_column_index]
        per_subject_sample_ids.setdefault(subject, []).append(sample_id)

    # subsampling samples per individual, subjects without enough samples
    # are discarded
    subsampled_ids = {}
    subject_keys = []
    for subject, sample_ids in per_subject_sample_ids.items():
        if len(sample_ids) >= samples_per_subject:
            subsampled_ids[subject] = sample_ids[:samples_per_subject]
            subject_keys.append(subject)

    # subsampling subjects
    shuffle(subject_keys)
    chosen_samples = []
    for subject in subject_keys[:subjects]:
        chosen_samples.extend(subsampled_ids[subject])

    # creating new biom file with only the good samples; the set keeps the
    # per-sample membership test cheap while the returned list keeps order
    chosen_lookup = set(chosen_samples)
    try:
        final_biom_table = biom_table.filterSamples(
            lambda values, sample_id, metadata: sample_id in chosen_lookup)
    except TableException:
        # call-style raise is valid in both Python 2 and Python 3
        raise TableException("Using those parameters there are no subjects "
                             "available in this study, make the selectors "
                             "files are correct")

    return chosen_samples, final_biom_table
def test_get_empty_rare(self):
    """get_rare_data should return nothing when depth exceeds every sample"""
    result = get_rare_data(self.sample_names, self.otu_table, 50,
                           include_small_samples=False)
    kept_sample_ids, rarefied_table = result

    # no sample survives and the resulting table holds no values
    self.assertEqual(len(kept_sample_ids), 0)
    self.assertEqual(rarefied_table.size, 0)
def select_samples(map_data, headers, biom_table, depth, unique_id_column,
                   subjects, samples_per_subject):
    """ Select the samples of randomly chosen subjects with enough sequences

    Input:
    map_data: rows of the mapping file without the headers and the comments,
    the first column of each row is used as the SampleId
    headers: headers of the mapping file
    biom_table: table object
    depth: number of sequences per sample
    unique_id_column: column header to identify unique subjects in the mapping
    file i.e. HOST_SUBJECT_ID
    subjects: number of subjects to include in the resulting samples, if fewer
    subjects qualify only those are used
    samples_per_subject: number of samples per subject to include in the
    resulting samples

    Output:
    chosen_samples: a flat list of SampleIds, samples_per_subject consecutive
    entries for each selected subject
    final_biom_table: a biom table object containing only the 'chosen_samples'

    Raises:
    TableException: if filtering biom_table down to the chosen samples fails

    Only the subjects are picked at random (via shuffle); within a subject the
    first samples_per_subject SampleIds, in mapping file order, are kept.
    """
    unique_id_column_index = headers.index(unique_id_column)

    rare_biom_table = get_rare_data(biom_table, depth)
    # build the set once so each mapping row costs a single hash lookup
    rare_sample_ids = set(rare_biom_table.SampleIds)

    # make a dictionary of each subject with its corresponding list of
    # SampleIds, ignoring the samples that are not in the rarefied table
    per_subject_sample_ids = {}
    for row in map_data:
        sample_id = row[0]
        if sample_id not in rare_sample_ids:
            continue
        subject = row[unique_id_column_index]
        per_subject_sample_ids.setdefault(subject, []).append(sample_id)

    # subsampling samples per individual, subjects without enough samples
    # are discarded
    subsampled_ids = {}
    subject_keys = []
    for subject, sample_ids in per_subject_sample_ids.items():
        if len(sample_ids) >= samples_per_subject:
            subsampled_ids[subject] = sample_ids[:samples_per_subject]
            subject_keys.append(subject)

    # subsampling subjects
    shuffle(subject_keys)
    chosen_samples = []
    for subject in subject_keys[:subjects]:
        chosen_samples.extend(subsampled_ids[subject])

    # creating new biom file with only the good samples; the set keeps the
    # per-sample membership test cheap while the returned list keeps order
    chosen_lookup = set(chosen_samples)
    try:
        final_biom_table = biom_table.filterSamples(
            lambda values, sample_id, metadata: sample_id in chosen_lookup)
    except TableException:
        # call-style raise is valid in both Python 2 and Python 3
        raise TableException("Using those parameters there are no subjects "
                             "available in this study, make the selectors "
                             "files are correct")

    return chosen_samples, final_biom_table
def test_get_11depth_rare(self):
    """get_rare_data at depth 11 should keep only sample X"""
    rarefied = get_rare_data(self.otu_table, 11, include_small_samples=False)
    self.assertEqual(rarefied.SampleIds, ("X",))

    # gather the first entry of every observation vector, one per OTU
    first_counts = []
    for values, _otu_id, _metadata in rarefied.iterObservations():
        first_counts.append(values[0])

    self.assertEqual(first_counts, [1.0, 5.0, 3.0, 2.0])
def test_get_11depth_rare(self):
    """get_rare_data at depth 11 should keep only sample X"""
    kept_sample_ids, rarefied_table = get_rare_data(
        self.sample_names, self.otu_table, 11, include_small_samples=False)
    self.assertEqual(kept_sample_ids, ['X'])

    # reorder the rows by sorted taxon name, then take the first column
    # NOTE(review): comparing arrays with assertEqual presumably relies on an
    # array-aware TestCase base class - confirm against the enclosing class
    taxon_order = numpy.argsort(self.taxon_names)
    first_column = rarefied_table[taxon_order][:, 0]
    self.assertEqual(first_column, numpy.array([5, 1, 3, 2]))
def test_get_11depth_rare(self):
    """get_rare_data at depth 11 should keep only sample X"""
    rarefied = get_rare_data(self.otu_table, 11, include_small_samples=False)
    self.assertEqual(rarefied.ids(), ('X',))

    # gather the first entry of every observation vector, one per OTU
    first_counts = []
    for values, _otu_id, _metadata in rarefied.iter(axis='observation'):
        first_counts.append(values[0])

    self.assertEqual(first_counts, [1.0, 5.0, 3.0, 2.0])
def test_get_overfull_rare(self):
    """get_rare_data should be identical to given in this case

    here, rare depth > any sample, and include_small... = True
    """
    rare_otu_table = get_rare_data(self.otu_table, 50,
                                   include_small_samples=True)

    # 4 observations times 3 samples = size 12 before
    self.assertEqual(len(rare_otu_table.SampleIds), 3)
    self.assertEqual(len(rare_otu_table.ObservationIds), 4)

    # every value must match the input table; the rarefied value is looked
    # up once and reused in the assertion
    for sam in self.otu_table.SampleIds:
        for otu in self.otu_table.ObservationIds:
            rare_val = rare_otu_table.getValueByIds(otu, sam)
            self.assertEqual(rare_val,
                             self.otu_table.getValueByIds(otu, sam))
def test_get_overfull_rare(self):
    """get_rare_data should hand back the input unchanged in this case

    the rare depth is larger than any sample and include_small_samples=True
    """
    kept_sample_ids, rarefied_table = get_rare_data(
        self.sample_names, self.otu_table, 50, include_small_samples=True)

    self.assertEqual(len(kept_sample_ids), 3)
    self.assertEqual(rarefied_table.size, 12)

    # compare cell by cell, locating each sample/taxon in the output by name
    for col, sample in enumerate(self.sample_names):
        for row, taxon in enumerate(self.taxon_names):
            rare_row = self.taxon_names.index(taxon)
            rare_col = kept_sample_ids.index(sample)
            observed = rarefied_table[rare_row, rare_col]
            self.assertEqual(observed, self.otu_table[row, col])
def test_get_overfull_rare(self):
    """get_rare_data should be identical to given in this case

    here, rare depth > any sample, and include_small... = True
    """
    rare_otu_table = get_rare_data(self.otu_table, 50,
                                   include_small_samples=True)

    # 4 observations times 3 samples = size 12 before
    self.assertEqual(len(rare_otu_table.ids()), 3)
    self.assertEqual(len(rare_otu_table.ids(axis='observation')), 4)

    # every value must match the input table; the rarefied value is looked
    # up once and reused in the assertion
    for sam in self.otu_table.ids():
        for otu in self.otu_table.ids(axis='observation'):
            rare_val = rare_otu_table.get_value_by_ids(otu, sam)
            self.assertEqual(rare_val,
                             self.otu_table.get_value_by_ids(otu, sam))
def generate_pcoa_cloud_from_point_in_omega(map_headers, map_data, biom_object,
                                            metric, sequences, iterations,
                                            axes, tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    map_headers: headers of the mapping file
    map_data: data rows of the mapping file
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for (not referenced in this function)
    tree_object: tree to perform the beta diversity calculation

    Output:
    the value returned by make_pcoa_plot, a WebGL string representing the
    PCoA plot

    With a single iteration the parsed coordinates are plotted directly, with
    more iterations they are first summarized by preprocess_coords_file using
    the 'IQR' jackknifing method.
    """
    pcoa_input = {'pcoa_headers': [], 'pcoa_values': [], 'eigenvalues': [],
                  'coords_pct': []}

    for _ in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_results = pcoa(beta_dm)

        # parse_coords reads from a file-like object, so the results go
        # through an in-memory buffer; finally releases it even if parsing
        # fails
        pcoa_file = StringIO()
        try:
            pcoa_file.write(pcoa_results)
            pcoa_file.seek(0)
            pcoa_headers, pcoa_values, eigenvalues, coords_pct = \
                parse_coords(pcoa_file)
        finally:
            pcoa_file.close()

        pcoa_input['pcoa_headers'].append(pcoa_headers)
        pcoa_input['pcoa_values'].append(pcoa_values)
        pcoa_input['eigenvalues'].append(eigenvalues)
        pcoa_input['coords_pct'].append(coords_pct)

    if iterations == 1:
        # nothing to summarize, so there are no low/high bounds
        coords_headers = pcoa_input['pcoa_headers'][0]
        coords_data = pcoa_input['pcoa_values'][0]
        coords_eigenvalues = pcoa_input['eigenvalues'][0]
        coords_pct = pcoa_input['coords_pct'][0]
        coords_low, coords_high = None, None
    else:
        # the last element of the returned tuple is not needed here
        coords_headers, coords_data, coords_eigenvalues, coords_pct, \
            coords_low, coords_high, _ = preprocess_coords_file(
                pcoa_input['pcoa_headers'], pcoa_input['pcoa_values'],
                pcoa_input['eigenvalues'], pcoa_input['coords_pct'],
                map_headers, map_data, custom_axes=None,
                jackknifing_method='IQR', is_comparison=False)

    return make_pcoa_plot(coords_headers, coords_data, coords_eigenvalues,
                          coords_pct, map_headers, map_data, coords_low,
                          coords_high, True)
def generate_pcoa_cloud_from_point_in_omega(mapping_file_tuple, biom_object,
                                            metric, sequences, iterations,
                                            axes, tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    mapping_file_tuple: data and headers tuple for representing the mapping
    file
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for
    tree_object: tree to perform the beta diversity calculation

    Output:
    WebGL string representing the PCoA plot

    Raises:
    ValueError: if the ellipsoid coordinates could not be created
    """
    # get a list of the SampleIds
    mapping_dict = mapping_file_to_dict(mapping_file_tuple[0],
                                        mapping_file_tuple[1])
    full_id_list = list(mapping_dict.keys())

    # one PCoA result per rarefaction of the table
    pcoa_list = []
    for _ in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_list.append(pcoa(beta_dm))

    # convert the list of pcoa lines into ellipsoid coords
    ellipse_coords_by_sampleId, sampleId_to_coords = \
        get_pcoa_ellipsoid_coords(pcoa_list, axes, full_id_list)

    # check the ellipses are created correctly
    # NOTE(review): a string in place of the coords is treated as a failure,
    # presumably an error message - confirm in get_pcoa_ellipsoid_coords
    if isinstance(ellipse_coords_by_sampleId, str):
        # call-style raise is valid in both Python 2 and Python 3
        raise ValueError('Could not create PCoA plot')

    webgl_string = make_pcoa_plot(ellipse_coords_by_sampleId,
                                  mapping_file_tuple,
                                  sampleId_to_coords['variation explained'])
    return webgl_string