def test_make_otu_table_with_sample_metadata(self):
    """make_otu_table attaches sample metadata independent of ID order

    Also checks that a KeyError is raised when the mapping file lacks the
    metadata of a sample that is present in the OTU map.
    """
    # Want to make sure that the order of the sample IDs in the OTU
    # map and the order of the IDs in the mapping file do not matter.
    # One record per OTU, written with explicit escapes so the field
    # delimiters cannot be lost to whitespace normalization.
    # NOTE(review): fields are assumed to be tab-separated; the grouping of
    # sequences per OTU follows the expected count matrix below -- confirm.
    otu_map_lines = ['0\tABC_0\tDEF_1',
                     '1\tABC_1',
                     'x\tGHI_2\tGHI_3\tGHI_77',
                     'z\tDEF_3\tXYZ_1']
    mapping_f = StringIO(MAPPING_FILE)
    sample_ids = ['ABC', 'DEF', 'GHI', 'XYZ']
    # rows are OTUs ('0', '1', 'x', 'z'), columns follow sample_ids
    data = [[1, 1, 0, 0],
            [1, 0, 0, 0],
            [0, 0, 3, 0],
            [0, 1, 0, 1]]

    map_data, map_header, _ = parse_mapping_file(mapping_f)
    sample_metadata = mapping_file_to_dict(map_data, map_header)
    sample_md = [sample_metadata[sample_id] for sample_id in sample_ids]

    obs = make_otu_table(otu_map_lines, sample_metadata=sample_metadata)
    exp = Table(data, ['0', '1', 'x', 'z'], sample_ids,
                sample_metadata=sample_md, input_is_dense=True)
    self.assertEqual(obs, exp)

    # Test with a mapping file that is missing a sample's metadata,
    # make sure it raises the KeyError
    mapping_f = StringIO(MAPPING_FILE_MISSING_SAMPLE)
    map_data, map_header, _ = parse_mapping_file(mapping_f)
    sample_metadata = mapping_file_to_dict(map_data, map_header)

    with self.assertRaises(KeyError):
        make_otu_table(otu_map_lines, sample_metadata=sample_metadata)
def main():
    """Plot per-sample average distances between two sample states.

    Reads the mapping file (opts.map) and the distance matrix
    (opts.distance_matrix). For every value of the opts.colorby category
    (or for all samples at once when opts.colorby is None) it scatter-plots
    the value of opts.axis_category against the result of get_avg_dists for
    the primary-state samples.

    Outputs:
    a figure saved to opts.output_path, and a tab-delimited text file with
    the plotted values written next to it (same name, '.txt' extension).

    Raises:
    RuntimeError if, for some category, either state has no sample present
    in the distance matrix, or both states consist of the same single
    sample.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # the parsed structures are used after the handles are closed
    with open(opts.map, 'U') as map_f:
        map_data, map_header, map_comments = parse_mapping_file(map_f)
    map_dict = mapping_file_to_dict(map_data, map_header)

    with open(opts.distance_matrix, 'U') as distmat_f:
        distdict = parse_distmat_to_dict(distmat_f)

    if opts.colorby is None:
        colorby_cats = [None]
    else:
        colorby_idx = map_header.index(opts.colorby)
        colorby_cats = sorted(set(row[colorby_idx] for row in map_data))

    # only samples that appear in the distance matrix can be plotted
    distmat_ids = set(distdict.keys())

    textfilename = os.path.splitext(opts.output_path)[0] + '.txt'
    plt.figure()
    with open(textfilename, 'w') as text_fh:
        text_fh.write(opts.axis_category + '\tdistance\tSampleID' + '\n')

        for cat_num, cat in enumerate(colorby_cats):
            # collect the primary and secondary samples within this category
            state1_samids, state2_samids = get_sam_ids(
                map_data, map_header, opts.colorby, cat,
                opts.primary_state, opts.secondary_state)
            state1_samids = list(set(state1_samids).intersection(distmat_ids))
            state2_samids = list(set(state2_samids).intersection(distmat_ids))

            if not state1_samids or not state2_samids or \
                    (len(state1_samids) == 1 and
                     state1_samids == state2_samids):
                raise RuntimeError(
                    "one category of samples didn't have any valid"
                    " distances. try eliminating samples from -p or -s,"
                    " or changing"
                    " your mapping file with"
                    " filter_samples_from_otu_table.py")

            # go through dmtx
            state1_avg_dists = get_avg_dists(state1_samids, state2_samids,
                                             distdict)

            # plot
            xvals = [float(map_dict[sam][opts.axis_category])
                     for sam in state1_samids]
            try:
                # float() keeps this a fraction in [0, 1] even when true
                # division is not in effect; it still raises
                # ZeroDivisionError when there is a single category
                color = plt.cm.jet(float(cat_num) / (len(colorby_cats) - 1))
            except ZeroDivisionError:  # only one cat
                color = 'b'
            plt.scatter(xvals, state1_avg_dists, edgecolors=color, alpha=.5,
                        facecolors='none')
            plt.xlabel(opts.axis_category)
            plt.ylabel('average distance')

            lines = [str(xval) + '\t' + str(avg_dist) + '\t' + samid + '\n'
                     for xval, avg_dist, samid in
                     zip(xvals, state1_avg_dists, state1_samids)]
            text_fh.writelines(lines)

    if opts.colorby is not None:
        plt.legend(colorby_cats)
    plt.savefig(opts.output_path)
def main():
    """Build an OTU table from an OTU map and write it as a BIOM file.

    Reads the OTU map (opts.otu_map_fp) and, when given, a taxonomy file
    (opts.taxonomy_fname), a file listing OTU ids to exclude
    (opts.exclude_otus_fp; parsed as fasta when its extension is '.fasta'
    or '.fna', as a sequence id file otherwise) and a mapping file
    (opts.mapping_fp) used as sample metadata.

    Outputs:
    the table returned by make_otu_table, written to opts.output_biom_fp.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # NOTE(review): the handles below are closed as soon as the parsers
    # return; this assumes the parsers consume their input eagerly (as
    # parse_mapping_file is already relied on to do) -- confirm.
    otu_to_taxonomy = None
    if opts.taxonomy_fname:
        with open(opts.taxonomy_fname, 'U') as taxonomy_f:
            otu_to_taxonomy = parse_taxonomy(taxonomy_f)

    ids_to_exclude = []
    exclude_otus_fp = opts.exclude_otus_fp
    if exclude_otus_fp:
        # the file extension decides how the ids are extracted
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            read_ids = get_seq_ids_from_fasta_file
        else:
            read_ids = get_seq_ids_from_seq_id_file
        with open(exclude_otus_fp, 'U') as exclude_f:
            ids_to_exclude = read_ids(exclude_f)

    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)
        sample_metadata = mapping_file_to_dict(mapping_data,
                                               mapping_header)

    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude,
                                        sample_metadata=sample_metadata)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
def format_mapping_file_to_js(mapping_file_data, mapping_file_headers,
                              columns):
    """Render the mapping file as two javascript variable declarations

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file, the first header is
    treated as the sample identifier column
    columns: valid columns to use, usually a subset of mapping_file_headers;
    only the presence of 'SampleID' in it is checked here

    Outputs:
    string: javascript code declaring g_mappingFileHeaders (quoted header
    names) and g_mappingFileData (sample id -> list of quoted values)
    """
    sample_to_metadata = mapping_file_to_dict(mapping_file_data,
                                              mapping_file_headers)

    # every header but the first one, which holds the sample identifiers
    metadata_headers = mapping_file_headers[1:]
    include_sample_id = 'SampleID' in columns

    entries = []
    for sample_id, metadata in sample_to_metadata.items():
        quoted = ["'%s'" % metadata[name] for name in metadata_headers]
        if include_sample_id:
            # the sample id leads the list of values when it was requested
            quoted.insert(0, "'%s'" % sample_id)
        entries.append("'%s': [%s]" % (sample_id, ','.join(quoted)))

    if include_sample_id:
        exported_headers = mapping_file_headers
    else:
        exported_headers = metadata_headers

    # format the mapping file as javascript objects
    headers_js = 'var g_mappingFileHeaders = [%s];\n' % ','.join(
        ["'%s'" % name for name in exported_headers])
    data_js = 'var g_mappingFileData = { %s };\n' % ','.join(entries)

    return headers_js + data_js
def main():
    """Plot per-sample average distances between two sample states.

    Reads the mapping file (opts.map) and the distance matrix
    (opts.distance_matrix). For every value of the opts.colorby category
    (or for all samples at once when opts.colorby is None) it scatter-plots
    the value of opts.axis_category against the result of get_avg_dists for
    the primary-state samples.

    Outputs:
    a figure saved to opts.output_path, and a tab-delimited text file with
    the plotted values written next to it (same name, '.txt' extension).

    Raises:
    RuntimeError if, for some category, either state has no sample present
    in the distance matrix, or both states consist of the same single
    sample.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # the parsed structures are used after the handles are closed
    with open(opts.map, 'U') as map_f:
        map_data, map_header, map_comments = parse_mapping_file(map_f)
    map_dict = mapping_file_to_dict(map_data, map_header)

    with open(opts.distance_matrix, 'U') as distmat_f:
        distdict = parse_distmat_to_dict(distmat_f)

    if opts.colorby is None:
        colorby_cats = [None]
    else:
        colorby_idx = map_header.index(opts.colorby)
        colorby_cats = sorted(set(row[colorby_idx] for row in map_data))

    # only samples that appear in the distance matrix can be plotted
    distmat_ids = set(distdict.keys())

    textfilename = os.path.splitext(opts.output_path)[0] + '.txt'
    plt.figure()
    with open(textfilename, 'w') as text_fh:
        text_fh.write(opts.axis_category + '\tdistance\tSampleID' + '\n')

        for cat_num, cat in enumerate(colorby_cats):
            # collect the primary and secondary samples within this category
            state1_samids, state2_samids = get_sam_ids(
                map_data, map_header, opts.colorby, cat,
                opts.primary_state, opts.secondary_state)
            state1_samids = list(set(state1_samids).intersection(distmat_ids))
            state2_samids = list(set(state2_samids).intersection(distmat_ids))

            if not state1_samids or not state2_samids or \
                    (len(state1_samids) == 1 and
                     state1_samids == state2_samids):
                raise RuntimeError(
                    "one category of samples didn't have any valid"
                    " distances. try eliminating samples from -p or -s,"
                    " or changing"
                    " your mapping file with"
                    " filter_samples_from_otu_table.py")

            # go through dmtx
            state1_avg_dists = get_avg_dists(
                state1_samids, state2_samids, distdict)

            # plot
            xvals = [float(map_dict[sam][opts.axis_category])
                     for sam in state1_samids]
            try:
                # float() keeps this a fraction in [0, 1] even when true
                # division is not in effect; it still raises
                # ZeroDivisionError when there is a single category
                color = plt.cm.jet(float(cat_num) / (len(colorby_cats) - 1))
            except ZeroDivisionError:  # only one cat
                color = 'b'
            plt.scatter(xvals, state1_avg_dists, edgecolors=color, alpha=.5,
                        facecolors='none')
            plt.xlabel(opts.axis_category)
            plt.ylabel('average distance')

            lines = [str(xval) + '\t' + str(avg_dist) + '\t' + samid + '\n'
                     for xval, avg_dist, samid in
                     zip(xvals, state1_avg_dists, state1_samids)]
            text_fh.writelines(lines)

    if opts.colorby is not None:
        plt.legend(colorby_cats)
    plt.savefig(opts.output_path)
def test_mapping_file_to_dict(self):
    """mapping_file_to_dict functions as expected"""
    s1 = ['#sample\ta\tb', '#comment line to skip',
          'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
    # parse_mapping_file returns (map_data, header, comments); only the
    # first two are inputs of mapping_file_to_dict
    map_data, header = parse_mapping_file(s1)[:2]

    obs = mapping_file_to_dict(map_data, header)
    # the first column keys the outer dict, the remaining headers key the
    # per-sample dicts
    exp = {'x': {'a': 'y', 'b': 'z'}, 'i': {'a': 'j', 'b': 'k'}}
    self.assertEqual(obs, exp)
def main():
    """Collapse the samples of an OTU table by a mapping file category.

    Reads the mapping file (opts.mapping_fp) and the BIOM table
    (opts.otu_table_fp), attaches the mapping file contents as sample
    metadata and collapses the samples that share a value of
    opts.mapping_category. Several columns can be combined into the
    category by joining their names with '&&'.

    Outputs:
    the collapsed table (normalized per sample when opts.normalize is set)
    written to opts.output_fp.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample should be placed into
    def bin_function(id_, sample_metadata):
        return sample_metadata[mapping_category]

    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    with open(mapping_fp, 'U') as mapping_f:
        mapping, headers, comments = parse_mapping_file(mapping_f)

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        # combine_map_label_cols is handed the header row followed by the
        # data rows
        new_mapping = [headers] + list(mapping)
        # Create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)

    with biom_open(otu_table_fp, 'U') as biom_file:
        table = parse_biom_table(biom_file)
    table.add_metadata(sample_metadata)

    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapse(bin_function, norm=False, min_group_size=1,
                            axis='sample')

    # normalize the result if requested by the user
    if normalize:
        result.norm(axis='sample', inplace=True)

    # write a new BIOM file
    write_biom_table(result, output_fp)
def generate_pcoa_cloud_from_point_in_omega(mapping_file_tuple, biom_object,
                                            metric, sequences, iterations,
                                            axes, tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    mapping_file_tuple: data and headers tuple for representing the mapping
    file
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for
    tree_object: tree to perform the beta diversity calculation

    Output:
    WebGL string representing the PCoA plot

    Raises:
    ValueError if get_pcoa_ellipsoid_coords returns a string instead of the
    ellipsoid coordinates
    """
    # get a list of the SampleIds; list() guarantees a real list even where
    # dict.keys() returns a view
    full_id_list = list(mapping_file_to_dict(mapping_file_tuple[0],
                                             mapping_file_tuple[1]).keys())

    # one rarefied table -> distance matrix -> PCoA per iteration
    pcoa_list = []
    for _ in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_list.append(pcoa(beta_dm))

    # convert the list of pcoa lines into ellipsoid coords
    ellipse_coords_by_sampleId, sampleId_to_coords = \
        get_pcoa_ellipsoid_coords(pcoa_list, axes, full_id_list)

    # check the ellipses are created correctly, a string in place of the
    # coordinates is treated as a failure
    if isinstance(ellipse_coords_by_sampleId, str):
        raise ValueError('Could not create PCoA plot')

    webgl_string = make_pcoa_plot(ellipse_coords_by_sampleId,
                                  mapping_file_tuple,
                                  sampleId_to_coords['variation explained'])
    return webgl_string
def format_mapping_file_to_js(mapping_file_data, mapping_file_headers,
                              columns):
    """Build the javascript declarations that describe a mapping file

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file, the first one being
    the sample identifier column
    columns: valid columns to use, usually a subset of mapping_file_headers;
    this function only looks for 'SampleID' in it

    Outputs:
    string: two javascript statements, one assigning the quoted headers to
    g_mappingFileHeaders and one assigning the per-sample quoted values to
    g_mappingFileData
    """
    def _quoted(value):
        # wrap a value in single quotes, as a javascript string literal
        return "'%s'" % value

    per_sample = mapping_file_to_dict(mapping_file_data,
                                      mapping_file_headers)
    with_sample_id = 'SampleID' in columns

    rows = []
    for sample, fields in per_sample.items():
        cells = [_quoted(fields[header])
                 for header in mapping_file_headers[1:]]
        if with_sample_id:
            cells = [_quoted(sample)] + cells
        rows.append("'%s': [%s]" % (sample, ','.join(cells)))

    # the identifier header is only exported when it was asked for
    if with_sample_id:
        js_headers = mapping_file_headers
    else:
        js_headers = mapping_file_headers[1:]

    # format the mapping file as javascript objects
    statements = [
        'var g_mappingFileHeaders = [%s];\n' % ','.join(
            [_quoted(header) for header in js_headers]),
        'var g_mappingFileData = { %s };\n' % ','.join(rows),
    ]
    return ''.join(statements)
if '&&' in col: for _col in col.split('&&'): if _col not in lookup_header: offending_fields.append(col) elif col not in lookup_header: offending_fields.append(col) else: # if the user didn't specify the header names display everything color_by_column_names = header[:] # extract a list of the custom axes provided and each element is numeric if custom_axes: custom_axes = custom_axes.strip().strip("'").strip('"').split(',') # the MetadataMap object makes some checks easier map_object = MetadataMap(mapping_file_to_dict(mapping_data, header), []) for axis in custom_axes: # append the field to the error queue that it belongs to if axis not in lookup_header: offending_fields.append(axis) break # make sure this value is in the mapping file elif axis not in color_by_column_names: color_by_column_names.append(axis) # perform only if the for loop does not call break else: # make sure all these axes are numeric for axis in custom_axes: if map_object.isNumericCategory(axis) == False: non_numeric_categories.append(axis)
def preprocess_mapping_file(data, headers, columns, unique=False,
                            single=False, clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    The inputs are not modified, merged columns are added to copies of
    data and headers.

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the
    delimited columns. 'SampleID' is prepended when it is not the first
    element.
    unique: remove columns where all values are unique
    single: remove columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers

    Raises:
    ValueError if a column to merge names a header that is not in headers
    """
    # The sample ID must always be there, else it's meaningless data
    if not columns or columns[0] != 'SampleID':
        columns = ['SampleID'] + list(columns)

    # work on copies: appending the merged columns straight to the caller's
    # rows and header list would make them pile up on repeated calls
    data = [list(line) for line in data]
    headers = list(headers)

    # process concatenated columns if needed
    merge = [column for column in columns if '&&' in column]

    # each element needs several columns to be merged
    for new_column in merge:
        indices = [headers.index(header_name)
                   for header_name in new_column.split('&&')]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or singled valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [
                column_name for column_name in headers[1:]
                if metadata.hasUniqueCategoryValues(column_name)]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [
                column_name for column_name in headers[1:]
                if metadata.hasSingleCategoryValue(column_name)]
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers,
                                                       columns_to_remove,
                                                       negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a
    # suffix
    if clones:
        out_data = []
        for index in range(clones):
            out_data.extend([[element[0] + '_%d' % index] + element[1:]
                             for element in data])
        data = out_data

    return data, headers
def preprocess_mapping_file(data, headers, columns, unique=False,
                            single=False, clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    The inputs are not modified, merged columns are added to copies of
    data and headers.

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the
    delimited columns. "SampleID" is prepended when it is not the first
    element.
    unique: remove columns where all values are unique
    single: remove columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers

    Raises:
    ValueError if a column to merge names a header that is not in headers
    """
    # The sample ID must always be there, else it's meaningless data
    if not columns or columns[0] != "SampleID":
        columns = ["SampleID"] + list(columns)

    # work on copies: appending the merged columns straight to the caller's
    # rows and header list would make them pile up on repeated calls
    data = [list(line) for line in data]
    headers = list(headers)

    # process concatenated columns if needed
    merge = [column for column in columns if "&&" in column]

    # each element needs several columns to be merged
    for new_column in merge:
        indices = [headers.index(header_name)
                   for header_name in new_column.split("&&")]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append("".join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or singled valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [
                column_name for column_name in headers[1:]
                if metadata.hasUniqueCategoryValues(column_name)]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [
                column_name for column_name in headers[1:]
                if metadata.hasSingleCategoryValue(column_name)]
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers,
                                                       columns_to_remove,
                                                       negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a
    # suffix
    if clones:
        out_data = []
        for index in range(clones):
            out_data.extend([[element[0] + "_%d" % index] + element[1:]
                             for element in data])
        data = out_data

    return data, headers