def main():
    """Append computed columns to a mapping file.

    Applies the operations described by --categories to the input mapping
    file columns and writes the augmented mapping file to --output_fp.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    mapping_fp = opts.input_fp
    categories = opts.categories
    header_names = opts.categories_header_names
    output_fp = opts.output_fp

    # each requested category operation needs exactly one output header name;
    # replaced the original unhelpful message with an actionable one
    if len(categories) != len(header_names):
        option_parser.error('The number of category operations and the number '
                            'of header names must match.')

    data, headers, _ = parse_mapping_file(open(mapping_fp, 'U'))

    # compute the new columns and append them to headers/data
    # (removed a block of commented-out legacy code that duplicated this call)
    headers, data = apply_operation_on_mapping_file_columns(
        headers, data, categories, header_names)

    lines = format_mapping_file(headers, data)
    fd = open(output_fp, 'w')
    fd.writelines(lines)
    fd.close()
def create_replicated_mapping_file(map_f, num_replicates, sample_ids):
    """Returns a formatted mapping file with replicated sample IDs.

    Each sample ID will have an ascending integer appended to it from the
    range [0, num_replicates - 1]. For example, if there are two input sample
    IDs, S1 and S2, with 3 replicates each, the output will be:
        S1.0
        S1.1
        S1.2
        S2.0
        S2.1
        S2.2

    All other metadata columns are copied unchanged, and the order of the
    input sample IDs is preserved.

    Arguments:
        map_f - input mapping file to replicate (file-like object)
        num_replicates - number of replicates at each sample
        sample_ids - only sample IDs in the mapping file that are in this
            list will be replicated; others are dropped from the output
    """
    if num_replicates < 1:
        raise ValueError("Must specify at least one sample replicate (was "
                         "provided %d)." % num_replicates)

    map_data, header, comments = parse_mapping_file(map_f)

    # one output row per (kept input row, replicate index), in input order
    replicated_rows = [
        ['%s.%i' % (row[0], rep_idx)] + row[1:]
        for row in map_data
        if row[0] in sample_ids
        for rep_idx in range(num_replicates)
    ]

    return format_mapping_file(header, replicated_rows, comments)
def main():
    """Filter a mapping file down to a set of valid sample IDs.

    The IDs come either from a sample-id file (--sample_id_fp) or from a
    metadata state description (--valid_states) applied to the mapping file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    mapping_fp = opts.input_fp
    out_mapping_fp = opts.output_fp
    valid_states = opts.valid_states

    if opts.sample_id_fp:
        valid_sample_ids = get_seqs_to_keep_lookup_from_seq_id_file(
            open(opts.sample_id_fp, 'U'))
    elif mapping_fp and valid_states:
        valid_sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        # BUG FIX: without this branch, valid_sample_ids was undefined below
        # (NameError) when neither ID source was supplied
        option_parser.error('Must provide either --sample_id_fp or both '
                            '--input_fp and --valid_states.')

    data, headers, _ = parse_mapping_file(open(mapping_fp, 'U'))

    # keep only the rows whose sample ID (first column) is valid
    good_mapping_file = [line for line in data if line[0] in valid_sample_ids]

    lines = format_mapping_file(headers, good_mapping_file)
    fd = open(out_mapping_fp, 'w')
    fd.write(lines)
    fd.close()
def create_personal_mapping_file(map_as_list, header, comments,
                                 personal_id_of_interest, output_fp,
                                 personal_id_index, individual_titles):
    """ creates mapping file on a per-individual basis

    Appends a new column to each row: individual_titles[0] (default 'Self')
    for rows whose personal-id column matches personal_id_of_interest,
    individual_titles[1] (default 'Other') for all other rows. Writes the
    formatted result to output_fp and returns the augmented rows.

    Arguments:
        map_as_list - mapping file rows (list of lists); not mutated
        header - mapping file headers
        comments - mapping file comments
        personal_id_of_interest - value to match in the personal-id column
        output_fp - path of the per-individual mapping file to write
        personal_id_index - column index of the personal id
        individual_titles - None, or a comma-separated pair such as 'Me,Them'
    """
    # idiom fix: compare to None with `is`, not `==`
    if individual_titles is None:
        individual_titles = ['Self', 'Other']
    else:
        individual_titles = individual_titles.split(',')

    # copy each row so the caller's data is not mutated by the appends below
    personal_map = [row[:] for row in map_as_list]
    for row in personal_map:
        if row[personal_id_index] == personal_id_of_interest:
            row.append(individual_titles[0])
        else:
            row.append(individual_titles[1])

    personal_mapping_file = format_mapping_file(header, personal_map, comments)
    output_f = open(output_fp, 'w')
    output_f.write(personal_mapping_file)
    output_f.close()
    return personal_map
def split_mapping_file_on_field(mapping_f, mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field

    Yields (value_as_filename_string, formatted_mapping_file_string) for each
    distinct value of mapping_field.

    Arguments:
        mapping_f - mapping file lines (iterable)
        mapping_field - header whose values partition the file
        column_rename_ids - optional header used to rename sample IDs
        include_repeat_cols - passed through to filter_mapping_file
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            # BUG FIX: report the field that actually failed the lookup
            # (column_rename_ids), not mapping_field
            raise KeyError("Field is not in mapping file (search is case "+\
             "and white-space sensitive). \n\tProvided field: "+\
             "%s. \n\tValid fields: %s" % (column_rename_ids,
                                           ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))
        # parse mapping file each time though the loop as filtering operates
        # on values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
def main():
    """Merge mapping file columns.

    Each --columns_to_merge entry is a '&&'-joined list of headers; the
    merged column (values concatenated) is appended to the mapping file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    columns_to_merge = opts.columns_to_merge
    mapping_fp = opts.mapping_fp
    output_fp = opts.output_fp

    try:
        data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    except Exception:
        # narrowed from a bare except so SystemExit/KeyboardInterrupt pass
        option_parser.error('Bro, that doesn\'t look like a mapping file')

    for merging in columns_to_merge:
        # BUG FIX: materialize the indices as a list; under Python 3 a lazy
        # map() object would be exhausted by the headers comprehension and
        # leave the per-row loop below with nothing to iterate
        indices = [headers.index(col) for col in merging.split('&&')]

        # append the merged header and the merged value for every row
        headers.append(''.join([headers[element] for element in indices]))
        for line in data:
            line.append(''.join([line[element] for element in indices]))

    # this should never happen
    assert len(headers) == len(data[0]), "Something went horribly wrong, "+\
        "that's what you get for using non-unit-tested software"

    lines = format_mapping_file(headers, data, comments)
    fd = open(output_fp, 'w')
    fd.writelines(lines)
    fd.close()
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field

    Yields (value_as_filename_string, formatted_mapping_file_string) for each
    distinct value of mapping_field.

    Arguments:
        mapping_f - mapping file lines (iterable)
        mapping_field - header whose values partition the file
        column_rename_ids - optional header used to rename sample IDs
        include_repeat_cols - passed through to filter_mapping_file
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            # BUG FIX: report the field that actually failed the lookup
            # (column_rename_ids), not mapping_field
            raise KeyError("Field is not in mapping file (search is case " +
                           "and white-space sensitive). \n\tProvided field: " +
                           "%s. \n\tValid fields: %s" %
                           (column_rename_ids, ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))
        # parse mapping file each time though the loop as filtering operates
        # on values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
def main():
    """Filter samples from a BIOM table (and optionally its mapping file).

    Samples are kept by metadata description, explicit sample-id list,
    and/or per-sample count bounds; the filtered table is written as BIOM.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_fp = opts.output_fp
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # --valid_states is only meaningful together with a mapping file
    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    # at least one filtering criterion must be requested
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination "
            "of those).")

    # the two ID-selection mechanisms are mutually exclusive
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")

    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, "U"), valid_states)
        # negation only applies to an explicit sample-id list, not to a
        # metadata description
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

    # intersect with the explicit sample-id list if one was provided
    # (cannot co-occur with the metadata branch; see the error check above)
    if sample_id_fp is not None:
        o = open(sample_id_fp, "U")
        sample_id_f_ids = set([l.strip().split()[0]
                               for l in o if not l.startswith("#")])
        o.close()
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table,
        sample_ids_to_keep,
        min_count,
        max_count,
        negate_ids_to_keep=negate_sample_id_fp)
    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, "U"))
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, "w").write(
            format_mapping_file(mapping_headers, mapping_data))
def main():
    """Filter samples from a BIOM table (and optionally its mapping file).

    Variant without the negate/--negate_sample_id_fp handling: metadata
    description, sample-id list, and count bounds may be combined freely.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_fp = opts.output_fp
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # at least one filtering criterion must be requested
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    # intersect with the explicit sample-id list if one was provided
    if sample_id_fp is not None:
        sample_id_f_ids = set([l.strip().split()[0]
                               for l in open(sample_id_fp, 'U')
                               if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data, mapping_headers, filtered_otu_table.ids())
        open(
            output_mapping_fp, 'w').write(
            format_mapping_file(
                mapping_headers, mapping_data))
def test_format_mapping_file(self):
    """ format_mapping file should match expected result"""
    column_names = ['SampleID', 'col1', 'col0', 'Description']
    rows = [
        ['bsample', 'v1_3', 'v0_3', 'd1'],
        ['asample', 'aval', 'another', 'd2'],
    ]
    file_comments = ['this goes after headers', 'this too']
    observed = format_mapping_file(column_names, rows, file_comments)
    self.assertEqual(observed, example_mapping_file)
def main():
    """Split a BIOM table and mapping file by one or more metadata fields.

    For every non-empty combination of values of --fields, writes a filtered
    BIOM table (and, unless suppressed, a matching mapping file) into
    --output_dir.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that biom file and mapping file have matching sample names. discard
    # those samples that do not appear in both.
    shared_samples = list(set(mdata[:, 0]).intersection(bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample') if s in shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)

    # check that headers in mapping data
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not found ' +\
                         'in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields,
                                                              mheaders,
                                                              mdata)
    for sg, vg in zip(sample_groups, value_groups):
        # build a suffix like '__field_value_..._' from the field/value pairs
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
def main():
    """Filter samples from an OTU table (older BIOM API variant).

    Uses parse_biom_table/format_biom_table and .SampleIds — presumably the
    pre-biom-2.x API; behavior otherwise mirrors the load_table variant.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_fp = opts.output_fp
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # at least one filtering criterion must be requested
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))
    output_f = open(opts.output_fp, 'w')

    if (mapping_fp and valid_states):
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds

    # intersect with the explicit sample-id list if one was provided
    if (sample_id_fp is not None):
        sample_id_f_ids = set([l.strip().split()[0]
                               for l in open(sample_id_fp, 'U')
                               if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data, mapping_headers, filtered_otu_table.SampleIds)
        open(output_mapping_fp, 'w').write(
            format_mapping_file(mapping_headers, mapping_data))
def test_format_mapping_file(self):
    """ format_mapping file should match expected result"""
    hdr = ['SampleID', 'col1', 'col0', 'Description']
    row_a = ['bsample', 'v1_3', 'v0_3', 'd1']
    row_b = ['asample', 'aval', 'another', 'd2']
    cmts = ['this goes after headers', 'this too']
    self.assertEqual(format_mapping_file(hdr, [row_a, row_b], cmts),
                     example_mapping_file)
def main():
    """Add alpha-diversity values as new columns of a mapping file.

    Accepts either collated alpha-diversity files (--collated_input, with a
    --depth) or a single alpha-diversity matrix, and writes the augmented
    mapping file to --output_mapping_fp.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    alpha_fps = opts.alpha_fps
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    binning_method = opts.binning_method
    missing_value_name = opts.missing_value_name
    depth = opts.depth
    number_of_bins = opts.number_of_bins
    collated_input = opts.collated_input

    # if using collated data, make sure they specify a depth
    if collated_input:
        alpha_dict = {}

        # build up a dictionary with the filenames as keys and lines as values
        for single_alpha_fp in alpha_fps:
            alpha_dict[splitext(basename(single_alpha_fp))[0]] = open(
                single_alpha_fp, 'U').readlines()

        # format the collated data
        try:
            metrics, alpha_sample_ids, alpha_data = mean_alpha(
                alpha_dict, depth)
        except ValueError as e:
            # see mean_alpha for the possible exceptions
            # FIX: str(e) instead of e.message — the .message attribute is
            # Python-2-only (removed in Python 3)
            option_parser.error(str(e))

    # when not using collated data, the user can only specify one input file
    else:
        if len(alpha_fps) > 1:
            option_parser.error(
                'A comma-separated list of files should only be'
                ' passed with the --alpha_fps option when using collated alpha '
                'diversity data and also selecting a rarefaction depth with the'
                ' --depth option.')
        else:
            metrics, alpha_sample_ids, alpha_data = parse_matrix(
                open(alpha_fps[0], 'U'))

    # parse the data from the files
    mapping_file_data, mapping_file_headers, comments = parse_mapping_file(
        open(mapping_fp, 'U'))

    # add the alpha diversity data to the mapping file
    out_mapping_file_data, out_mapping_file_headers = \
        add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
                                                   alpha_data,
                                                   mapping_file_headers,
                                                   mapping_file_data,
                                                   number_of_bins,
                                                   binning_method,
                                                   missing_value_name)

    # format the new data and write it down
    lines = format_mapping_file(out_mapping_file_headers,
                                out_mapping_file_data)
    fd_out = open(output_mapping_fp, 'w')
    fd_out.writelines(lines)
    fd_out.close()
def main():
    """Add alpha-diversity values as new columns of a mapping file.

    Older variant: collated input is implied by passing --depth; the number
    of bins is validated as an integer up front.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    alpha_fps = opts.alpha_fps
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    binning_method = opts.binning_method
    missing_value_name = opts.missing_value_name
    depth = opts.depth

    # make sure the number of bins is an integer
    try:
        number_of_bins = int(opts.number_of_bins)
    except ValueError:
        # FIX: use the call form of raise; "raise ValueError, msg" is
        # Python-2-only statement syntax
        raise ValueError('The number of bins must be an integer, not %s'
                         % opts.number_of_bins)

    # if using collated data, make sure they specify a depth
    if depth is not None:
        alpha_dict = {}

        # build up a dictionary with the filenames as keys and lines as values
        for single_alpha_fp in alpha_fps:
            alpha_dict[splitext(basename(single_alpha_fp))[0]] = open(
                single_alpha_fp, 'U').readlines()

        # format the collated data
        metrics, alpha_sample_ids, alpha_data = mean_alpha(alpha_dict, depth)

    # when not using collated data, the user can only specify one input file
    else:
        if len(alpha_fps) > 1:
            option_parser.error(
                'A comma-separated list of files should only be'
                ' passed with the --alpha_fps option when using collated alpha '
                'diversity data and also selecting a rarefaction depth with the'
                ' --depth option.')
        else:
            metrics, alpha_sample_ids, alpha_data = parse_matrix(open(
                alpha_fps[0], 'U'))

    # parse the data from the files
    mapping_file_data, mapping_file_headers, comments = parse_mapping_file(
        open(mapping_fp, 'U'))

    # add the alpha diversity data to the mapping file
    out_mapping_file_data, out_mapping_file_headers = \
        add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
                                                   alpha_data,
                                                   mapping_file_headers,
                                                   mapping_file_data,
                                                   number_of_bins,
                                                   binning_method,
                                                   missing_value_name)

    # format the new data and write it down
    lines = format_mapping_file(out_mapping_file_headers,
                                out_mapping_file_data)
    fd_out = open(output_mapping_fp, 'w')
    fd_out.writelines(lines)
    fd_out.close()
def main():
    """Rewrite a mapping file in place with canonical formatting."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    try:
        data, headers, comments = parse_mapping_file(open(opts.input_fp, 'U'))
    except Exception:
        # narrowed from a bare except: a bare except would also swallow
        # SystemExit and KeyboardInterrupt
        option_parser.error('That doesn\'t look like a mapping file')

    lines = format_mapping_file(headers, data, comments)

    # note: intentionally overwrites the INPUT file with the reformatted data
    fd = open(opts.input_fp, 'w')
    fd.writelines(lines)
    fd.close()
def filter_mapping_file_from_mapping_f(mapping_f, sample_ids_to_keep,
                                       negate=False):
    """ Filter rows from a metadata mapping file

    Arguments:
        mapping_f - mapping file (lines or file-like object)
        sample_ids_to_keep - sample IDs whose rows are kept
        negate - if True, keep only the rows NOT in sample_ids_to_keep

    Returns the filtered mapping file as a formatted string.
    """
    mapping_data, header, comments = parse_mapping_file(mapping_f)
    filtered_mapping_data = []
    # dict for O(1) membership tests
    sample_ids_to_keep = {}.fromkeys(sample_ids_to_keep)

    for mapping_datum in mapping_data:
        # BUG FIX: the original appended every row found in
        # sample_ids_to_keep regardless of negate, so negate=True kept ALL
        # rows; keep a row exactly when its membership differs from negate
        hit = mapping_datum[0] in sample_ids_to_keep
        if hit != negate:
            filtered_mapping_data.append(mapping_datum)

    return format_mapping_file(header, filtered_mapping_data)
def filter_mapping_file_from_mapping_f(mapping_f, sample_ids_to_keep,
                                       negate=False):
    """ Filter rows from a metadata mapping file

    Keeps rows whose sample ID is in sample_ids_to_keep, or — when negate is
    True — the rows whose sample ID is NOT in it. Returns the result as a
    formatted mapping file string.
    """
    rows, header, comments = parse_mapping_file(mapping_f)

    # dict-backed lookup for O(1) membership tests
    keep_lookup = {}.fromkeys(sample_ids_to_keep)

    # (hit and not negate) or (not hit and negate)  ==  hit XOR negate
    kept_rows = [row for row in rows
                 if (row[0] in keep_lookup) != negate]

    return format_mapping_file(header, kept_rows)
def format_vectors_to_js(mapping_file_data, mapping_file_headers, coords_data,
                         coords_headers, connected_by_header,
                         sorted_by_header=None):
    """Write a string representing the vectors in a PCoA plot as javascript

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file
    coords_data: coordinates of the PCoA plot in a numpy 2-D array or a list
    of numpy 2-D arrays for jackknifed input
    coords_headers: headers of the coords in the PCoA plot or a list of lists
    with the headers for jackknifed input
    connected_by_header: header of the mapping file that represents how the
    lines will be connected
    sorted_by_header: numeric-only header name to sort the samples in the
    vectors

    Output:
    js_vectors_string: string that represents the vectors in the shape of a
    javascript object

    Notes:
    If using jackknifed input, the coordinates and headers that will be used
    are the ones belonging to the master coords i. e. the first element.
    """
    js_vectors_string = []
    js_vectors_string.append('\nvar g_vectorPositions = new Array();\n')

    if connected_by_header != None:
        # check if we are processing jackknifed input, if so just get the
        # master
        if type(coords_data) == list:
            coords_data = coords_data[0]
            coords_headers = coords_headers[0]

        columns_to_keep = ['SampleID', connected_by_header]

        # do not ad None if sorted_by_header is None or empty
        if sorted_by_header:
            columns_to_keep.append(sorted_by_header)

        # reduce the amount of data by keeping the required fields only
        mapping_file_data, mapping_file_headers =\
            keep_columns_from_mapping_file(mapping_file_data,
                                           mapping_file_headers,
                                           columns_to_keep)

        # format the mapping file to use this with the filtering function
        mf_string = format_mapping_file(mapping_file_headers,
                                        mapping_file_data)

        # one vector per distinct value of the connecting category
        index = mapping_file_headers.index(connected_by_header)
        connected_by = list(set([line[index] for line in mapping_file_data]))

        for category in connected_by:
            # convert to StringIO to for each iteration; else the object
            # won't be usable after the first iteration & you'll get an error
            sample_ids = sample_ids_from_metadata_description(
                StringIO(mf_string),
                '%s:%s' % (connected_by_header, category))

            # if there is a sorting header, sort the coords using these values
            if sorted_by_header:
                sorting_index = mapping_file_headers.index(sorted_by_header)
                to_sort = [line for line in mapping_file_data if line[0] in
                           sample_ids]

                # get the sorted sample ids from the sorted-reduced mapping
                # file; sort key assumes the column is numeric (float-able)
                sample_ids = zip(*sorted(
                    to_sort, key=lambda x: float(x[sorting_index])))[0]

            # each category value is a new vector
            js_vectors_string.append(
                "g_vectorPositions['%s'] = new Array();\n" % (category))

            for s in sample_ids:
                index = coords_headers.index(s)
                # print the first three elements of each coord for each sample
                js_vectors_string.append(
                    "g_vectorPositions['%s']['%s'] = %s;\n" % (
                        category, s, coords_data[index, :3].tolist()))

    return ''.join(js_vectors_string)
def main():
    """Drive an alpha/beta diversity workflow for a personal-microbiome study.

    Builds evident 'selectors', filters the BIOM table by sequence depth,
    then runs the beta-diversity-through-plots and alpha-rarefaction
    workflows on the results.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    otu_table_fp = opts.otu_table_fp
    output_dir = opts.output_dir
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    verbose = opts.verbose
    print_only = opts.print_only
    seqs_per_sample = int(opts.seqs_per_sample)
    parallel = opts.parallel
    min_seqs_sample = opts.min_seqs_sample
    subject_category = opts.subject_name

    try:
        makedirs(output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose"
                                " a different directory, or force overwrite with"
                                " -f.")

    ## ******************** make_evident_selectors ********************
    ## The code for make_evident_selectors.py is here and has to go before the
    ## params validation as we need to know the main cats before creating the
    ## params file
    map_data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    biom_table = parse_biom_table(open(otu_table_fp, 'U'))

    # getting valid samples from biom file
    real_map_headers, real_map_data = filter_mapping_file(map_data, headers,
                                                          biom_table.SampleIds,
                                                          include_repeat_cols=False)

    if subject_category not in real_map_headers:
        option_parser.error('This column: %s is not in the mapping file, try %s'%\
                            (subject_category, real_map_headers))

    sorted_counts_per_sample = get_sorted_counts_per_sample(biom_table)

    mapping_file_tuple = (real_map_data, real_map_headers)

    # calculate the available subjects at each rarefaction level
    results, main_map_cat = make_selectors(sorted_counts_per_sample,
                                           min_seqs_sample,
                                           mapping_file_tuple,
                                           subject_category,
                                           verbose=verbose)

    fout = open(join(output_dir, 'selectors.txt'), 'w')
    fout.write('#Sequences\tSubjects\tSamples\tMetadata\n')
    fout.write('\n'.join(results))
    fout.close()

    fout = open(join(output_dir, 'mapping_file.txt'), 'w')
    fout.write(format_mapping_file(real_map_headers, real_map_data))
    fout.close()
    ## ******************** make_evident_selectors ********************

    fout = open(join(output_dir, 'study_preferences.txt'), 'w')
    fout.write('%d\n' % seqs_per_sample)
    fout.write('%s\n' % subject_category)
    fout.close()

    ## ******************** filter_samples_from_otu_table ********************
    ## Filtering original biom file to only have samples above the max length
    ## to avoid ugly plots
    alpha_biom_file = join(output_dir, 'filtered_otu_table_for_alpha.biom')
    fout = open(alpha_biom_file, 'w')
    sample_ids_to_keep = biom_table.SampleIds
    filtered_otu_table = filter_samples_from_otu_table(biom_table,
                                                       sample_ids_to_keep,
                                                       min_count=seqs_per_sample,
                                                       max_count=inf)
    fout.write(format_biom_table(filtered_otu_table))
    fout.close()
    ## ******************** filter_samples_from_otu_table ********************

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            option_parser.error("Can't open parameters file (%s). Does it exist? " \
                                "Do you have read access?" % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        # default parameters: 4 rarefaction steps up to the requested depth
        params = parse_qiime_parameters(
            ['beta_diversity:metrics unweighted_unifrac',
             'make_rarefaction_plots:prefs_path %s' % join(output_dir, 'prefs.txt'),
             'make_rarefaction_plots:colorby %s' % ','.join(main_map_cat),
             'make_rarefaction_plots:output_type memory',
             'multiple_rarefactions:min %d' % int(seqs_per_sample/4),
             'multiple_rarefactions:max %d' % (seqs_per_sample+1),
             'multiple_rarefactions:step %d' % int(seqs_per_sample/4),
             'multiple_rarefactions:num-reps 4',
             ])
        # empty list returns empty defaultdict for now

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser)

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    copyfile(otu_table_fp, join(output_dir, 'raw.biom'))

    run_beta_diversity_through_plots(otu_table_fp=otu_table_fp,
                                     mapping_fp=mapping_fp,
                                     output_dir=output_dir,
                                     command_handler=command_handler,
                                     params=params,
                                     qiime_config=qiime_config,
                                     color_by_interesting_fields_only=False,
                                     sampling_depth=seqs_per_sample,
                                     histogram_categories=None,
                                     tree_fp=tree_fp,
                                     parallel=parallel,
                                     suppress_3d_plots=True,
                                     suppress_2d_plots=True,
                                     status_update_callback=status_update_callback)

    # alpha rarefaction runs on the depth-filtered table, in a subdirectory
    output_dir = join(output_dir, 'alpha')
    run_alpha_rarefaction(otu_table_fp=alpha_biom_file,\
                          mapping_fp=mapping_fp,\
                          output_dir=output_dir,\
                          command_handler=command_handler,\
                          params=params,
                          qiime_config=qiime_config,\
                          tree_fp=tree_fp,\
                          num_steps=4,\
                          parallel=parallel,\
                          min_rare_depth=10,
                          max_rare_depth=20,
                          status_update_callback=status_update_callback,
                          plot_stderr_and_stddev=True)
def test_format_mapping_file(self):
    """ format_mapping file should match expected result"""
    header_row = ["SampleID", "col1", "col0", "Description"]
    data_rows = [
        ["bsample", "v1_3", "v0_3", "d1"],
        ["asample", "aval", "another", "d2"],
    ]
    comment_lines = ["this goes after headers", "this too"]
    result = format_mapping_file(header_row, data_rows, comment_lines)
    self.assertEqual(result, example_mapping_file)
def make_distance_boxplots(dm_f,
                           map_f,
                           fields,
                           width=None,
                           height=6.0,
                           suppress_all_within=False,
                           suppress_all_between=False,
                           suppress_individual_within=False,
                           suppress_individual_between=False,
                           y_min=0.0,
                           y_max=1.0,
                           whisker_length=1.5,
                           box_width=0.5,
                           box_color=None,
                           color_individual_within_by_field=None,
                           sort=None):
    """Generates various types of boxplots for distance comparisons.

    Returns a list of tuples, one for each field. Each tuple contains the
    following:
        1) the name of the field (string)
        2) a matplotlib.figure.Figure object containing the boxplots
        3) a list of lists containing the raw plot data that was passed to mpl
        4) a list of labels for each of the boxplots (string)
        5) a list of mpl-compatible colors (one for each boxplot)

    The Figure can be saved, and the raw data and labels can be useful (for
    example) performing statistical tests or writing the raw data to disk.

    The input arguments are exactly derived from the make_distance_boxplots.py
    script (see the script options for details). To avoid duplicated effort,
    their descriptions are not reproduced here.
    """
    # Parse data files and do some preliminary error checking.
    dm_header, dm_data = parse_distmat(dm_f)
    map_data, map_header, map_comments = parse_mapping_file(map_f)

    if fields is None or len(fields) < 1:
        raise ValueError("You must provide at least one field to analyze.")

    for field in fields:
        if field not in map_header:
            raise ValueError("The field '%s' is not in the provided mapping "
                             "file. Please supply correct fields "
                             "corresponding to fields in the mapping file." %
                             field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = _cast_y_axis_extrema(y_min)
    y_max = _cast_y_axis_extrema(y_max)

    # Collate the distributions of distances that will comprise each boxplot.
    # Suppress the generation of the indicated types of boxplots.
    results = []
    for field in fields:
        plot_data = []
        plot_labels = []
        plot_colors = []
        legend = None

        # Little bit of duplicate code here... not sure it's worth the effort
        # to clean up though.
        if not suppress_all_within:
            plot_data.append(get_all_grouped_distances(dm_header, dm_data,
                map_header, map_data, field, within=True))
            plot_labels.append("All within %s" % field)

            # None is a placeholder color, filled in by _color_field_states
            # (or left for mpl defaults) when per-field coloring is requested
            if color_individual_within_by_field is not None:
                plot_colors.append(None)
            else:
                plot_colors.append(box_color)

        if not suppress_all_between:
            plot_data.append(get_all_grouped_distances(dm_header, dm_data,
                map_header, map_data, field, within=False))
            plot_labels.append("All between %s" % field)

            if color_individual_within_by_field is not None:
                plot_colors.append(None)
            else:
                plot_colors.append(box_color)

        if not suppress_individual_within:
            within_dists = get_grouped_distances(dm_header, dm_data,
                map_header, map_data, field, within=True)
            field_states = []
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                field_states.append(grouping[0])

            # If we need to color these boxplots by a field, build up a
            # list of colors and a legend.
            if color_individual_within_by_field is not None:
                colors, color_mapping = _color_field_states(
                    format_mapping_file(map_header, map_data).split('\n'),
                    dm_header, field, field_states,
                    color_individual_within_by_field)
                plot_colors.extend(colors)
                legend = (color_mapping.values(), color_mapping.keys())
            else:
                plot_colors.extend([box_color] * len(field_states))

        if not suppress_individual_between:
            between_dists = get_grouped_distances(dm_header, dm_data,
                map_header, map_data, field, within=False)

            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

                if color_individual_within_by_field is not None:
                    plot_colors.append(None)
                else:
                    plot_colors.append(box_color)

        assert (len(plot_data) == len(plot_labels) and
                len(plot_labels) == len(plot_colors)), "The number " +\
            "of boxplot labels and colors do not match the number of " +\
            "boxplots."

        # We now have our data and labels ready, so plot them!
        if plot_data:
            if sort is not None:
                plot_data, plot_labels, plot_colors = _sort_distributions(
                    plot_data, plot_labels, plot_colors, sort)

            # auto-size the figure width from the number of boxes
            if width is None:
                width = len(plot_data) * box_width + 2
            if width <= 0 or height <= 0:
                raise ValueError("The specified width and height of the plot "
                                 "must be greater than zero.")

            plot_figure = boxplots(plot_data, x_tick_labels=plot_labels,
                                   title="%s Distances" % field,
                                   x_label="Grouping", y_label="Distance",
                                   x_tick_labels_orientation='vertical',
                                   y_min=y_min, y_max=y_max,
                                   whisker_length=whisker_length,
                                   box_width=box_width,
                                   box_colors=plot_colors,
                                   figure_width=width,
                                   figure_height=height,
                                   legend=legend)

            results.append((field, plot_figure, plot_data, plot_labels,
                            plot_colors))
        else:
            raise ValueError("The generation of all plots was suppressed. At "
                             "least one type of plot must be unsuppressed.")

    return results
"diversity data and also selecting a rarefaction depth with the" " --depth option." ) else: metrics, alpha_sample_ids, alpha_data = parse_matrix(open(alpha_fps[0], "U")) # parse the data from the files mapping_file_data, mapping_file_headers, comments = parse_mapping_file(open(mapping_fp, "U")) # add the alpha diversity data to the mapping file out_mapping_file_data, out_mapping_file_headers = add_alpha_diversity_values_to_mapping_file( metrics, alpha_sample_ids, alpha_data, mapping_file_headers, mapping_file_data, number_of_bins, binning_method, missing_value_name, ) # format the new data and write it down lines = format_mapping_file(out_mapping_file_headers, out_mapping_file_data) fd_out = open(output_mapping_fp, "w") fd_out.writelines(lines) fd_out.close() if __name__ == "__main__": main()
def format_vectors_to_js(mapping_file_data, mapping_file_headers, coords_data,
                        coords_headers, connected_by_header,
                        sorted_by_header=None):
    """Write a string representing the vectors in a PCoA plot as javascript

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file
    coords_data: coordinates of the PCoA plot in a numpy 2-D array or a list
    of numpy 2-D arrays for jackknifed input
    coords_headers: headers of the coords in the PCoA plot or a list of lists
    with the headers for jackknifed input
    connected_by_header: header of the mapping file that represents how the
    lines will be connected
    sorted_by_header: numeric-only header name to sort the samples in the
    vectors

    Output:
    js_vectors_string: string that represents the vectors in the shape of a
    javascript object

    Notes:
    If using jackknifed input, the coordinates and headers that will be used
    are the ones belonging to the master coords i. e. the first element.
    """
    js_vectors_string = []
    js_vectors_string.append('\nvar g_vectorPositions = new Array();\n')

    if connected_by_header is not None:
        # check if we are processing jackknifed input, if so just get the
        # master coordinates i.e. the first element
        if isinstance(coords_data, list):
            coords_data = coords_data[0]
            coords_headers = coords_headers[0]

        columns_to_keep = ['SampleID', connected_by_header]

        # do not add None if sorted_by_header is None or empty
        if sorted_by_header:
            columns_to_keep.append(sorted_by_header)

        # reduce the amount of data by keeping the required fields only
        mapping_file_data, mapping_file_headers =\
            keep_columns_from_mapping_file(mapping_file_data,
                                           mapping_file_headers,
                                           columns_to_keep)

        # format the mapping file to use this with the filtering function
        mf_string = format_mapping_file(mapping_file_headers,
                                        mapping_file_data)

        index = mapping_file_headers.index(connected_by_header)
        connected_by = list(set([line[index] for line in mapping_file_data]))

        for category in connected_by:
            # re-create the StringIO for each iteration; the object is
            # exhausted after the first read and would otherwise raise an
            # error on re-use
            sample_ids = sample_ids_from_metadata_description(
                StringIO(mf_string),
                '%s:%s' % (connected_by_header, category))

            # if there is a sorting header, sort the coords using these values
            if sorted_by_header:
                sorting_index = mapping_file_headers.index(sorted_by_header)
                to_sort = [line for line in mapping_file_data
                           if line[0] in sample_ids]

                # get the sorted sample ids from the sorted-reduced mapping
                # file
                sample_ids = zip(*sorted(to_sort,
                    key=lambda x: float(x[sorting_index])))[0]

            # each category value is a new vector
            js_vectors_string.append(
                "g_vectorPositions['%s'] = new Array();\n" % (category))

            for s in sample_ids:
                # renamed from 'index' to avoid shadowing the header index
                # computed above
                sample_index = coords_headers.index(s)

                # print the first three elements of each coord for each sample
                js_vectors_string.append(
                    "g_vectorPositions['%s']['%s'] = %s;\n"
                    % (category, s, coords_data[sample_index, :3].tolist()))

    return ''.join(js_vectors_string)
else: if len(alpha_fps) > 1: option_parser.error( 'A comma-separated list of files should only be' ' passed with the --alpha_fps option when using collated alpha ' 'diversity data and also selecting a rarefaction depth with the' ' --depth option.') else: metrics, alpha_sample_ids, alpha_data = parse_matrix( open(alpha_fps[0], 'U')) # parse the data from the files mapping_file_data, mapping_file_headers, comments = parse_mapping_file( open(mapping_fp, 'U')) # add the alpha diversity data to the mapping file out_mapping_file_data, out_mapping_file_headers = \ add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids, alpha_data, mapping_file_headers, mapping_file_data, number_of_bins, binning_method, missing_value_name) # format the new data and write it down lines = format_mapping_file(out_mapping_file_headers, out_mapping_file_data) fd_out = open(output_mapping_fp, 'w') fd_out.writelines(lines) fd_out.close() if __name__ == "__main__": main()
def create_personal_results(output_dir, mapping_fp, coord_fp, collated_dir,
                            otu_table_fp, prefs_fp, personal_id_column,
                            personal_ids=None, column_title='Self',
                            individual_titles=None,
                            category_to_split='BodySite',
                            time_series_category='WeeksSinceStart',
                            rarefaction_depth=10000, alpha=0.05,
                            rep_set_fp=None, parameter_fp=None,
                            body_site_rarefied_otu_table_dir=None,
                            retain_raw_data=False,
                            suppress_alpha_rarefaction=False,
                            suppress_beta_diversity=False,
                            suppress_taxa_summary_plots=False,
                            suppress_alpha_diversity_boxplots=False,
                            suppress_otu_category_significance=False,
                            command_handler=call_commands_serially,
                            status_update_callback=no_status_updates):
    """Build per-individual "personal microbiome" result pages.

    For each personal ID found in the mapping file (or the subset given in
    personal_ids), generates a personalized mapping file plus, unless
    suppressed, alpha diversity boxplots, alpha rarefaction plots, beta
    diversity (time series) PCoA plots, time-series taxa summary plots and
    OTU category significance tables, and ties them together in an
    index.html per individual.

    Returns the list of per-analysis output directories that were created.

    Raises ValueError if personal_id_column, category_to_split or
    time_series_category are not mapping file headers, or if a requested
    personal ID is not present in the mapping file.

    Note: most heavy lifting is delegated to QIIME CLI scripts executed
    through command_handler; intermediate ("raw") files are deleted at the
    end unless retain_raw_data is True.
    """
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, 'support_files')
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), 'my_microbes', 'support_files'),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, 'U'))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
                         "column header." % category_to_split)

    # Insert the personalized 'Self'/'Other' column just before the
    # description column (conventionally the last mapping file column).
    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = '%s&&%s' % (personal_id_column, category_to_split)
    header.insert(len(header) - 1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    # NOTE: was 'personal_ids == None'; identity comparison is the correct
    # idiom for None checks.
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError("'%s' is not a personal ID in the mapping "
                                 "file column '%s'." %
                                 (pid, personal_id_column))

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    # (removed unused local: otu_table_title = splitext(basename(otu_table_fp)))

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir,
                add_filename_suffix(otu_table_fp,
                                    '_even%d' % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = 'Rarefying OTU table'
            cmd = 'single_rarefaction.py -i %s -o %s -d %s' % (otu_table_fp,
                    rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, 'per_body_site_otu_tables')

            cmd_title = 'Splitting rarefied OTU table by body site'
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (
                    rarefied_otu_table_fp, mapping_fp, category_to_split,
                    per_body_site_dir)
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            # Caller supplied pre-rarefied, pre-split tables; reuse them.
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        'mapping_file.txt')
        html_fp = join(output_dir, person_of_interest, 'index.html')

        personal_mapping_data = create_personal_mapping_file(mapping_data,
                person_of_interest, personal_id_index, bodysite_index,
                individual_titles)

        personal_mapping_f = open(personal_mapping_file_fp, 'w')
        personal_mapping_f.write(
                format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index]
                                   for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ''
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     'adiv_boxplots')
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                    collated_dir, personal_mapping_file_fp,
                    category_to_split, column_title, rarefaction_depth,
                    adiv_boxplots_dir)

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                    create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   'alpha_rarefaction')
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = 'Creating rarefaction plots (%s)' % person_of_interest
            cmd = 'make_rarefaction_plots.py -i %s -m %s -p %s -o %s' % (
                    collated_dir, personal_mapping_file_fp, prefs_fp,
                    rarefaction_dir)
            commands.append([(cmd_title, cmd)])

            raw_data_dirs.append(join(rarefaction_dir, 'average_plots'))
            raw_data_dirs.append(join(rarefaction_dir, 'average_tables'))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, 'beta_diversity')
            pcoa_time_series_dir = join(output_dir, person_of_interest,
                                        'beta_diversity_time_series')
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = 'Creating beta diversity time series plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=' % (
                    personal_mapping_file_fp, prefs_fp, coord_fp,
                    pcoa_time_series_dir) +\
                    '\'%s\' --add_vectors=\'%s,%s\'' % (time_series_category,
                    site_id_category, time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = 'Creating beta diversity plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s' % (
                    personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Time series taxa summary plots steps
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest,
                                  'time_series')
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            ## Split OTU table into self/other per-body-site tables
            commands = []
            cmd_title = 'Splitting OTU table into self/other (%s)' % \
                        person_of_interest
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (otu_table_fp,
                    personal_mapping_file_fp, column_title, area_plots_dir)
            commands.append([(cmd_title, cmd)])
            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            for column_title_value in column_title_values:
                biom_fp = join(area_plots_dir,
                        add_filename_suffix(otu_table_fp,
                                            '_%s' % column_title_value))
                column_title_map_fp = join(area_plots_dir,
                        'mapping_%s.txt' % column_title_value)
                raw_data_files.append(biom_fp)
                raw_data_files.append(column_title_map_fp)

                body_site_dir = join(area_plots_dir, column_title_value)

                commands = []
                cmd_title = 'Splitting "%s" OTU table by body site (%s)' % \
                            (column_title_value, person_of_interest)
                cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (biom_fp,
                        personal_mapping_file_fp, category_to_split,
                        body_site_dir)
                commands.append([(cmd_title, cmd)])
                raw_data_dirs.append(body_site_dir)
                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

                commands = []
                for cat_value in cat_values:
                    body_site_otu_table_fp = join(body_site_dir,
                            add_filename_suffix(biom_fp, '_%s' % cat_value))

                    # We won't always get an OTU table if the mapping file
                    # category contains samples that aren't in the OTU table
                    # (e.g. the 'na' state for body site).
                    if exists(body_site_otu_table_fp):
                        plots = join(area_plots_dir, 'taxa_plots_%s_%s' % (
                                column_title_value, cat_value))

                        cmd_title = 'Creating taxa summary plots (%s)' % \
                                    person_of_interest
                        cmd = ('summarize_taxa_through_plots.py -i %s '
                               '-o %s -c %s -m %s -s' %
                               (body_site_otu_table_fp, plots,
                                time_series_category,
                                personal_mapping_file_fp))
                        if parameter_fp is not None:
                            cmd += ' -p %s' % parameter_fp
                        commands.append([(cmd_title, cmd)])

                        raw_data_files.append(join(plots, '*.biom'))
                        raw_data_files.append(join(plots, '*.txt'))

                        create_comparative_taxa_plots_html(cat_value,
                                join(area_plots_dir,
                                     '%s_comparative.html' % cat_value))
                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ''
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   'otu_category_significance')
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need
            # to parse these later on.
            commands = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(per_body_site_dir,
                        add_filename_suffix(rarefied_otu_table_fp,
                                            '_%s' % cat_value))
                if exists(body_site_otu_table_fp):
                    otu_cat_output_fp = join(otu_cat_sig_dir,
                            'otu_cat_sig_%s.txt' % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' % (
                                 cat_value, person_of_interest))
                    cmd = ('otu_category_significance.py -i %s -m %s -c %s '
                           '-o %s' % (body_site_otu_table_fp,
                                      personal_mapping_file_fp,
                                      column_title, otu_cat_output_fp))
                    commands.append([(cmd_title, cmd)])
                    raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)
            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                    format_otu_category_significance_tables_as_html(
                            otu_cat_sig_output_fps, alpha, otu_cat_sig_dir,
                            individual_titles, rep_set_fp=rep_set_fp)

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [join(rel_otu_cat_sig_dir, html_filename)
                    for html_filename in otu_cat_sig_html_filenames]

            otu_category_significance_html = \
                    create_otu_category_significance_html(
                            otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(person_of_interest, html_fp,
                alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
                otu_category_significance_html=otu_category_significance_html)
    logger.close()

    # Clean up the unnecessary raw data files and directories. glob will only
    # grab paths that exist.
    if not retain_raw_data:
        for raw_data_fp_glob in raw_data_files:
            remove_files(glob(raw_data_fp_glob))

        for raw_data_dir_glob in raw_data_dirs:
            for dir_to_remove in glob(raw_data_dir_glob):
                rmtree(dir_to_remove)

    return output_directories
def main():
    """Append disturbance columns to the student microbiome mapping file.

    Reads 'smp_map.txt' (QIIME mapping file) and 'new_disturbance_list.txt'
    from the current directory and writes 'new_smp_map.txt' containing three
    new 'Yes'/'No' columns (antibiotic, sickness and menstruation
    disturbances) appended to each sample row.
    """
    with open('smp_map.txt', 'U') as smp_map_f:
        smp_map_data, smp_map_header, smp_map_comments = \
                parse_mapping_file(smp_map_f)

    with open('new_disturbance_list.txt', 'U') as dist_f:
        anti_dist_map, sick_dist_map, menst_dist_map = \
                parse_disturbance_file(dist_f)

    pid_idx = smp_map_header.index('PersonalID')
    wss_idx = smp_map_header.index('WeeksSinceStart')

    # Add the three new columns at the end.
    new_smp_map_header = smp_map_header[:]
    new_smp_map_header.extend([anti_dist_name, sick_dist_name,
                               menst_dist_name])

    new_smp_map_data = []
    for row in smp_map_data:
        pid = row[pid_idx]
        # Personal IDs look like '<school><NNN>'; last three chars are the
        # subject number -- TODO confirm against parse_disturbance_file.
        school = pid[:-3]
        week = row[wss_idx]
        anti_dist = False
        sick_dist = False
        menst_dist = False

        # Figure out if we should try to map this sample: the personal ID
        # must end in a three-digit number, belong to a known school, and
        # the week value must be numeric.
        valid_sample = True
        try:
            int(pid[-3:])
        # Was a bare 'except:'; int() on a string can only raise ValueError,
        # so catch that specifically instead of masking unrelated errors.
        except ValueError:
            valid_sample = False
        if school not in schools:
            valid_sample = False
        try:
            week = float(week)
        except ValueError:  # non-numeric week value (e.g. 'na')
            valid_sample = False

        if valid_sample:
            if pid in anti_dist_map and week in anti_dist_map[pid]:
                anti_dist = True
            if pid in sick_dist_map and week in sick_dist_map[pid]:
                sick_dist = True
            if pid in menst_dist_map and week in menst_dist_map[pid]:
                menst_dist = True

        # Write out our results in three new columns.
        anti_dist_str = 'Yes' if anti_dist else 'No'
        sick_dist_str = 'Yes' if sick_dist else 'No'
        menst_dist_str = 'Yes' if menst_dist else 'No'

        new_row = row[:]
        new_row.extend([anti_dist_str, sick_dist_str, menst_dist_str])
        new_smp_map_data.append(new_row)

    with open('new_smp_map.txt', 'w') as new_smp_map_f:
        new_smp_map_f.write(format_mapping_file(new_smp_map_header,
                                                new_smp_map_data,
                                                smp_map_comments))
def create_personal_results(
    output_dir,
    mapping_fp,
    coord_fp,
    collated_dir,
    otu_table_fp,
    prefs_fp,
    personal_id_column,
    personal_ids=None,
    column_title="Self",
    individual_titles=None,
    category_to_split="BodySite",
    time_series_category="WeeksSinceStart",
    rarefaction_depth=10000,
    alpha=0.05,
    rep_set_fp=None,
    body_site_rarefied_otu_table_dir=None,
    retain_raw_data=False,
    suppress_alpha_rarefaction=False,
    suppress_beta_diversity=False,
    suppress_taxa_summary_plots=False,
    suppress_alpha_diversity_boxplots=False,
    suppress_otu_category_significance=False,
    command_handler=call_commands_serially,
    status_update_callback=no_status_updates,
):
    """Build per-individual "personal microbiome" result pages.

    For each personal ID in the mapping file (or the subset in personal_ids),
    generates a personalized mapping file plus, unless suppressed, alpha
    diversity boxplots, alpha rarefaction plots, beta diversity (time series)
    PCoA plots, time-series taxa summary plots and OTU category significance
    tables, tied together in a per-individual index.html.

    Returns the list of per-analysis output directories created.

    Raises ValueError if personal_id_column, category_to_split or
    time_series_category are not mapping file headers, if a requested
    personal ID is missing from the mapping file, or if none of an
    individual's body sites have samples in the rarefied OTU table.

    Intermediate ("raw") files are cleaned up per individual unless
    retain_raw_data is True.
    """
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, "support_files")
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), "my_microbes", "support_files"),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, "U"))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
                         "column header." % category_to_split)

    # Insert the personalized 'Self'/'Other' column just before the
    # description column (conventionally the last mapping file column).
    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = "%s&&%s" % (personal_id_column, category_to_split)
    header.insert(len(header) - 1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    # NOTE: was 'personal_ids == None'; identity comparison is the correct
    # idiom for None checks.
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError(
                    "'%s' is not a personal ID in the mapping "
                    "file column '%s'." % (pid, personal_id_column)
                )

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    # (removed unused local: otu_table_title = splitext(basename(otu_table_fp)))

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir,
                add_filename_suffix(otu_table_fp,
                                    "_even%d" % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = "Rarefying OTU table"
            cmd = "single_rarefaction.py -i %s -o %s -d %s" % (otu_table_fp,
                    rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, "per_body_site_otu_tables")

            cmd_title = "Splitting rarefied OTU table by body site"
            cmd = "split_otu_table.py -i %s -m %s -f %s -o %s" % (
                rarefied_otu_table_fp,
                mapping_fp,
                category_to_split,
                per_body_site_dir,
            )
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            # Caller supplied pre-rarefied, pre-split tables; reuse them.
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        # Files to clean up on a per-individual basis.
        personal_raw_data_files = []
        personal_raw_data_dirs = []

        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        "mapping_file.txt")
        html_fp = join(output_dir, person_of_interest, "index.html")

        personal_mapping_data = create_personal_mapping_file(
            mapping_data, person_of_interest, personal_id_index,
            bodysite_index, individual_titles
        )

        personal_mapping_f = open(personal_mapping_file_fp, "w")
        personal_mapping_f.write(
            format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        personal_raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index]
                                   for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ""
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     "adiv_boxplots")
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                collated_dir,
                personal_mapping_file_fp,
                category_to_split,
                column_title,
                rarefaction_depth,
                adiv_boxplots_dir,
            )

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   "alpha_rarefaction")
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = "Creating rarefaction plots (%s)" % person_of_interest
            cmd = "make_rarefaction_plots.py -i %s -m %s -p %s -o %s" % (
                collated_dir,
                personal_mapping_file_fp,
                prefs_fp,
                rarefaction_dir,
            )
            commands.append([(cmd_title, cmd)])

            personal_raw_data_dirs.append(join(rarefaction_dir,
                                               "average_plots"))
            personal_raw_data_dirs.append(join(rarefaction_dir,
                                               "average_tables"))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, "beta_diversity")
            pcoa_time_series_dir = join(output_dir, person_of_interest,
                                        "beta_diversity_time_series")
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = ("Creating beta diversity time series plots (%s)" %
                         person_of_interest)
            cmd = "make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=" % (
                personal_mapping_file_fp,
                prefs_fp,
                coord_fp,
                pcoa_time_series_dir,
            ) + "'%s' --add_vectors='%s,%s'" % (time_series_category,
                                                site_id_category,
                                                time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = "Creating beta diversity plots (%s)" % \
                        person_of_interest
            cmd = "make_3d_plots.py -m %s -p %s -i %s -o %s" % (
                personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Time series taxa summary plots steps
        taxa_summary_plots_html = ""
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest,
                                  "time_series")
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            files_to_remove, dirs_to_remove = _generate_taxa_summary_plots(
                otu_table_fp,
                personal_mapping_file_fp,
                person_of_interest,
                column_title,
                column_title_values,
                category_to_split,
                cat_values,
                time_series_category,
                area_plots_dir,
                command_handler,
                status_update_callback,
                logger,
            )
            personal_raw_data_files.extend(files_to_remove)
            personal_raw_data_dirs.extend(dirs_to_remove)

            taxa_summary_plots_html = create_taxa_summary_plots_html(
                output_dir, person_of_interest, cat_values)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ""
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   "otu_category_significance")
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need
            # to parse these later on.
            commands = []
            valid_body_sites = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(per_body_site_dir,
                        add_filename_suffix(rarefied_otu_table_fp,
                                            "_%s" % cat_value))

                if exists(body_site_otu_table_fp):
                    # Make sure we have at least one sample for Self,
                    # otherwise otu_category_significance.py crashes with a
                    # division by zero error.
                    with open(body_site_otu_table_fp, "U") as \
                            body_site_otu_table_f, \
                            open(personal_mapping_file_fp, "U") as \
                            personal_mapping_file_f:
                        personal_sample_count = _count_per_individual_samples(
                            body_site_otu_table_f, personal_mapping_file_f,
                            personal_id_column, person_of_interest)

                    if personal_sample_count < 1:
                        continue
                    else:
                        valid_body_sites.append(cat_value)

                    otu_cat_output_fp = join(otu_cat_sig_dir,
                                             "otu_cat_sig_%s.txt" % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' % (
                                 cat_value, person_of_interest))
                    cmd = ("otu_category_significance.py -i %s -m %s -c %s "
                           "-o %s" % (body_site_otu_table_fp,
                                      personal_mapping_file_fp,
                                      column_title, otu_cat_output_fp))
                    commands.append([(cmd_title, cmd)])
                    personal_raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            # Hack to allow print-only mode.
            if command_handler is not print_commands and not valid_body_sites:
                raise ValueError(
                    "None of the body sites for personal ID '%s' "
                    "could be processed because there were no "
                    "matching samples in the rarefied OTU table."
                    % person_of_interest
                )

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                create_otu_category_significance_html_tables(
                    otu_cat_sig_output_fps, alpha, otu_cat_sig_dir,
                    individual_titles, rep_set_fp=rep_set_fp)

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [
                join(rel_otu_cat_sig_dir, html_filename)
                for html_filename in otu_cat_sig_html_filenames
            ]

            otu_category_significance_html = \
                create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(
            person_of_interest,
            html_fp,
            taxa_summary_plots_html=taxa_summary_plots_html,
            alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
            otu_category_significance_html=otu_category_significance_html,
        )

        # Clean up the unnecessary raw data files and directories for the
        # current individual. glob will only grab paths that exist.
        if not retain_raw_data:
            clean_up_raw_data_files(personal_raw_data_files,
                                    personal_raw_data_dirs)

    # Clean up any remaining raw data files that weren't created on a
    # per-individual basis.
    if not retain_raw_data:
        clean_up_raw_data_files(raw_data_files, raw_data_dirs)

    logger.close()

    return output_directories
def main():
    """Filter samples from a BIOM table and optionally the mapping file.

    Samples may be kept by mapping-file metadata state (--valid_states),
    by an explicit sample ID list (--sample_id_fp), and/or by per-sample
    min/max sequence counts. When requested, the mapping file is filtered
    to match the surviving samples.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    # --valid_states can only be interpreted against a mapping file.
    if (mapping_fp is None and valid_states is not None):
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    # At least one filtering criterion must have been requested.
    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")

    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")

    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    # Use the local already extracted above (was re-reading opts.input_fp).
    otu_table = load_table(input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        with open(mapping_fp, 'U') as mapping_f:
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states)
        # Negation only applies to an explicit sample ID list.
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            # One sample ID per line (first whitespace-separated token);
            # '#'-prefixed lines are comments.
            with open(sample_id_fp, 'U') as sample_id_f:
                sample_id_f_ids = set([l.strip().split()[0]
                                       for l in sample_id_f
                                       if not l.startswith('#')])
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table, sample_ids_to_keep, min_count, max_count,
        negate_ids_to_keep=negate_sample_id_fp)
    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        # Was an unclosed 'open(...).write(...)'; use a context manager so
        # the output file is flushed and closed deterministically.
        with open(output_mapping_fp, 'w') as output_mapping_f:
            output_mapping_f.write(
                format_mapping_file(mapping_headers, mapping_data))
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) otu_table_fp = opts.otu_table_fp otu_table = parse_biom_table(qiime_open(otu_table_fp)) min_counts, max_counts, median_counts, mean_counts, counts_per_sample = compute_seqs_per_library_stats( otu_table, opts.num_otus ) num_otus = len(otu_table.ObservationIds) counts_per_sample_values = counts_per_sample.values() med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0] even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values) num_samples = len(counts_per_sample) print "Num samples: %s" % str(num_samples) print "Num otus: %s" % str(num_otus) if not opts.num_otus: num_observations = sum(counts_per_sample_values) print "Num observations (sequences): %s" % str(num_observations) # port denisty functionality to a tested function. the following is broken (should be # count of non-zero cells rather than number of observations in the numerator) # print 'Table density (fraction of non-zero values): %1.4f' % (num_observations/(num_samples * num_otus)) print if opts.num_otus: print "OTUs/sample summary:" else: print "Seqs/sample summary:" print " Min: %s" % str(min_counts) print " Max: %s" % str(max_counts) print " Median: %s" % str(median_counts) print " Mean: %s" % str(mean_counts) print " Std. 
dev.: %s" % (str(std(counts_per_sample_values))) print " Median Absolute Deviation: %s" % str(med_abs_dev) print " Default even sampling depth in\n core_qiime_analyses.py (just a suggestion): %s" % str(even_sampling_depth) print "" if opts.num_otus: print "OTUs/sample detail:" else: print "Seqs/sample detail:" sorted_counts_per_sample = [(v, k) for k, v in counts_per_sample.items()] sorted_counts_per_sample.sort() total_count = 0 for v, k in sorted_counts_per_sample: total_count += v print " %s: %s" % (k, str(v)) if opts.mapping_fp: if not opts.output_mapping_fp: raise RuntimeError("input mapping file supplied, but no path to" + " output file") f = open(opts.mapping_fp, "U") mapping_lines, headers, comments = parse_mapping_file(f) f.close() if len(headers) == 1: endoffset = 0 # if we only have the sample id, this data -> last col else: endoffset = 1 # usually make this data the penultimate column. headers.insert(len(headers) - endoffset, "NumIndividuals") for map_line in mapping_lines: sample_id = map_line try: depth = str(counts_per_sample[map_line[0]]) except KeyError: depth = "na" map_line.insert(len(map_line) - endoffset, depth) new_map_str = format_mapping_file(headers, mapping_lines, comments) f = open(opts.output_mapping_fp, "w") f.write(new_map_str) f.close()
def make_distance_boxplots(
    dm_f,
    map_f,
    fields,
    width=None,
    height=6.0,
    suppress_all_within=False,
    suppress_all_between=False,
    suppress_individual_within=False,
    suppress_individual_between=False,
    y_min=0.0,
    y_max=1.0,
    whisker_length=1.5,
    box_width=0.5,
    box_color=None,
    color_individual_within_by_field=None,
    sort=None,
):
    """Generates various types of boxplots for distance comparisons.

    Returns a list of tuples, one for each field. Each tuple contains the
    following:
        1) the name of the field (string)
        2) a matplotlib.figure.Figure object containing the boxplots
        3) a list of lists containing the raw plot data that was passed to mpl
        4) a list of labels for each of the boxplots (string)
        5) a list of mpl-compatible colors (one for each boxplot)

    The Figure can be saved, and the raw data and labels can be useful (for
    example) performing statistical tests or writing the raw data to disk.

    The input arguments are exactly derived from the make_distance_boxplots.py
    script (see the script options for details). To avoid duplicated effort,
    their descriptions are not reproduced here.

    Raises ValueError if no fields are given, if a field is missing from the
    mapping file header, if the computed plot dimensions are not positive, or
    if every boxplot type was suppressed.
    """
    # Parse data files and do some preliminary error checking.
    dm_header, dm_data = parse_distmat(dm_f)
    map_data, map_header, map_comments = parse_mapping_file(map_f)
    if fields is None or len(fields) < 1:
        raise ValueError("You must provide at least one field to analyze.")
    for field in fields:
        if field not in map_header:
            raise ValueError(
                "The field '%s' is not in the provided mapping "
                "file. Please supply correct fields "
                "corresponding to fields in the mapping file." % field
            )

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = _cast_y_axis_extrema(y_min)
    y_max = _cast_y_axis_extrema(y_max)

    # Collate the distributions of distances that will comprise each boxplot.
    # Suppress the generation of the indicated types of boxplots.
    results = []
    for field in fields:
        # Parallel lists: one entry per boxplot in the figure for this field.
        plot_data = []
        plot_labels = []
        plot_colors = []
        legend = None

        # Little bit of duplicate code here... not sure it's worth the effort
        # to clean up though.
        if not suppress_all_within:
            # Single boxplot pooling all within-group distances for the field.
            plot_data.append(get_all_grouped_distances(dm_header, dm_data, map_header, map_data, field, within=True))
            plot_labels.append("All within %s" % field)
            # A None color placeholder is appended when per-state coloring is
            # requested; only the individual-within boxplots get real colors.
            if color_individual_within_by_field is not None:
                plot_colors.append(None)
            else:
                plot_colors.append(box_color)
        if not suppress_all_between:
            # Single boxplot pooling all between-group distances.
            plot_data.append(get_all_grouped_distances(dm_header, dm_data, map_header, map_data, field, within=False))
            plot_labels.append("All between %s" % field)
            if color_individual_within_by_field is not None:
                plot_colors.append(None)
            else:
                plot_colors.append(box_color)
        if not suppress_individual_within:
            # One boxplot per field state (e.g. per treatment group).
            within_dists = get_grouped_distances(dm_header, dm_data, map_header, map_data, field, within=True)
            field_states = []
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                field_states.append(grouping[0])

            # If we need to color these boxplots by a field, build up a
            # list of colors and a legend.
            if color_individual_within_by_field is not None:
                colors, color_mapping = _color_field_states(
                    format_mapping_file(map_header, map_data).split("\n"),
                    dm_header,
                    field,
                    field_states,
                    color_individual_within_by_field,
                )
                plot_colors.extend(colors)
                legend = (color_mapping.values(), color_mapping.keys())
            else:
                plot_colors.extend([box_color] * len(field_states))
        if not suppress_individual_between:
            # One boxplot per pair of distinct field states.
            between_dists = get_grouped_distances(dm_header, dm_data, map_header, map_data, field, within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                if color_individual_within_by_field is not None:
                    plot_colors.append(None)
                else:
                    plot_colors.append(box_color)

        assert len(plot_data) == len(plot_labels) and len(plot_labels) == len(plot_colors), (
            "The number " + "of boxplot labels and colors do not match the number of " + "boxplots."
        )

        # We now have our data and labels ready, so plot them!
        if plot_data:
            if sort is not None:
                # Sorts all three parallel lists together (by median or
                # alphabetically, per the sort argument).
                plot_data, plot_labels, plot_colors = _sort_distributions(plot_data, plot_labels, plot_colors, sort)

            if width is None:
                # Auto-size the figure width from the number of boxplots.
                width = len(plot_data) * box_width + 2
            if width <= 0 or height <= 0:
                raise ValueError("The specified width and height of the plot " "must be greater than zero.")

            plot_figure = boxplots(
                plot_data,
                x_tick_labels=plot_labels,
                title="%s Distances" % field,
                x_label="Grouping",
                y_label="Distance",
                x_tick_labels_orientation="vertical",
                y_min=y_min,
                y_max=y_max,
                whisker_length=whisker_length,
                box_width=box_width,
                box_colors=plot_colors,
                figure_width=width,
                figure_height=height,
                legend=legend,
            )

            results.append((field, plot_figure, plot_data, plot_labels, plot_colors))
        else:
            raise ValueError(
                "The generation of all plots was suppressed. At " "least one type of plot must be unsuppressed."
            )

    return results
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) otu_table_fp = opts.otu_table_fp otu_table = parse_biom_table(qiime_open(otu_table_fp)) min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\ compute_seqs_per_library_stats(otu_table, opts.num_otus) num_otus = len(otu_table.ObservationIds) counts_per_sample_values = counts_per_sample.values() med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0] even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values) num_samples = len(counts_per_sample) print 'Num samples: %s' % str(num_samples) print 'Num otus: %s' % str(num_otus) if not opts.num_otus: num_observations = sum(counts_per_sample_values) print 'Num observations (sequences): %s' % str(num_observations) # port denisty functionality to a tested function. the following is broken (should be # count of non-zero cells rather than number of observations in the numerator) #print 'Table density (fraction of non-zero values): %1.4f' % (num_observations/(num_samples * num_otus)) print if opts.num_otus: print 'OTUs/sample summary:' else: print 'Seqs/sample summary:' print ' Min: %s' % str(min_counts) print ' Max: %s' % str(max_counts) print ' Median: %s' % str(median_counts) print ' Mean: %s' % str(mean_counts) print ' Std. 
dev.: %s' % (str(std(counts_per_sample_values))) print ' Median Absolute Deviation: %s' % str(med_abs_dev) print ' Default even sampling depth in\n core_qiime_analyses.py (just a suggestion): %s' %\ str(even_sampling_depth) print '' if opts.num_otus: print 'OTUs/sample detail:' else: print 'Seqs/sample detail:' sorted_counts_per_sample = [(v, k) for k, v in counts_per_sample.items()] sorted_counts_per_sample.sort() total_count = 0 for v, k in sorted_counts_per_sample: total_count += v print ' %s: %s' % (k, str(v)) if opts.mapping_fp: if not opts.output_mapping_fp: raise RuntimeError('input mapping file supplied, but no path to'+\ ' output file') f = open(opts.mapping_fp, 'U') mapping_lines, headers, comments = parse_mapping_file(f) f.close() if len(headers) == 1: endoffset = 0 # if we only have the sample id, this data -> last col else: endoffset = 1 # usually make this data the penultimate column. headers.insert(len(headers) - endoffset, 'NumIndividuals') for map_line in mapping_lines: sample_id = map_line try: depth = str(counts_per_sample[map_line[0]]) except KeyError: depth = 'na' map_line.insert(len(map_line) - endoffset, depth) new_map_str = format_mapping_file(headers, mapping_lines, comments) f = open(opts.output_mapping_fp, 'w') f.write(new_map_str) f.close()
def main(): option_parser, opts,args = parse_command_line_parameters(**script_info) otu_table_fp = opts.otu_table_fp otu_table = parse_biom_table(qiime_open(otu_table_fp)) min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\ compute_seqs_per_library_stats(otu_table, opts.num_otus) num_otus = len(otu_table.ObservationIds) counts_per_sample_values = counts_per_sample.values() med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0] even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values) try: sample_md_keys = otu_table.SampleMetadata[0].keys() except TypeError: sample_md_keys = ["None provided"] try: observation_md_keys = otu_table.ObservationMetadata[0].keys() except TypeError: observation_md_keys = ["None provided"] num_samples = len(counts_per_sample) print 'Num samples: %s' % str(num_samples) print 'Num otus: %s' % str(num_otus) if not opts.num_otus: num_observations = sum(counts_per_sample_values) print 'Num observations (sequences): %s' % str(num_observations) print 'Table density (fraction of non-zero values): %1.4f' % \ otu_table.getTableDensity() print if opts.num_otus: print 'OTUs/sample summary:' else: print 'Seqs/sample summary:' print ' Min: %s' % str(min_counts) print ' Max: %s' % str(max_counts) print ' Median: %s' % str(median_counts) print ' Mean: %s' % str(mean_counts) print ' Std. 
dev.: %s' % (str(std(counts_per_sample_values))) print ' Median Absolute Deviation: %s' % str(med_abs_dev) print ' Default even sampling depth in\n core_qiime_analyses.py (just a suggestion): %s' %\ str(even_sampling_depth) print ' Sample Metadata Categories: %s' % '; '.join(sample_md_keys) print ' Observation Metadata Categories: %s' % '; '.join(observation_md_keys) print '' if opts.num_otus: print 'OTUs/sample detail:' else: print 'Seqs/sample detail:' sorted_counts_per_sample = [(v,k) for k,v in counts_per_sample.items()] sorted_counts_per_sample.sort() total_count = 0 for v,k in sorted_counts_per_sample: total_count += v print ' %s: %s' % (k,str(v)) if opts.mapping_fp: if not opts.output_mapping_fp: raise RuntimeError('input mapping file supplied, but no path to'+\ ' output file') f = open(opts.mapping_fp,'U') mapping_lines, headers, comments = parse_mapping_file(f) f.close() if len(headers)==1: endoffset = 0 # if we only have the sample id, this data -> last col else: endoffset = 1 # usually make this data the penultimate column. headers.insert(len(headers)-endoffset,'SequenceCount') for map_line in mapping_lines: sample_id = map_line try: depth = str(counts_per_sample[map_line[0]]) except KeyError: depth = 'na' map_line.insert(len(map_line)-endoffset,depth) new_map_str = format_mapping_file(headers, mapping_lines, comments) f = open(opts.output_mapping_fp, 'w') f.write(new_map_str) f.close()