def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """Split a mapping file into one mapping file per value of a field.

    Arguments:
        - mapping_f: iterable of mapping file lines (e.g. an open file). It
          is copied into a list because it is parsed several times.
        - mapping_field: header of the column whose values define the split.
        - column_rename_ids: optional header name; it is translated to its
          index in the mapping file headers and forwarded to
          filter_mapping_file.
        - include_repeat_cols: forwarded unchanged to filter_mapping_file.

    Yields:
        (value, mapping_str) tuples, where value is the field value with
        spaces replaced by underscores and mapping_str is the output of
        format_mapping_file for the samples having that value.

    Raises:
        KeyError: if column_rename_ids is not a header of the mapping file.
        Because this is a generator, the error surfaces on first iteration.
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)
    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            # The lookup that failed is the one for column_rename_ids, so
            # that is the field that has to be reported to the user (the
            # name is still bound to the header string at this point).
            raise KeyError("Field is not in mapping file (search is case "
                           "and white-space sensitive). \n\tProvided field: "
                           "%s. \n\tValid fields: %s" %
                           (column_rename_ids, ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        # parse mapping file each time through the loop as filtering
        # operates on values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
def silly_function(ui):
    """Write per-color subject lists and, optionally, trajectory files.

    For every value in the module-level coloring_values a file named
    'color_by_<value>.txt' is written to output_path with one subject per
    line. Unless suppress_trajectory_files is set, FILTER_CMD and then
    CONVERSION_CMD are run through qiime_system_call for each subject.

    Arguments:
        - ui: object whose series() method wraps the iterables that are
          looped over (presumably for progress reporting -- confirm against
          the caller).
    """
    for c_value in ui.series(coloring_values):
        # close the mapping file after each pass instead of leaking one
        # handle per coloring value
        with open(mapping_fp, 'U') as mapping_f:
            sample_ids = sample_ids_from_metadata_description(
                mapping_f, '%s:%s' % (coloring_header_name, c_value))

        _headers, _data = filter_mapping_file(data, headers, sample_ids, True)
        per_color_subject_values = list(
            set([row[subject_index] for row in _data]))

        color_fp = join(output_path, 'color_by_' + c_value + '.txt')
        with open(color_fp, 'w') as fd:
            for s in ui.series(per_color_subject_values):
                fd.write('%s\n' % s)

        if suppress_trajectory_files:
            continue

        for s in ui.series(per_color_subject_values):
            filename = join(output_path, s + '.txt')

            if opts.verbose:
                print('Working on printing %s' % filename)

            COMMAND_CALL = FILTER_CMD % (
                coords_fp, mapping_fp,
                '%s:%s' % (subject_header_name, s), filename,
                sorting_category)
            o, e, r = qiime_system_call(COMMAND_CALL)
            # NOTE(review): the conversion step is only skipped when running
            # verbosely; in quiet mode it runs even if filtering wrote to
            # stderr. Kept as-is -- confirm whether this is intended.
            if opts.verbose and e:
                print('Error happened on filtering step: \n%s' % e)
                continue

            COMMAND_CALL = CONVERSION_CMD % (filename, filename)
            o, e, r = qiime_system_call(COMMAND_CALL)
            if opts.verbose and e:
                print('Error happened on conversion step: \n%s' % e)
                continue  # useless here but just in case
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """Split a mapping file into one mapping file per value of a field.

    Arguments:
        - mapping_f: iterable of mapping file lines (e.g. an open file). It
          is copied into a list because it is parsed several times.
        - mapping_field: header of the column whose values define the split.
        - column_rename_ids: optional header name; it is translated to its
          index in the mapping file headers and forwarded to
          filter_mapping_file.
        - include_repeat_cols: forwarded unchanged to filter_mapping_file.

    Yields:
        (value, mapping_str) tuples, where value is the field value with
        spaces replaced by underscores and mapping_str is the output of
        format_mapping_file for the samples having that value.

    Raises:
        KeyError: if column_rename_ids is not a header of the mapping file.
        Because this is a generator, the error surfaces on first iteration.
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)
    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            # Report the header that was actually searched for
            # (column_rename_ids), not the unrelated mapping_field.
            raise KeyError("Field is not in mapping file (search is case "
                           "and white-space sensitive). \n\tProvided field: "
                           "%s. \n\tValid fields: %s" %
                           (column_rename_ids, ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        # parse mapping file each time through the loop as filtering
        # operates on values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
def main():
    """Compute the selectors table for a study and write it to a file.

    Reads the mapping file and BIOM table given on the command line, keeps
    only the mapping rows of samples present in the BIOM table, and writes
    the rows produced by make_selectors to the path given by opts.clean_fp.

    Raises:
        ValueError: if the subject column is not among the headers left
        after filtering the mapping file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    map_fp = opts.mapping
    biom_fp = opts.biom_file
    min_seqs_sample = opts.min_seqs_sample
    subject_category = opts.subject_name
    cleaned_fp = opts.clean_fp
    verbose = opts.verbose

    # context managers so the input handles are released once parsed
    with open(map_fp, 'U') as map_f:
        map_data, headers, comments = parse_mapping_file(map_f)
    with open(biom_fp, 'U') as biom_f:
        biom_table = parse_biom_table(biom_f)

    # getting valid samples from biom file
    real_map_headers, real_map_data = filter_mapping_file(
        map_data, headers, biom_table.SampleIds, include_repeat_cols=False)

    if subject_category not in real_map_headers:
        raise ValueError(
            'This column: %s is not in the mapping file, try %s' %
            (subject_category, real_map_headers))

    sorted_counts_per_sample = get_sorted_counts_per_sample(biom_table)

    mapping_file_tuple = (real_map_data, real_map_headers)

    # calculate the available subjects at each rarefaction level
    # NOTE(review): this expects make_selectors to return only the list of
    # rows; confirm against the make_selectors this script imports.
    results = make_selectors(sorted_counts_per_sample, min_seqs_sample,
                             mapping_file_tuple, subject_category,
                             verbose=verbose)

    # save the output
    with open(cleaned_fp, 'w') as fout:
        fout.write('#Sequences\tSubjects\tSamples\tMetadata\n')
        fout.write('\n'.join(results))
def main():
    """Filter the samples of an OTU table and optionally its mapping file.

    Samples can be selected by metadata (--mapping_fp with --valid_states),
    by sequence count (min/max count) and/or by a file listing sample ids.
    The filtered table is written to output_fp; when output_mapping_fp is
    given, a mapping file restricted to the remaining samples is written
    too. Invalid option combinations and an empty result are reported
    through option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination "
            "of those)."
        )
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        with open(mapping_fp, "U") as mapping_f:
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states)
        # negation only applies to ids coming from --sample_id_fp
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            # Blank lines are skipped: previously they raised IndexError on
            # the [0] lookup.
            with open(sample_id_fp, "U") as sample_id_f:
                sample_id_f_ids = set(
                    line.strip().split()[0] for line in sample_id_f
                    if line.strip() and not line.startswith("#"))
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table, sample_ids_to_keep, min_count, max_count,
        negate_ids_to_keep=negate_sample_id_fp
    )

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering."
        )

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, "U") as mapping_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        with open(output_mapping_fp, "w") as output_mapping_f:
            output_mapping_f.write(
                format_mapping_file(mapping_headers, mapping_data))
def main():
    """Filter the samples of an OTU table and optionally its mapping file.

    Samples can be selected by metadata (mapping_fp with valid_states), by
    sequence count (min/max count) and/or by a file listing sample ids. The
    filtered table is written to output_fp; when output_mapping_fp is given,
    a mapping file restricted to the remaining samples is written too.
    Invalid option combinations are reported through option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(input_fp)

    if mapping_fp and valid_states:
        with open(mapping_fp, 'U') as mapping_f:
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        # Blank lines are skipped: previously they raised IndexError on the
        # [0] lookup. The handle is closed as soon as the ids are read.
        with open(sample_id_fp, 'U') as sample_id_f:
            sample_id_f_ids = set(
                line.strip().split()[0] for line in sample_id_f
                if line.strip() and not line.startswith('#'))
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        with open(output_mapping_fp, 'w') as output_mapping_f:
            output_mapping_f.write(
                format_mapping_file(mapping_headers, mapping_data))
def main():
    """Filter the samples of an OTU table and optionally its mapping file.

    Samples can be selected by metadata (mapping_fp with valid_states), by
    sequence count (min/max count) and/or by a file listing sample ids. The
    filtered table is written to output_fp; when output_mapping_fp is given,
    a mapping file restricted to the remaining samples is written too.
    Invalid option combinations are reported through option_parser.error.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    with open(input_fp, 'U') as input_f:
        otu_table = parse_biom_table(input_f)

    if mapping_fp and valid_states:
        with open(mapping_fp, 'U') as mapping_f:
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds

    if sample_id_fp is not None:
        # Blank lines are skipped: previously they raised IndexError on the
        # [0] lookup.
        with open(sample_id_fp, 'U') as sample_id_f:
            sample_id_f_ids = set(
                line.strip().split()[0] for line in sample_id_f
                if line.strip() and not line.startswith('#'))
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)

    # The output file is opened only once there is something to write, so a
    # failure in the steps above no longer leaves an empty file behind.
    with open(output_fp, 'w') as output_f:
        output_f.write(format_biom_table(filtered_otu_table))

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.SampleIds)
        with open(output_mapping_fp, 'w') as output_mapping_f:
            output_mapping_f.write(
                format_mapping_file(mapping_headers, mapping_data))
message += ' Offending sample identifier(s): %s.' %\ ', '.join(sids_difference) print sids_difference option_parser.error(message) if number_intersected_sids != required_number_of_sids and\ ignore_missing_samples: # keep only the samples that are mapped in the mapping file coords_headers, coords_data = keep_samples_from_pcoa_data( coords_headers, coords_data, sids_intersection) # ignore samples that exist in the coords but not in the mapping file, note: # we're using sids_intersection so if --ignore_missing_samples is enabled we # account for unmapped coords, else the program will exit before this point header, mapping_data = filter_mapping_file(mapping_data, header, sids_intersection, include_repeat_cols=True) # catch the errors that could occur when filling the mapping file values if missing_custom_axes_values: try: # the fact that this uses parse_metadata_state_descriptions makes # the following option '-x Category:7;PH:12' to work as well as the # script-interface-documented '-x Category:7 -x PH:12' option for val in missing_custom_axes_values: if ':' not in val: option_parser.error("Not valid missing value for custom " "axes: %s" % val) mapping_data = fill_mapping_field_from_mapping_file(mapping_data, header, ';'.join(missing_custom_axes_values)) except AssertionError, e:
def make_selectors(counts_per_sample, minimum, mapping_file_tuple,
                   subject_header_name, verbose=False):
    """Make the four column strings needed to print in the selectors file.

    Inputs:
    counts_per_sample: a sorted list of tuples holding the number of
    sequences followed by the sample identifier, i.e. (count, sample_id).
    minimum: minimum number of sequences considered to be a valid state.
    mapping_file_tuple: a tuple with the data of a mapping file and the
    headers, in that order.
    subject_header_name: string identifying the name of the column in the
    mapping file that represents a unique subject.
    verbose: if True, print the per-subject bookkeeping for every depth that
    is examined.

    Output:
    results: list of tab-separated four column strings: number of sequences,
    subjects, number of samples and metadata fields.
    main_map_cat: the filtered mapping headers at the first reported row, or
    an empty list if no row was reported.

    Raises ValueError if subject_header_name is not in the mapping headers.
    """
    # unwrap the mapping file
    mapping_data = mapping_file_tuple[0]
    mapping_headers = mapping_file_tuple[1]

    head_val = None
    subj_val = None
    samp_sub = None
    results = []
    # bound up front so the return below is valid even when no row is ever
    # reported (this used to raise UnboundLocalError)
    main_map_cat = []
    depth = -1

    # store the index for convenience
    subject_index = mapping_headers.index(subject_header_name)
    list_of_subjects = [line[subject_index] for line in mapping_data]

    # nothing to select from an empty mapping file
    if not list_of_subjects:
        return results, main_map_cat

    # count the samples of every subject in a single pass
    sample_tally = {}
    for subject in list_of_subjects:
        sample_tally[subject] = sample_tally.get(subject, 0) + 1

    # every subject starts with the minimum number of samples among them
    least_number_of_samples = min(sample_tally.values())
    samples_per_subject = dict.fromkeys(sample_tally,
                                        least_number_of_samples)

    # sample id -> subject; the first row wins for repeated sample ids
    subject_by_sample = {}
    for line in mapping_data:
        subject_by_sample.setdefault(line[0], line[subject_index])

    for sequences_per_sample_tuple in counts_per_sample:
        current_depth = sequences_per_sample_tuple[0]

        # there's no need to iterate if the minimum rarefaction depth is not
        # met or if the depth is the same as the previous depth, this would
        # mean a repeated row in the output with the same values
        if current_depth < minimum or current_depth == depth:
            continue

        if verbose:
            print('Samples per subject: {0} @ depth: {1}'
                  .format(samples_per_subject, depth))

        # Some samples are not in the mapping file, just print those out
        sample_id = sequences_per_sample_tuple[1]
        if sample_id not in subject_by_sample:
            print('Sample Id: {0} is not in the mapping file'
                  .format(sample_id))
            continue
        current_subject = subject_by_sample[sample_id]

        # extract convenience data for ease of use
        depth = current_depth
        remaining_ids = [_tuple[1] for _tuple in counts_per_sample
                         if _tuple[0] >= depth]
        filtered_headers, _ = filter_mapping_file(
            mapping_data, mapping_headers, remaining_ids,
            include_repeat_cols=False)

        # Breaking when there are no subjects/individuals left
        if subject_header_name not in filtered_headers:
            break
        if not samples_per_subject:
            break

        # numbers to be written in the selectors file
        number_of_subjects = len(samples_per_subject)
        number_of_samples = min(samples_per_subject.values())

        if number_of_subjects * number_of_samples < 3:
            continue

        # format the output: a row is only added when the headers, the
        # samples per subject or the number of subjects changed
        prefix = '%d\t%d\t%d' % (int(depth), number_of_subjects,
                                 number_of_samples)
        if not subj_val and not head_val and not samp_sub:
            results.append('%s\t%s' % (prefix,
                                       ','.join(filtered_headers[1:-1])))
            subj_val = number_of_subjects
            head_val = filtered_headers
            samp_sub = number_of_samples
            main_map_cat = filtered_headers
        elif head_val != filtered_headers:
            results.append('%s\t%s' % (prefix,
                                       ','.join(filtered_headers[1:-1])))
            head_val = filtered_headers
        elif samp_sub != number_of_samples:
            results.append('%s\tNone' % prefix)
            samp_sub = number_of_samples
        elif subj_val != number_of_subjects:
            results.append('%s\tNone' % prefix)
            subj_val = number_of_subjects

        # remove the current processed sample and if needed, remove the
        # subject; the subject may already be gone because the bookkeeping
        # starts from the least number of samples, not the real count
        try:
            samples_per_subject[current_subject] -= 1
        except KeyError:
            pass
        else:
            if samples_per_subject[current_subject] == 0:
                del samples_per_subject[current_subject]

    return results, main_map_cat
def get_field_state_comparisons(dist_matrix_header, dist_matrix,
                                mapping_header, mapping, field,
                                comparison_field_states,
                                suppress_symmetry_and_hollowness_check=False):
    """Returns a 2D dictionary of distances between groups of field states.

    The top-level keys are the states of field that are NOT listed in
    comparison_field_states. The second-level keys are the states listed in
    comparison_field_states. The value stored under (state, comparison
    state) is the list of distances between the two, or an empty list when
    no such grouping exists.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as
    input.

    Arguments:
        - dist_matrix_header: The distance matrix header, obtained from
          parse.parse_distmat()
        - dist_matrix: The distance matrix, obtained from
          parse.parse_distmat().
        - mapping_header: The mapping file header, obtained from
          parse.parse_mapping_file()
        - mapping: The mapping file's contents, obtained from
          parse.parse_mapping_file()
        - field: A field in the mapping file to do the comparisons on.
        - comparison_field_states: A non-empty list of strings naming the
          field states that are compared to all other field states.
        - suppress_symmetry_and_hollowness_check: Passed on to
          get_grouped_distances. By default the distance matrix is checked
          for symmetry and hollowness; disable the check only if you *know*
          the matrix is symmetric and hollow and need the small speed gain
          on extremely large matrices.

    Raises ValueError if comparison_field_states is empty or contains a
    state that does not occur in field.
    """
    _validate_input(dist_matrix_header, dist_matrix, mapping_header, mapping,
                    field)

    # avoid empty groups of distances
    mapping_header, mapping = filter_mapping_file(mapping, mapping_header,
                                                  dist_matrix_header)

    if not comparison_field_states:
        raise ValueError("You must provide at least one field state to "
                         "compare to all of the other field states.")

    # Every requested comparison state has to exist in the chosen field.
    groups = group_by_field([mapping_header] + list(mapping), field)
    unknown_states = [state for state in comparison_field_states
                      if state not in groups]
    if unknown_states:
        raise ValueError("The comparison group field state '%s' is not in "
                         "the provided mapping file's field '%s'."
                         % (unknown_states[0], field))

    # The remaining states are the ones the comparison states are compared
    # against.
    other_states = [state for state in groups.keys()
                    if state not in comparison_field_states]

    between_groupings = get_grouped_distances(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        within=False,
        suppress_symmetry_and_hollowness_check=
        suppress_symmetry_and_hollowness_check)

    # Pick, out of all between-state groupings, the ones that pair a
    # comparison state with one of the other states.
    result = {}
    for state in other_states:
        per_comparison = {}
        result[state] = per_comparison
        for comp_state in comparison_field_states:
            distances = []
            for group in between_groupings:
                first, second = group[0], group[1]
                has_state = first == state or second == state
                has_comp = first == comp_state or second == comp_state
                if has_state and has_comp:
                    # the last matching grouping wins
                    distances = group[2]
            per_comparison[comp_state] = distances
    return result
def get_field_state_comparisons(dist_matrix_header, dist_matrix,
                                mapping_header, mapping, field,
                                comparison_field_states,
                                suppress_symmetry_and_hollowness_check=False):
    """Returns a 2D dictionary of distances between groups of field states.

    The top-level keys are the states of field that are NOT listed in
    comparison_field_states. The second-level keys are the states listed in
    comparison_field_states. The value stored under (state, comparison
    state) is the list of distances between the two, or an empty list when
    no such grouping exists.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as
    input.

    Arguments:
        - dist_matrix_header: The distance matrix header, obtained from
          parse.parse_distmat()
        - dist_matrix: The distance matrix, obtained from
          parse.parse_distmat().
        - mapping_header: The mapping file header, obtained from
          parse.parse_mapping_file()
        - mapping: The mapping file's contents, obtained from
          parse.parse_mapping_file()
        - field: A field in the mapping file to do the comparisons on.
        - comparison_field_states: A non-empty list of strings naming the
          field states that are compared to all other field states.
        - suppress_symmetry_and_hollowness_check: Passed on to
          get_grouped_distances. By default the distance matrix is checked
          for symmetry and hollowness; disable the check only if you *know*
          the matrix is symmetric and hollow and need the small speed gain
          on extremely large matrices.

    Raises ValueError if comparison_field_states is empty or contains a
    state that does not occur in field.
    """
    _validate_input(dist_matrix_header, dist_matrix, mapping_header, mapping,
                    field)

    # avoid empty groups of distances
    mapping_header, mapping = filter_mapping_file(mapping, mapping_header,
                                                  dist_matrix_header)

    if not comparison_field_states:
        raise ValueError("You must provide at least one field state to "
                         "compare to all of the other field states.")

    # Every requested comparison state has to exist in the chosen field.
    groups = group_by_field([mapping_header] + list(mapping), field)
    unknown_states = [state for state in comparison_field_states
                      if state not in groups]
    if unknown_states:
        raise ValueError("The comparison group field state '%s' is not in "
                         "the provided mapping file's field '%s'."
                         % (unknown_states[0], field))

    # The remaining states are the ones the comparison states are compared
    # against.
    other_states = [state for state in groups.keys()
                    if state not in comparison_field_states]

    between_groupings = get_grouped_distances(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        within=False,
        suppress_symmetry_and_hollowness_check=
        suppress_symmetry_and_hollowness_check)

    # Pick, out of all between-state groupings, the ones that pair a
    # comparison state with one of the other states.
    result = {}
    for state in other_states:
        per_comparison = {}
        result[state] = per_comparison
        for comp_state in comparison_field_states:
            distances = []
            for group in between_groupings:
                first, second = group[0], group[1]
                has_state = first == state or second == state
                has_comp = first == comp_state or second == comp_state
                if has_state and has_comp:
                    # the last matching grouping wins
                    distances = group[2]
            per_comparison[comp_state] = distances
    return result
def make_selectors(counts_per_sample, minimum, mapping_file_tuple,
                   subject_header_name, verbose=False):
    """Make the four column strings needed to print in the selectors file.

    Inputs:
    counts_per_sample: a sorted list of tuples holding the number of
    sequences followed by the sample identifier, i.e. (count, sample_id).
    minimum: minimum number of sequences considered to be a valid state.
    mapping_file_tuple: a tuple with the data of a mapping file and the
    headers, in that order.
    subject_header_name: string identifying the name of the column in the
    mapping file that represents a unique subject.
    verbose: if True, print the per-subject bookkeeping for every depth that
    is examined.

    Output:
    results: list of tab-separated four column strings: number of sequences,
    subjects, number of samples and metadata fields.
    main_map_cat: the filtered mapping headers at the first reported row, or
    an empty list if no row was reported.

    Raises ValueError if subject_header_name is not in the mapping headers.
    """
    # unwrap the mapping file
    mapping_data = mapping_file_tuple[0]
    mapping_headers = mapping_file_tuple[1]

    head_val = None
    subj_val = None
    samp_sub = None
    results = []
    # bound up front so the return below is valid even when no row is ever
    # reported (this used to raise UnboundLocalError)
    main_map_cat = []
    depth = -1

    # store the index for convenience
    subject_index = mapping_headers.index(subject_header_name)
    list_of_subjects = [line[subject_index] for line in mapping_data]

    # nothing to select from an empty mapping file
    if not list_of_subjects:
        return results, main_map_cat

    # count the samples of every subject in a single pass
    sample_tally = {}
    for subject in list_of_subjects:
        sample_tally[subject] = sample_tally.get(subject, 0) + 1

    # every subject starts with the minimum number of samples among them
    least_number_of_samples = min(sample_tally.values())
    samples_per_subject = dict.fromkeys(sample_tally,
                                        least_number_of_samples)

    # sample id -> subject; the first row wins for repeated sample ids
    subject_by_sample = {}
    for line in mapping_data:
        subject_by_sample.setdefault(line[0], line[subject_index])

    for sequences_per_sample_tuple in counts_per_sample:
        current_depth = sequences_per_sample_tuple[0]

        # there's no need to iterate if the minimum rarefaction depth is not
        # met or if the depth is the same as the previous depth, this would
        # mean a repeated row in the output with the same values
        if current_depth < minimum or current_depth == depth:
            continue

        if verbose:
            print('Samples per subject: {0} @ depth: {1}'
                  .format(samples_per_subject, depth))

        # Some samples are not in the mapping file, just print those out
        sample_id = sequences_per_sample_tuple[1]
        if sample_id not in subject_by_sample:
            print('Sample Id: {0} is not in the mapping file'
                  .format(sample_id))
            continue
        current_subject = subject_by_sample[sample_id]

        # extract convenience data for ease of use
        depth = current_depth
        remaining_ids = [_tuple[1] for _tuple in counts_per_sample
                         if _tuple[0] >= depth]
        filtered_headers, _ = filter_mapping_file(
            mapping_data, mapping_headers, remaining_ids,
            include_repeat_cols=False)

        # Breaking when there are no subjects/individuals left
        if subject_header_name not in filtered_headers:
            break
        if not samples_per_subject:
            break

        # numbers to be written in the selectors file
        number_of_subjects = len(samples_per_subject)
        number_of_samples = min(samples_per_subject.values())

        if number_of_subjects * number_of_samples < 3:
            continue

        # format the output: a row is only added when the headers, the
        # samples per subject or the number of subjects changed
        prefix = '%d\t%d\t%d' % (int(depth), number_of_subjects,
                                 number_of_samples)
        if not subj_val and not head_val and not samp_sub:
            results.append('%s\t%s' % (prefix,
                                       ','.join(filtered_headers[1:-1])))
            subj_val = number_of_subjects
            head_val = filtered_headers
            samp_sub = number_of_samples
            main_map_cat = filtered_headers
        elif head_val != filtered_headers:
            results.append('%s\t%s' % (prefix,
                                       ','.join(filtered_headers[1:-1])))
            head_val = filtered_headers
        elif samp_sub != number_of_samples:
            results.append('%s\tNone' % prefix)
            samp_sub = number_of_samples
        elif subj_val != number_of_subjects:
            results.append('%s\tNone' % prefix)
            subj_val = number_of_subjects

        # remove the current processed sample and if needed, remove the
        # subject; the subject may already be gone because the bookkeeping
        # starts from the least number of samples, not the real count
        try:
            samples_per_subject[current_subject] -= 1
        except KeyError:
            pass
        else:
            if samples_per_subject[current_subject] == 0:
                del samples_per_subject[current_subject]

    return results, main_map_cat
def main():
    """Filter the samples of an OTU table and optionally its mapping file.

    Samples can be selected by metadata (--mapping_fp with --valid_states),
    by sequence count (min/max count) and/or by a file listing sample ids.
    The filtered table is written to output_fp; when output_mapping_fp is
    given, a mapping file restricted to the remaining samples is written
    too. Invalid option combinations and an empty result are reported
    through option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if (mapping_fp is None and valid_states is not None):
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        with open(mapping_fp, 'U') as mapping_f:
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states)
        # negation only applies to ids coming from --sample_id_fp
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            # Blank lines are skipped: previously they raised IndexError on
            # the [0] lookup.
            with open(sample_id_fp, 'U') as sample_id_f:
                sample_id_f_ids = set(
                    line.strip().split()[0] for line in sample_id_f
                    if line.strip() and not line.startswith('#'))
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table,
        sample_ids_to_keep,
        min_count,
        max_count,
        negate_ids_to_keep=negate_sample_id_fp)

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        with open(output_mapping_fp, 'w') as output_mapping_f:
            output_mapping_f.write(
                format_mapping_file(mapping_headers, mapping_data))
def main():
    """Run the selectors, beta diversity and alpha rarefaction workflow.

    In order, this:
      1. creates the output directory (reusing an existing one only when
         opts.force is set);
      2. writes selectors.txt, mapping_file.txt and study_preferences.txt;
      3. writes a copy of the OTU table filtered with
         min_count=seqs_per_sample for the alpha rarefaction step;
      4. loads the parameters file, or builds default parameters;
      5. copies the raw OTU table and runs
         run_beta_diversity_through_plots followed by run_alpha_rarefaction
         (the latter writes to the 'alpha' sub-directory).
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    otu_table_fp = opts.otu_table_fp
    output_dir = opts.output_dir
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    verbose = opts.verbose
    print_only = opts.print_only
    seqs_per_sample = int(opts.seqs_per_sample)
    parallel = opts.parallel
    min_seqs_sample = opts.min_seqs_sample
    subject_category = opts.subject_name

    try:
        makedirs(output_dir)
    except OSError:
        if not opts.force:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error(
                "Output directory already exists. Please choose"
                " a different directory, or force overwrite with -f.")

    ## ******************** make_evident_selectors ********************
    ## The code for make_evident_selectors.py is here and has to go before
    ## the params validation as we need to know the main cats before
    ## creating the params file
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)
    with open(otu_table_fp, 'U') as otu_table_f:
        biom_table = parse_biom_table(otu_table_f)

    # getting valid samples from biom file
    real_map_headers, real_map_data = filter_mapping_file(
        map_data, headers, biom_table.SampleIds, include_repeat_cols=False)

    if subject_category not in real_map_headers:
        option_parser.error(
            'This column: %s is not in the mapping file, try %s' %
            (subject_category, real_map_headers))

    sorted_counts_per_sample = get_sorted_counts_per_sample(biom_table)

    mapping_file_tuple = (real_map_data, real_map_headers)

    # calculate the available subjects at each rarefaction level
    results, main_map_cat = make_selectors(
        sorted_counts_per_sample, min_seqs_sample, mapping_file_tuple,
        subject_category, verbose=verbose)

    with open(join(output_dir, 'selectors.txt'), 'w') as fout:
        fout.write('#Sequences\tSubjects\tSamples\tMetadata\n')
        fout.write('\n'.join(results))

    with open(join(output_dir, 'mapping_file.txt'), 'w') as fout:
        fout.write(format_mapping_file(real_map_headers, real_map_data))
    ## ******************** make_evident_selectors ********************

    with open(join(output_dir, 'study_preferences.txt'), 'w') as fout:
        fout.write('%d\n' % seqs_per_sample)
        fout.write('%s\n' % subject_category)

    ## ******************** filter_samples_from_otu_table ********************
    ## Filtering original biom file to only have samples above the max
    ## length to avoid ugly plots
    alpha_biom_file = join(output_dir, 'filtered_otu_table_for_alpha.biom')
    sample_ids_to_keep = biom_table.SampleIds
    filtered_otu_table = filter_samples_from_otu_table(
        biom_table,
        sample_ids_to_keep,
        min_count=seqs_per_sample,
        max_count=inf)
    with open(alpha_biom_file, 'w') as fout:
        fout.write(format_biom_table(filtered_otu_table))
    ## ******************** filter_samples_from_otu_table ********************

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            option_parser.error(
                "Can't open parameters file (%s). Does it exist? "
                "Do you have read access?" % opts.parameter_fp)
        with parameter_f:
            params = parse_qiime_parameters(parameter_f)
    else:
        # defaults used when no parameters file is given
        params = parse_qiime_parameters([
            'beta_diversity:metrics unweighted_unifrac',
            'make_rarefaction_plots:prefs_path %s' %
            join(output_dir, 'prefs.txt'),
            'make_rarefaction_plots:colorby %s' % ','.join(main_map_cat),
            'make_rarefaction_plots:output_type memory',
            'multiple_rarefactions:min %d' % int(seqs_per_sample / 4),
            'multiple_rarefactions:max %d' % (seqs_per_sample + 1),
            'multiple_rarefactions:step %d' % int(seqs_per_sample / 4),
            'multiple_rarefactions:num-reps 4',
        ])

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser)

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    copyfile(otu_table_fp, join(output_dir, 'raw.biom'))

    run_beta_diversity_through_plots(
        otu_table_fp=otu_table_fp,
        mapping_fp=mapping_fp,
        output_dir=output_dir,
        command_handler=command_handler,
        params=params,
        qiime_config=qiime_config,
        color_by_interesting_fields_only=False,
        sampling_depth=seqs_per_sample,
        histogram_categories=None,
        tree_fp=tree_fp,
        parallel=parallel,
        suppress_3d_plots=True,
        suppress_2d_plots=True,
        status_update_callback=status_update_callback)

    # the alpha rarefaction results go to their own sub-directory
    output_dir = join(output_dir, 'alpha')
    run_alpha_rarefaction(
        otu_table_fp=alpha_biom_file,
        mapping_fp=mapping_fp,
        output_dir=output_dir,
        command_handler=command_handler,
        params=params,
        qiime_config=qiime_config,
        tree_fp=tree_fp,
        num_steps=4,
        parallel=parallel,
        min_rare_depth=10,
        max_rare_depth=20,
        status_update_callback=status_update_callback,
        plot_stderr_and_stddev=True)