def main():
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    output_f = open(opts.output_distance_matrix, 'w')
    if opts.otu_table_fp:
        otu_table = parse_biom_table(open(opts.otu_table_fp, 'U'))
        samples_to_keep = otu_table.SampleIds
        # samples_to_keep = \
        #     sample_ids_from_otu_table(open(opts.otu_table_fp,'U'))
    elif opts.sample_id_fp:
        samples_to_keep = \
            get_seqs_to_keep_lookup_from_seq_id_file(
                open(opts.sample_id_fp, 'U'))
    elif opts.mapping_fp and opts.valid_states:
        samples_to_keep = sample_ids_from_metadata_description(
            open(opts.mapping_fp, 'U'), opts.valid_states)
    else:
        option_parser.error(
            'must pass either --sample_id_fp, -t, or -m and -s')
    # note that negate gets a little weird here. The function we're calling
    # removes the specified samples from the distance matrix, but the other
    # QIIME filter scripts keep these samples specified. So, the interface of
    # this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    d = filter_samples_from_distance_matrix(
        parse_distmat(open(opts.input_distance_matrix, 'U')),
        samples_to_keep,
        negate=not opts.negate)
    output_f.write(d)
    output_f.close()
def split_otu_table_on_sample_metadata(otu_table, mapping_f, mapping_field):
    """ split otu table into sub otu tables where each represents samples
    corresponding to only a certain value in mapping_field
    """
    with errstate(empty='raise'):
        mapping_f = list(mapping_f)
        mapping_values = get_mapping_values(mapping_f, mapping_field)

        tables = 0
        for v in mapping_values:
            v_fp_str = v.replace(' ', '_')
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

            try:
                # filtering cannot be inplace, otherwise we lose data
                filtered_otu_table = otu_table.filter(
                    lambda values, id_, metadata: id_ in sample_ids_to_keep,
                    axis='sample', inplace=False)
                tables += 1
            except TableException:
                # all samples are filtered out, so no otu table to write
                continue
            yield v_fp_str, filtered_otu_table

        if not tables:
            raise OTUTableSplitError(
                "Could not split OTU tables! There are no matches between "
                "the sample identifiers in the OTU table and the mapping "
                "file.")
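# Hedged usage sketch for split_otu_table_on_sample_metadata as defined above.
# The file names and the 'Treatment' field are hypothetical examples;
# load_table and write_biom_table are assumed to come from biom / qiime.util,
# as in the other snippets in this collection.
from biom import load_table
from qiime.util import write_biom_table

table = load_table('otu_table.biom')  # hypothetical input table
mapping_lines = list(open('mapping.txt', 'U'))  # hypothetical mapping file
for value, sub_table in split_otu_table_on_sample_metadata(table,
                                                           mapping_lines,
                                                           'Treatment'):
    write_biom_table(sub_table, 'otu_table_%s.biom' % value)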
def silly_function(ui):
    for c_value in ui.series(coloring_values):
        sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'),
            '%s:%s' % (coloring_header_name, c_value))
        _headers, _data = filter_mapping_file(data, headers, sample_ids, True)
        per_color_subject_values = list(
            set([row[subject_index] for row in _data]))

        fd = open(join(output_path, 'color_by_' + c_value + '.txt'), 'w')
        for s in ui.series(per_color_subject_values):
            fd.write('%s\n' % s)
        fd.close()

        if not suppress_trajectory_files:
            for s in ui.series(per_color_subject_values):
                filename = join(output_path, s + '.txt')
                if opts.verbose:
                    print 'Working on printing', filename

                COMMAND_CALL = FILTER_CMD % (coords_fp, mapping_fp,
                                             '%s:%s' % (subject_header_name,
                                                        s),
                                             filename, sorting_category)
                o, e, r = qiime_system_call(COMMAND_CALL)
                if opts.verbose and e:
                    print 'Error happened on filtering step: \n%s' % e
                    continue

                COMMAND_CALL = CONVERSION_CMD % (filename, filename)
                o, e, r = qiime_system_call(COMMAND_CALL)
                if opts.verbose and e:
                    print 'Error happened on conversion step: \n%s' % e
                    continue  # useless here but just in case
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.input_fp
    out_mapping_fp = opts.output_fp
    valid_states = opts.valid_states

    if opts.sample_id_fp:
        valid_sample_ids = \
            get_seqs_to_keep_lookup_from_seq_id_file(
                open(opts.sample_id_fp, 'U'))
    elif mapping_fp and valid_states:
        valid_sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)

    data, headers, _ = parse_mapping_file(open(mapping_fp, 'U'))

    good_mapping_file = []
    for line in data:
        if line[0] in valid_sample_ids:
            good_mapping_file.append(line)

    lines = format_mapping_file(headers, good_mapping_file)
    fd = open(out_mapping_fp, 'w')
    fd.write(lines)
    fd.close()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    coords_fp = opts.input_coords
    mapping_fp = opts.mapping_fp
    output_fp = opts.output_fp
    valid_states = opts.valid_states
    negate = opts.negate
    mapping_header_name = opts.mapping_header_name

    coords_ids, coords, eigen_values, pct_exp = parse_coords(
        open(coords_fp, "U"))
    data, headers, _ = parse_mapping_file(open(mapping_fp, "U"))

    if mapping_fp and valid_states:
        valid_sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, "U"), valid_states)
        valid_coords_ids, valid_coords = filter_sample_ids_from_coords(
            coords_ids, coords, valid_sample_ids, negate)

    if mapping_header_name:
        sorted_sample_ids = sort_sample_ids(data, headers,
                                            mapping_header_name)
        sorted_coord_ids, sorted_coords = sort_coords(
            valid_coords_ids, valid_coords, sorted_sample_ids)
        valid_coords_ids, valid_coords = sorted_coord_ids, sorted_coords

    lines = format_coords(valid_coords_ids, valid_coords, eigen_values,
                          pct_exp)
    fd = open(output_fp, "w")
    fd.writelines(lines)
    # the original wrote "fd.close" without parentheses, which never calls
    # close(); call it explicitly
    fd.close()
def make_profiles_by_category(mapping_fp, taxa_level, category):
    """ Creates a list of profiles for each unique value in the category

    Inputs:
        mapping_fp: filepath to the mapping file
        category: mapping file category to split data over
            defaults to HOST_SUBJECT_ID

    Returns a dictionary keyed by the values on that category and a list of
    profiles as values
    """
    # Parse the mapping file
    map_f = open(mapping_fp, 'U')
    mapping_data, comments = parse_mapping_file_to_dict(map_f)
    map_f.close()

    # Get a list of unique keys for the specified category
    if category == 'SampleID':
        result = {}
        for sid in mapping_data:
            result[sid] = [make_profile_by_sid(mapping_data, sid, taxa_level)]
    else:
        values = set([mapping_data[sid][category] for sid in mapping_data])
        result = {}
        # Loop over each value in that category
        for value in values:
            # Re-open the mapping file
            map_f = open(mapping_fp, 'U')
            # Get sample ids that match the value
            sids = sample_ids_from_metadata_description(
                map_f, category + ":" + value)
            map_f.close()
            # Create the list with all the profiles of the sample IDs in this
            # category value
            result[value] = [make_profile_by_sid(mapping_data, sid,
                                                 taxa_level)
                             for sid in sids]
    return result
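# Hedged usage sketch for make_profiles_by_category as defined above: the
# mapping file path, taxonomic level, and category are hypothetical examples,
# not values from the original scripts.
profiles = make_profiles_by_category('mapping.txt', taxa_level=3,
                                     category='HOST_SUBJECT_ID')
for value, profile_list in profiles.items():
    print '%s: %d profiles' % (value, len(profile_list))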
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case "
                           "and white-space sensitive). \n\tProvided field: "
                           "%s. \n\tValid fields: %s" %
                           (mapping_field, ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        # parse mapping file each time through the loop as filtering operates
        # on values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
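# Hedged usage sketch for split_mapping_file_on_field: each yielded item is a
# (field value, formatted mapping file string) pair, so it can be written
# straight to per-value mapping files. The file names and the 'Treatment'
# field are hypothetical.
mapping_lines = list(open('mapping.txt', 'U'))
for value, mapping_str in split_mapping_file_on_field(mapping_lines,
                                                      'Treatment'):
    out_f = open('mapping_%s.txt' % value, 'w')
    out_f.write(mapping_str)
    out_f.close()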
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    output_f = open(opts.output_distance_matrix, 'w')
    if opts.otu_table_fp:
        otu_table = load_table(opts.otu_table_fp)
        samples_to_keep = otu_table.ids()
        # samples_to_keep = \
        #     sample_ids_from_otu_table(open(opts.otu_table_fp,'U'))
    elif opts.sample_id_fp:
        samples_to_keep = \
            get_seqs_to_keep_lookup_from_seq_id_file(
                open(opts.sample_id_fp, 'U'))
    elif opts.mapping_fp and opts.valid_states:
        try:
            samples_to_keep = sample_ids_from_metadata_description(
                open(opts.mapping_fp, 'U'), opts.valid_states)
        except ValueError as e:
            option_parser.error(e.message)
    else:
        option_parser.error('must pass either --sample_id_fp, -t, or -m and '
                            '-s')
    # note that negate gets a little weird here. The function we're calling
    # removes the specified samples from the distance matrix, but the other
    # QIIME filter scripts keep these samples specified. So, the interface of
    # this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    d = filter_samples_from_distance_matrix(
        parse_distmat(open(opts.input_distance_matrix, 'U')),
        samples_to_keep,
        negate=not opts.negate)
    output_f.write(d)
    output_f.close()
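# Hedged sketch of the negate semantics described in the comment above.
# filter_samples_from_distance_matrix removes the IDs it is given, so to
# *keep* a set of samples the script passes negate=True (remove everything
# that is not in samples_to_keep). The file names and sample IDs below are
# hypothetical; the imports assume the QIIME 1.x module layout.
from qiime.parse import parse_distmat
from qiime.filter import filter_samples_from_distance_matrix

samples_to_keep = ['PC.354', 'PC.355']  # hypothetical sample IDs
dm_string = filter_samples_from_distance_matrix(
    parse_distmat(open('unweighted_unifrac_dm.txt', 'U')),
    samples_to_keep,
    negate=True)
open('filtered_dm.txt', 'w').write(dm_string)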
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some "
                            "combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate "
                            "output mapping file.")

    otu_table = load_table(opts.input_fp)

    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        sample_id_f_ids = set([l.strip().split()[0]
                               for l in open(sample_id_fp, 'U')
                               if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(mapping_data, mapping_headers,
                                filtered_otu_table.ids())
        open(output_mapping_fp, 'w').write(
            format_mapping_file(mapping_headers, mapping_data))
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some "
                            "combination of those).")

    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")

    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate "
                            "output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        o = open(sample_id_fp, 'U')
        sample_id_f_ids = set([l.strip().split()[0]
                               for l in o if not l.startswith('#')])
        o.close()
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table,
        sample_ids_to_keep,
        min_count,
        max_count,
        negate_ids_to_keep=negate_sample_id_fp)

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, 'w').write(
            format_mapping_file(mapping_headers, mapping_data))
def get_seqs_to_keep_lookup_from_mapping_file(fasta_f, mapping_f,
                                              valid_states):
    sample_ids = {}.fromkeys(
        sample_ids_from_metadata_description(mapping_f, valid_states))
    seqs_to_keep = []
    for seq_id, seq in parse_fasta(fasta_f):
        if seq_id.split("_")[0] in sample_ids:
            seqs_to_keep.append(seq_id)
        else:
            continue
    return {}.fromkeys(seqs_to_keep)
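# Hedged usage sketch for the fasta-aware lookup above: sequence IDs in
# post-split-libraries QIIME fastas look like 'SampleID_12345', so the lookup
# keeps reads whose sample prefix matches the metadata description. The file
# names and the 'Treatment:Control' description are hypothetical.
seqs_to_keep = get_seqs_to_keep_lookup_from_mapping_file(
    open('seqs.fna', 'U'), open('mapping.txt', 'U'), 'Treatment:Control')
print '%d sequences will be kept' % len(seqs_to_keep)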
def main():
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some "
                            "combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate "
                            "output mapping file.")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))
    output_f = open(opts.output_fp, 'w')

    if (mapping_fp and valid_states):
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds

    if (sample_id_fp is not None):
        sample_id_f_ids = set([l.strip().split()[0]
                               for l in open(sample_id_fp, 'U')
                               if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(mapping_data, mapping_headers,
                                filtered_otu_table.SampleIds)
        open(output_mapping_fp, 'w').write(
            format_mapping_file(mapping_headers, mapping_data))
def get_seqs_to_keep_lookup_from_mapping_file(fasta_f, mapping_f,
                                              valid_states):
    sample_ids = {}.fromkeys(
        sample_ids_from_metadata_description(mapping_f, valid_states))
    seqs_to_keep = []
    for seq_id, seq in MinimalFastaParser(fasta_f):
        if seq_id.split('_')[0] in sample_ids:
            seqs_to_keep.append(seq_id)
        else:
            continue
    return {}.fromkeys(seqs_to_keep)
def split_otu_table_on_sample_metadata(otu_table_f, mapping_f, mapping_field):
    """ split otu table into sub otu tables where each represents samples
    corresponding to only a certain value in mapping_field
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)
    otu_table = parse_biom_table(otu_table_f)

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))
        try:
            filtered_otu_table = otu_table.filterSamples(
                lambda values, id_, metadata: id_ in sample_ids_to_keep)
        except TableException:
            # all samples are filtered out, so no otu table to write
            continue
        yield v_fp_str, format_biom_table(filtered_otu_table)
def main():
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    output_f = open(opts.output_distance_matrix, 'w')
    if opts.otu_table_fp:
        otu_table = parse_biom_table(open(opts.otu_table_fp, 'U'))
        samples_to_keep = otu_table.SampleIds
        # samples_to_keep = \
        #     sample_ids_from_otu_table(open(opts.otu_table_fp,'U'))
    elif opts.sample_id_fp:
        samples_to_keep = \
            get_seqs_to_keep_lookup_from_seq_id_file(
                open(opts.sample_id_fp, 'U'))
    elif opts.mapping_fp and opts.valid_states:
        try:
            samples_to_keep = sample_ids_from_metadata_description(
                open(opts.mapping_fp, 'U'), opts.valid_states)
        except ValueError, e:
            option_parser.error(e.message)
def split_otu_table_on_sample_metadata(otu_table, mapping_f, mapping_field):
    """ split otu table into sub otu tables where each represents samples
    corresponding to only a certain value in mapping_field
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        try:
            # filtering cannot be inplace, otherwise we lose data; filter on
            # the sample axis since we are keeping sample IDs (the original
            # passed axis='observation', which would filter OTUs, not samples)
            filtered_otu_table = otu_table.filter(
                lambda values, id_, metadata: id_ in sample_ids_to_keep,
                axis='sample', inplace=False)
        except TableException:
            # all samples are filtered out, so no otu table to write
            continue
        yield v_fp_str, filtered_otu_table
def format_vectors_to_js(mapping_file_data, mapping_file_headers, coords_data,
                         coords_headers, connected_by_header,
                         sorted_by_header=None):
    """Write a string representing the vectors in a PCoA plot as javascript

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file
    coords_data: coordinates of the PCoA plot in a numpy 2-D array or a list
    of numpy 2-D arrays for jackknifed input
    coords_headers: headers of the coords in the PCoA plot or a list of lists
    with the headers for jackknifed input
    connected_by_header: header of the mapping file that represents how the
    lines will be connected
    sorted_by_header: numeric-only header name to sort the samples in the
    vectors

    Output:
    js_vectors_string: string that represents the vectors in the shape of a
    javascript object

    Notes:
    If using jackknifed input, the coordinates and headers that will be used
    are the ones belonging to the master coords i. e. the first element.
    """
    js_vectors_string = []
    js_vectors_string.append('\nvar g_vectorPositions = new Array();\n')

    if connected_by_header != None:
        # check if we are processing jackknifed input, if so just get the
        # master
        if type(coords_data) == list:
            coords_data = coords_data[0]
            coords_headers = coords_headers[0]

        columns_to_keep = ['SampleID', connected_by_header]

        # do not add None if sorted_by_header is None or empty
        if sorted_by_header:
            columns_to_keep.append(sorted_by_header)

        # reduce the amount of data by keeping the required fields only
        mapping_file_data, mapping_file_headers = \
            keep_columns_from_mapping_file(mapping_file_data,
                                           mapping_file_headers,
                                           columns_to_keep)

        # format the mapping file to use this with the filtering function
        mf_string = format_mapping_file(mapping_file_headers,
                                        mapping_file_data)

        index = mapping_file_headers.index(connected_by_header)
        connected_by = list(set([line[index] for line in mapping_file_data]))

        for category in connected_by:
            # convert to StringIO for each iteration; else the object
            # won't be usable after the first iteration & you'll get an error
            sample_ids = sample_ids_from_metadata_description(
                StringIO(mf_string),
                '%s:%s' % (connected_by_header, category))

            # if there is a sorting header, sort the coords using these values
            if sorted_by_header:
                sorting_index = mapping_file_headers.index(sorted_by_header)
                to_sort = [line for line in mapping_file_data
                           if line[0] in sample_ids]

                # get the sorted sample ids from the sorted-reduced mapping
                # file
                sample_ids = zip(*sorted(
                    to_sort, key=lambda x: float(x[sorting_index])))[0]

            # each category value is a new vector
            js_vectors_string.append(
                "g_vectorPositions['%s'] = new Array();\n" % (category))

            for s in sample_ids:
                index = coords_headers.index(s)

                # print the first three elements of each coord for each sample
                js_vectors_string.append(
                    "g_vectorPositions['%s']['%s'] = %s;\n" %
                    (category, s, coords_data[index, :3].tolist()))

    return ''.join(js_vectors_string)
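# Hedged usage sketch for format_vectors_to_js: parse a mapping file and a
# principal coordinates file, then emit the javascript vectors string. The
# file names and the 'HOST_SUBJECT_ID'/'WeeksSinceStart' headers are
# hypothetical; the imports assume the QIIME 1.x parsers used elsewhere in
# these snippets.
from qiime.parse import parse_mapping_file, parse_coords

mapping_data, mapping_headers, _ = parse_mapping_file(
    open('mapping.txt', 'U'))
coords_headers, coords_data, eigen_values, pct_exp = parse_coords(
    open('pcoa.txt', 'U'))
js = format_vectors_to_js(mapping_data, mapping_headers, coords_data,
                          coords_headers, 'HOST_SUBJECT_ID',
                          sorted_by_header='WeeksSinceStart')
open('vectors.js', 'w').write(js)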
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_fp = opts.otu_table_fp
    mothur_counts_fp = opts.mothur_counts_fp
    mapping_fp = opts.mapping_fp
    valid_states = opts.valid_states
    blank_id_fp = opts.blank_id_fp
    contaminant_db_fp = opts.contaminant_db_fp
    contaminant_similarity = opts.contaminant_similarity
    max_correlation = opts.max_correlation
    correlate_header = opts.correlate_header
    input_fasta_fp = opts.input_fasta_fp
    otu_map_fp = opts.otu_map_fp
    output_dir = opts.output_dir
    min_relabund_threshold = opts.min_relabund_threshold
    prescreen_threshold = opts.prescreen_threshold
    removal_stat_blank = opts.removal_stat_blank
    removal_stat_sample = opts.removal_stat_sample
    removal_differential = opts.removal_differential
    reinstatement_stat_sample = opts.reinstatement_stat_sample
    reinstatement_stat_blank = opts.reinstatement_stat_blank
    reinstatement_differential = opts.reinstatement_differential
    reinstatement_sample_number = opts.reinstatement_sample_number
    reinstatement_method = opts.reinstatement_method
    write_output_seq_lists = opts.write_output_seq_lists
    write_filtered_output = opts.write_filtered_output
    drop_lib_threshold = opts.drop_lib_threshold
    write_per_seq_stats = opts.write_per_seq_stats
    write_per_library_stats = opts.write_per_library_stats
    write_per_seq_disposition = opts.write_per_seq_disposition

    # Make unique seq OTU table (biom file)
    # Compute unique seq stats
    # output biom file with unique seq stats

    # Optionally: make candidate contaminant DB
    # remove sequences present at higher abundance in samples
    # cluster blanks
    # remove low-abundance contaminant OTUs

    # Filter by similarity against candidate contaminant DB
    # annotate unique seq OTU table with top hit (OTU#, rep seq, ID%)
    # make list of seqs @ threshold

    # Calculate reinstatement rule for filtered sequences

    # Generate lists of seqs failing:
    # - unique seq rule
    # - hit to contaminant
    # - reinstatement after hit

    # Make sure passed at least one of an OTU biom or mothur counts table file
    input_file_counter = 0

    if mothur_counts_fp:
        input_file_counter += 1
        unique_seq_biom = mothur_counts_to_biom(mothur_counts_fp)
        mothur_output = True
        print "mothur input"

    if otu_table_fp:
        input_file_counter += 1
        unique_seq_biom = load_table(otu_table_fp)
        mothur_output = False
        print "BIOM input"

    if input_file_counter != 1:
        option_parser.error("must provide ONLY ONE of an OTU table biom file "
                            "or mothur counts table")

    # Check to make sure that if blank-based contamination filtering is
    # requested, all necessary options are specified:
    removal_options_counter = 0

    if removal_stat_blank:
        removal_options_counter += 1
    if removal_stat_sample:
        removal_options_counter += 1
    if removal_differential:
        removal_options_counter += 1

    if ((removal_options_counter > 0) and (removal_options_counter < 3)):
        option_parser.error("Must provide all of "
                            "removal_stats_blank, "
                            "removal_stat_sample, and "
                            "removal_differential, or none.")
    elif removal_options_counter == 0:
        blank_stats_removal = False
    elif removal_options_counter == 3:
        blank_stats_removal = True

    # If reference-based filtering requested, make sure all necessary options
    # have been specified:
    if contaminant_db_fp and not input_fasta_fp:
        option_parser.error("If specifying ref-based contaminant ID, must "
                            "also specify path to input sequence fasta")

    # If correlation-based filtering requested, make sure correlate data
    # are specified
    if max_correlation and not correlate_header:
        option_parser.error("If specifying maximum Spearman correlation, "
                            "must also provide map column header for "
                            "correlate data")

    # If sequence reinstatement is requested, make sure all necessary options
    # are specified
    reinstatement_options_counter = 0

    if reinstatement_stat_blank:
        reinstatement_options_counter += 1
    if reinstatement_stat_sample:
        reinstatement_options_counter += 1
    if reinstatement_differential:
        reinstatement_options_counter += 1

    if ((reinstatement_options_counter > 0) and
            (reinstatement_options_counter < 3)):
        option_parser.error("Must provide all of "
                            "reinstatement_stats_blank, "
                            "reinstatement_stat_sample, and "
                            "reinstatement_differential, or none.")

    if ((reinstatement_options_counter == 3 and reinstatement_sample_number)
            and not reinstatement_method):
        option_parser.error("If providing sample number AND abundance "
                            "criteria for sequence reinstatement, must also "
                            "provide a method for combining results.")

    if reinstatement_options_counter == 3 or reinstatement_sample_number:
        reinstatement = True
    else:
        reinstatement = False

    # get blank sample IDs from mapping file or sample ID list
    if mapping_fp and valid_states:
        blank_sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        blanks = True
    elif blank_id_fp is not None:
        blank_id_f = open(blank_id_fp, 'Ur')
        blank_sample_ids = set([line.strip().split()[0]
                                for line in blank_id_f
                                if not line.startswith('#')])
        blank_id_f.close()
        blanks = True
    else:
        blanks = False

    # Initialize output objects
    output_dict = {}
    contaminant_types = []

    contamination_stats_dict = None
    contamination_stats_header = None
    corr_data_dict = None

    # Do blank-based stats calculations; if blanks are not there, check to
    # make sure no blank-dependent methods are requested:
    if blanks:
        if prescreen_threshold:
            low_contam_libraries = prescreen_libraries(unique_seq_biom,
                                                       blank_sample_ids,
                                                       removal_stat_sample,
                                                       removal_stat_blank,
                                                       removal_differential,
                                                       prescreen_threshold)

            contamination_stats_header, contamination_stats_dict = \
                get_contamination_stats(
                    unique_seq_biom,
                    blank_sample_ids,
                    exp_sample_ids=low_contam_libraries)
        else:
            contamination_stats_header, contamination_stats_dict = \
                get_contamination_stats(unique_seq_biom, blank_sample_ids)
    elif (blank_stats_removal or reinstatement or prescreen_threshold):
        option_parser.error("Blank-based filtering requested but no blank "
                            "samples indicated in mapping file or ID file.")
    else:
        contamination_stats_header, contamination_stats_dict = \
            get_contamination_stats(unique_seq_biom)

    seq_ids = unique_seq_biom.ids(axis='observation')

    # Do blank-based contaminant identification
    if min_relabund_threshold:
        output_dict['below_relabund_threshold'] = pick_min_relabund_threshold(
            contamination_stats_dict, contamination_stats_header,
            min_relabund_threshold)

    if blank_stats_removal:
        output_dict['abund_contaminants'] = compare_blank_abundances(
            contamination_stats_dict, contamination_stats_header,
            removal_stat_sample, removal_stat_blank, removal_differential,
            negate=True)
        contaminant_types.append('abund_contaminants')

    # Do reference-based contaminant identification
    if contaminant_db_fp:
        output_dict['ref_contaminants'] = pick_ref_contaminants(
            seq_ids, contaminant_db_fp, input_fasta_fp,
            contaminant_similarity, output_dir)
        contaminant_types.append('ref_contaminants')

    # Do spearman correlation based contaminant identification
    if max_correlation:
        metadata_dict = parse_mapping_file_to_dict(open(mapping_fp, 'U'))[0]

        corr_data_dict = {x: float(metadata_dict[x][correlate_header])
                          for x in metadata_dict}

        output_dict['corr_contaminants'], corr_contaminant_dict = \
            pick_corr_contaminants(unique_seq_biom, corr_data_dict,
                                   max_correlation)
        contaminant_types.append('corr_contaminants')
    else:
        corr_contaminant_dict = None

    # Putative contaminants are those that have been identified by any method
    output_dict['putative_contaminants'] = set.union(
        *map(set, [output_dict[x] for x in contaminant_types]))

    # If considering low abundance sequences, remove those from consideration
    # as potential contaminants
    if 'below_relabund_threshold' in output_dict:
        output_dict['putative_contaminants'] = \
            output_dict['putative_contaminants'] - \
            set(output_dict['below_relabund_threshold'])

    # Pick abundance-criterion seqs to reinstate
    if (reinstatement_stat_blank and reinstatement_stat_sample and
            reinstatement_differential):
        output_dict['abund_reinstated_seqs'] = reinstate_abund_seqs(
            output_dict['putative_contaminants'],
            contamination_stats_dict, contamination_stats_header,
            reinstatement_stat_sample, reinstatement_stat_blank,
            reinstatement_differential)
        output_dict['reinstated_seqs'] = output_dict['abund_reinstated_seqs']

    # Pick incidence-criterion seqs to reinstate
    if reinstatement_sample_number:
        output_dict['incidence_reinstated_seqs'] = reinstate_incidence_seqs(
            output_dict['putative_contaminants'],
            unique_seq_biom, blank_sample_ids, reinstatement_sample_number)
        output_dict['reinstated_seqs'] = \
            output_dict['incidence_reinstated_seqs']

    # combine incidence and abundance reinstatements
    if reinstatement_sample_number and reinstatement_stat_blank:
        if reinstatement_method == "union":
            output_dict['reinstated_seqs'] = \
                output_dict['abund_reinstated_seqs'] | \
                output_dict['incidence_reinstated_seqs']
        elif reinstatement_method == "intersection":
            output_dict['reinstated_seqs'] = \
                output_dict['abund_reinstated_seqs'] & \
                output_dict['incidence_reinstated_seqs']

    # make sets for sequences _never_ identified as contaminants:
    output_dict['ever_good_seqs'] = \
        set(seq_ids) - output_dict['putative_contaminants']

    # If considering low abundance sequences, remove those from consideration
    # as potential contaminants
    if 'below_relabund_threshold' in output_dict:
        output_dict['ever_good_seqs'] = \
            output_dict['ever_good_seqs'] - \
            set(output_dict['below_relabund_threshold'])

    # Make set of good seqs for final filtering
    final_good_seqs = output_dict['ever_good_seqs']

    # ...and those either never ID'd as contaminants or reinstated:
    if reinstatement:
        output_dict['all_good_seqs'] = set(
            output_dict['ever_good_seqs'] | output_dict['reinstated_seqs'])
        final_good_seqs = output_dict['all_good_seqs']
        # ...and those who remain contaminants after reinstatement:
        output_dict['never_good_seqs'] = set(
            output_dict['putative_contaminants'] -
            output_dict['reinstated_seqs'])

    # print filtered OTU maps if given a QIIME OTU map input
    if otu_map_fp:
        print_filtered_output('otu_map', otu_map_fp, output_dir, output_dict)

    # print filtered Mothur counts tables if given a Mothur counts table input
    if mothur_output:
        print_filtered_output('mothur_counts', mothur_counts_fp, output_dir,
                              output_dict)

    # print filtered seq header files if requested
    if write_output_seq_lists:
        print_filtered_output('seq_headers', seq_ids, output_dir, output_dict)

    # filter final biom file to just good seqs
    filtered_biom = unique_seq_biom.filter(
        lambda val, id_, metadata: id_ in final_good_seqs,
        axis='observation', invert=False, inplace=False)

    # drop heavily contaminated libraries if requested
    if drop_lib_threshold:
        dropped_libs = unique_seq_biom.norm(inplace=False).filter(
            lambda val, id_, metadata: id_ in final_good_seqs,
            axis='observation', invert=False, inplace=False).filter(
            lambda val, id_, metadata: sum(val) >= drop_lib_threshold,
            axis='sample', invert=True, inplace=False).ids(axis='sample')
        filtered_biom.filter(
            lambda val, id_, metadata: id_ in dropped_libs,
            axis='sample', invert=True, inplace=True)
    else:
        dropped_libs = []

    # print filtered biom/mothur_output if library filtering is requested
    if write_filtered_output:
        if mothur_output:
            output_counts_string = biom_to_mothur_counts(filtered_biom)
            with open(os.path.join(output_dir,
                                   'decontaminated_table.counts'),
                      "w") as output_counts_file:
                output_counts_file.write(output_counts_string)
        else:
            output_biom_string = filtered_biom.to_json(
                'Filtered by decontaminate.py')
            with open(os.path.join(output_dir,
                                   'decontaminated_otu_table.biom'),
                      "w") as output_biom_file:
                output_biom_file.write(output_biom_string)

    # print per-library stats if requested
    if write_per_library_stats:
        per_library_stats, per_library_stats_header = \
            calc_per_library_decontam_stats(unique_seq_biom, output_dict)
        library_stats_string = print_per_library_stats(
            per_library_stats, per_library_stats_header,
            unique_seq_biom.ids(axis='sample'), dropped_libs=dropped_libs)

        with open(os.path.join(output_dir,
                               'decontamination_per_library_stats.txt'),
                  "w") as output_stats_file:
            output_stats_file.write(library_stats_string)

    # print otu by disposition file if requested
    if write_per_seq_disposition:
        per_seq_disposition = print_otu_disposition(seq_ids, output_dict)

        with open(os.path.join(output_dir,
                               'decontamination_per_otu_disposition.txt'),
                  "w") as output_stats_file:
            output_stats_file.write(per_seq_disposition)

    # print log file / per-seq info
    if write_per_seq_stats:
        print_results_file(seq_ids, output_dict,
                           os.path.join(output_dir,
                                        'contamination_summary.txt'),
                           contamination_stats_header,
                           contamination_stats_dict,
                           corr_contaminant_dict)
def main():
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)
    mapping_fp = opts.mapping_fp
    state_values = opts.state_values.split(',')
    metadata_categories = opts.metadata_categories
    state_category = opts.state_category
    individual_id_category = opts.individual_id_category
    output_dir = opts.output_dir
    biom_table_fp = opts.biom_table_fp
    observation_ids = opts.observation_ids
    if not observation_ids is None:
        observation_ids = observation_ids.split(',')
    valid_states = opts.valid_states
    ymin = opts.ymin
    ymax = opts.ymax
    line_color = opts.line_color

    # validate the input - currently only supports either biom data
    # or mapping file data. if useful in the future it shouldn't be too
    # hard to allow the user to provide both.
    if metadata_categories and biom_table_fp:
        option_parser.error(
            "Can only pass --metadata_categories or --biom_table_fp, "
            "not both.")
    elif not (metadata_categories or biom_table_fp):
        option_parser.error(
            "Must pass either --metadata_categories or --biom_table_fp.")
    else:
        pass

    # parse the mapping file to a dict
    mapping_data = parse_mapping_file_to_dict(open(mapping_fp, 'U'))[0]

    # currently only support for pre/post (ie, two-state) tests
    if len(state_values) != 2:
        option_parser.error(
            "Exactly two state_values must be passed separated by a comma.")

    # filter mapping_data, if requested
    if valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        for sid in mapping_data.keys():
            if sid not in sample_ids_to_keep:
                del mapping_data[sid]

    if biom_table_fp:
        biom_table = parse_biom_table(open(biom_table_fp, 'U'))
        analysis_categories = observation_ids or biom_table.ObservationIds
        personal_ids_to_state_values = \
            extract_per_individual_state_metadata_from_sample_metadata_and_biom(
                mapping_data,
                biom_table,
                state_category,
                state_values,
                individual_id_category,
                observation_ids=analysis_categories)
    else:
        analysis_categories = metadata_categories.split(',')
        personal_ids_to_state_values = \
            extract_per_individual_state_metadata_from_sample_metadata(
                mapping_data,
                state_category,
                state_values,
                individual_id_category,
                analysis_categories)

    paired_difference_analyses(personal_ids_to_state_values,
                               analysis_categories,
                               state_values,
                               output_dir,
                               line_color=line_color,
                               ymin=ymin,
                               ymax=ymax)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    category = opts.category
    mapping_fp = opts.mapping_fp

    colors_used = []

    if (category and mapping_fp is None) or (category is None and mapping_fp):
        option_parser.error('If coloring by a metadata category, both the '
                            'category and the mapping file must be supplied.')
    elif mapping_fp and category:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        if category not in mapping_headers:
            option_parser.error("The category supplied must exist in the "
                                "metadata mapping file, '%s' does not exist."
                                % category)
        index = mapping_headers.index(category)
        categories = list(set([line[index] for line in mapping_data]))
    list_of_plots = []

    if opts.binning is None:
        ranges = []
    else:
        # simple ranges format validation
        if opts.binning.count('[') != opts.binning.count(']') or \
                opts.binning.count('[') != opts.binning.count(','):
            raise ValueError(
                "The binning input has an error: '%s'; \nthe format should "
                "be [increment1,top_limit1][increment2,top_limit2]"
                % opts.binning)
        # splitting in ranges
        rgn_txt = opts.binning.split('][')
        # removing left [ and right ]
        rgn_txt[0] = rgn_txt[0][1:]
        rgn_txt[-1] = rgn_txt[-1][:-1]
        # converting into int
        ranges = []
        max = 0

        for i, r in enumerate(rgn_txt):
            try:
                values = map(float, r.split(','))
            except ValueError:
                raise ValueError(
                    "Not a valid format for binning %s" % opts.binning)
            if len(values) != 2:
                raise ValueError(
                    "All ranges must have only 2 values: [%s]" % r)
            elif i + 1 != len(rgn_txt):
                if values[0] > values[1]:
                    raise ValueError(
                        "The bin value can't be greater than the max value: "
                        "[%s]" % r)
                elif values < 0:
                    raise ValueError(
                        "This value can not be negative: [%s]" % r)
                elif max > values[1]:
                    raise ValueError(
                        "This value can not be smaller than the previous "
                        "one: [%s]" % r)
                else:
                    max = values[1]

            ranges.append(values)

    x_samples, x_distmtx = parse_distmat(open(opts.input_path_x, 'U'))
    y_samples, y_distmtx = parse_distmat(open(opts.input_path_y, 'U'))

    if opts.ignore_missing_samples:
        ignoring_from_x = list(set(x_samples) - set(y_samples))
        ignoring_from_y = list(set(y_samples) - set(x_samples))

        if opts.verbose:
            print '\nFrom %s we are ignoring: %s\n' % (opts.input_path_x,
                                                       ignoring_from_x)
            print '\nFrom %s we are ignoring: %s\n' % (opts.input_path_y,
                                                       ignoring_from_y)
            print '\nOnly using: %s\n' % (
                list(set(x_samples) & set(y_samples)))

        x_file = StringIO(
            filter_samples_from_distance_matrix((x_samples, x_distmtx),
                                                ignoring_from_x))
        x_samples, x_distmtx = parse_distmat(x_file)

        y_file = StringIO(
            filter_samples_from_distance_matrix((y_samples, y_distmtx),
                                                ignoring_from_y))
        y_samples, y_distmtx = parse_distmat(y_file)
    else:
        if x_distmtx.shape != y_distmtx.shape:
            raise ValueError('The distance matrices have different sizes. '
                             'You can cancel this error by passing '
                             '--ignore_missing_samples')

    figure()
    if category is None:
        x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
            (x_samples, x_distmtx), (y_samples, y_distmtx), opts.model,
            ranges)

        plot(x_val, y_val, color=opts.dot_color, marker=opts.dot_marker,
             linestyle="None", alpha=opts.dot_alpha)
        plot(x_fit, y_fit, linewidth=2.0, color=opts.line_color,
             alpha=opts.line_alpha)
    else:
        # not all the categories that are going to be enumerated are found in
        # the distance matrices, i.e. the mapping file is a superset that can
        # contain more samples than the distance matrices
        used_categories = deepcopy(categories)

        for index, single_category in enumerate(categories):
            good_sample_ids = sample_ids_from_metadata_description(
                open(mapping_fp), '%s:%s' % (category, single_category))

            try:
                _y_samples, _y_distmtx = parse_distmat(StringIO(
                    filter_samples_from_distance_matrix(
                        (y_samples, y_distmtx), good_sample_ids,
                        negate=True)))
                _x_samples, _x_distmtx = parse_distmat(StringIO(
                    filter_samples_from_distance_matrix(
                        (x_samples, x_distmtx), good_sample_ids,
                        negate=True)))
            except ValueError:
                # no samples found for this category
                used_categories.remove(single_category)
                continue

            x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
                (_x_samples, _x_distmtx), (_y_samples, _y_distmtx),
                opts.model, ranges)

            # retrieve one of the "QIIME" colors and add it to the list of
            # used colors for the creation of the legends in the plot
            color_only = get_qiime_hex_string_color(index)
            colors_used.append(color_only)

            plot(x_val, y_val, color=color_only, marker=opts.dot_marker,
                 linestyle="None", alpha=opts.dot_alpha)
            plot(x_fit, y_fit, linewidth=2.0, color=color_only,
                 alpha=opts.line_alpha, label=single_category)

    # set plot limits if requested
    x_lb, x_ub = xlim()
    y_lb, y_ub = ylim()
    if opts.x_min is not None:
        x_lb = opts.x_min
    if opts.x_max is not None:
        x_ub = opts.x_max
    if opts.y_min is not None:
        y_lb = opts.y_min
    if opts.y_max is not None:
        y_ub = opts.y_max
    xlim(x_lb, x_ub)
    ylim(y_lb, y_ub)

    x_label = opts.x_label
    y_label = opts.y_label
    fig_title = '%s (%s)' % (opts.fig_title, opts.model)

    xlabel(x_label)
    ylabel(y_label)
    if opts.print_model:
        title(fig_title + ' ' + func_text)
    else:
        title(fig_title)

    savefig(opts.output_path)

    # print the legends after the figure is exported to avoid conflicts
    if category:
        # if there's a desired format, use that, else default it to png
        _, extension = splitext(opts.output_path)

        # remove the dot, else, make_legend will add it to the filename
        extension = extension.replace('.', '')

        if extension == '':
            extension = 'png'
        make_legend(used_categories, colors_used, 0, 0, 'black', 'white',
                    opts.output_path, extension, 80)
def get_seqs_to_keep_lookup_from_mapping_file(mapping_f, valid_states):
    sample_ids = set(
        sample_ids_from_metadata_description(mapping_f, valid_states))
    return sample_ids
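# Hedged usage sketch for get_seqs_to_keep_lookup_from_mapping_file as
# defined above: given an open mapping file and a valid_states string, the
# returned set supports fast membership tests. The file name and the
# 'Treatment:Control' state description are hypothetical.
ids_to_keep = get_seqs_to_keep_lookup_from_mapping_file(
    open('mapping.txt', 'U'), 'Treatment:Control')
print '%d samples matched' % len(ids_to_keep)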
def main():
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)
    input_fp = opts.input_fp
    output_dir = opts.output_dir

    if opts.num_fraction_for_core_steps < 2:
        option_parser.error(
            "Must perform at least two steps. Increase "
            "--num_fraction_for_core_steps.")
    fractions_for_core = np.linspace(opts.min_fraction_for_core,
                                     opts.max_fraction_for_core,
                                     opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp

    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        if len(sample_ids) < 1:
            option_parser.error(
                "--valid_states pattern didn't match any entries in mapping "
                "file: \"%s\"" % valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the
        # samples to work with
        sample_ids = None

    input_table = parse_biom_table(open(input_fp, 'U'))

    otu_counts = []
    summary_figure_fp = join(output_dir, 'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)

        # prep output files
        output_fp = join(output_dir,
                         'core_otus_%s.txt' % fraction_for_core_str)
        output_table_fp = join(output_dir,
                               'core_table_%s.biom' % fraction_for_core_str)
        output_f = open(output_fp, 'w')

        try:
            core_table = filter_table_to_core(input_table,
                                              sample_ids,
                                              fraction_for_core)
        except TableException:
            output_f.write("# No OTUs present in %s %% of samples." %
                           fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue

        # write some header information to file
        if sample_ids is None:
            output_f.write("# Core OTUs across %s %% of samples.\n" %
                           fraction_for_core_str)
        else:
            output_f.write(
                "# Core OTUs across %s %% of samples matching the sample "
                "metadata pattern \"%s\":\n# %s\n" %
                (fraction_for_core_str, valid_states, ' '.join(sample_ids)))

        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iter(axis='observation'):
            output_f.write('%s\t%s\n' % (id_, md[otu_md]))
            otu_count += 1
        output_f.close()

        # write the core biom table
        write_biom_table(core_table, output_table_fp)

        # append the otu count to the list of counts
        otu_counts.append(otu_count)

    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core), max(fractions_for_core))
    ylim(0, max(otu_counts) + 1)
    xlabel("Fraction of samples that OTU must be observed in to be "
           "considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    category = opts.category
    mapping_fp = opts.mapping_fp

    colors_used = []

    if (category and mapping_fp == None) or (category == None and mapping_fp):
        option_parser.error('If coloring by a metadata category, both the '
                            'category and the mapping file must be supplied.')
    elif mapping_fp and category:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        if category not in mapping_headers:
            option_parser.error("The category supplied must exist in the "
                                "metadata mapping file, '%s' does not exist."
                                % category)
        index = mapping_headers.index(category)
        categories = list(set([line[index] for line in mapping_data]))
    list_of_plots = []

    if opts.binning is None:
        ranges = []
    else:
        # simple ranges format validation
        if opts.binning.count('[') != opts.binning.count(']') or \
                opts.binning.count('[') != opts.binning.count(','):
            raise ValueError, (
                "The binning input has an error: '%s'; \nthe format should "
                "be [increment1,top_limit1][increment2,top_limit2]"
                % opts.binning)
        # splitting in ranges
        rgn_txt = opts.binning.split('][')
        # removing left [ and right ]
        rgn_txt[0] = rgn_txt[0][1:]
        rgn_txt[-1] = rgn_txt[-1][:-1]
        # converting into int
        ranges = []
        max = 0

        for i, r in enumerate(rgn_txt):
            try:
                values = map(float, r.split(','))
            except ValueError:
                raise ValueError, \
                    "Not a valid format for binning %s" % opts.binning
            if len(values) != 2:
                raise ValueError, \
                    "All ranges must have only 2 values: [%s]" % r
            elif i + 1 != len(rgn_txt):
                if values[0] > values[1]:
                    raise ValueError, \
                        "The bin value can't be greater than the max " \
                        "value: [%s]" % r
                elif values < 0:
                    raise ValueError, \
                        "This value can not be negative: [%s]" % r
                elif max > values[1]:
                    raise ValueError, \
                        "This value can not be smaller than the previous " \
                        "one: [%s]" % r
                else:
                    max = values[1]

            ranges.append(values)

    x_samples, x_distmtx = parse_distmat(open(opts.input_path_x, 'U'))
    y_samples, y_distmtx = parse_distmat(open(opts.input_path_y, 'U'))

    if opts.ignore_missing_samples:
        ignoring_from_x = list(set(x_samples) - set(y_samples))
        ignoring_from_y = list(set(y_samples) - set(x_samples))

        if opts.verbose:
            print '\nFrom %s we are ignoring: %s\n' % (opts.input_path_x,
                                                       ignoring_from_x)
            print '\nFrom %s we are ignoring: %s\n' % (opts.input_path_y,
                                                       ignoring_from_y)
            print '\nOnly using: %s\n' % (
                list(set(x_samples) & set(y_samples)))

        x_file = StringIO(
            filter_samples_from_distance_matrix((x_samples, x_distmtx),
                                                ignoring_from_x))
        x_samples, x_distmtx = parse_distmat(x_file)

        y_file = StringIO(
            filter_samples_from_distance_matrix((y_samples, y_distmtx),
                                                ignoring_from_y))
        y_samples, y_distmtx = parse_distmat(y_file)
    else:
        if x_distmtx.shape != y_distmtx.shape:
            raise ValueError, ('The distance matrices have different sizes. '
                               'You can cancel this error by passing '
                               '--ignore_missing_samples')

    figure()
    if category == None:
        x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
            (x_samples, x_distmtx), (y_samples, y_distmtx), opts.model,
            ranges)

        plot(x_val, y_val, color=opts.dot_color, marker=opts.dot_marker,
             linestyle="None", alpha=opts.dot_alpha)
        plot(x_fit, y_fit, linewidth=2.0, color=opts.line_color,
             alpha=opts.line_alpha)
    else:
        for index, single_category in enumerate(categories):
            good_sample_ids = sample_ids_from_metadata_description(
                open(mapping_fp), '%s:%s' % (category, single_category))

            _y_samples, _y_distmtx = parse_distmat(StringIO(
                filter_samples_from_distance_matrix(
                    (y_samples, y_distmtx), good_sample_ids, negate=True)))
            _x_samples, _x_distmtx = parse_distmat(StringIO(
                filter_samples_from_distance_matrix(
                    (x_samples, x_distmtx), good_sample_ids, negate=True)))

            x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
                (_x_samples, _x_distmtx), (_y_samples, _y_distmtx),
                opts.model, ranges)

            # retrieve one of the "QIIME" colors and add it to the list of
            # used colors for the creation of the legends in the plot
            color_only = get_qiime_hex_string_color(index)
            colors_used.append(color_only)

            plot(x_val, y_val, color=color_only, marker=opts.dot_marker,
                 linestyle="None", alpha=opts.dot_alpha)
            plot(x_fit, y_fit, linewidth=2.0, color=color_only,
                 alpha=opts.line_alpha, label=single_category)

    if opts.x_min != None and opts.x_max != None:
        xlim([opts.x_min, opts.x_max])
    if opts.y_min != None and opts.y_max != None:
        ylim([opts.y_min, opts.y_max])

    x_label = opts.x_label
    y_label = opts.y_label
    fig_title = '%s (%s)' % (opts.fig_title, opts.model)

    xlabel(x_label)
    ylabel(y_label)
    if opts.print_model:
        title(fig_title + ' ' + func_text)
    else:
        title(fig_title)

    savefig(opts.output_path)

    # print the legends after the figure is exported to avoid conflicts
    if category:
        # if there's a desired format, use that, else default it to png
        _, extension = splitext(opts.output_path)

        # remove the dot, else, make_legend will add it to the filename
        extension = extension.replace('.', '')

        if extension == '':
            extension = 'png'
        make_legend(categories, colors_used, 0, 0, 'black', 'white',
                    opts.output_path, extension, 80)