def main():
    """Validate the command-line options, open the inputs and call extract_barcodes.

    Option errors are reported through option_parser.error.  The fastq and
    mapping inputs are opened with qiime_open and the output directory is
    created before extract_barcodes is invoked.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # --attempt_read_reorientation is only accepted together with a mapping
    # file.
    if opts.attempt_read_reorientation and not opts.mapping_fp:
        option_parser.error(
            "To use --attempt_read_reorientation, one must "
            "supply a mapping file that contains both LinkerPrimerSequence "
            "and ReversePrimer columns.")

    if opts.input_type == "barcode_paired_end" and not opts.fastq2:
        option_parser.error(
            "To use input_type of barcode_paired_end, "
            "a second fastq file must be specified with --fastq2")

    # Header matching is forced off whenever no second fastq file was given;
    # otherwise the user's choice is honoured.
    disable_header_match = (
        opts.disable_header_match if opts.fastq2 else True)

    fastq1 = qiime_open(opts.fastq1)
    fastq2 = qiime_open(opts.fastq2) if opts.fastq2 else None
    create_dir(opts.output_dir)
    map_fp = qiime_open(opts.mapping_fp) if opts.mapping_fp else None

    extract_barcodes(
        fastq1,
        fastq2,
        opts.output_dir,
        opts.input_type,
        opts.bc1_len,
        opts.bc2_len,
        opts.rev_comp_bc1,
        opts.rev_comp_bc2,
        opts.char_delineator,
        opts.switch_bc_order,
        map_fp,
        opts.attempt_read_reorientation,
        disable_header_match)
def main():
    """Print pairwise correlated-variability results between body sites.

    For unweighted and then weighted UniFrac, the per-body-site distance
    matrix is loaded from a hard-coded path, grouped by 'PersonalID' with
    get_grouped_distances, and every pair of body sites is passed to
    correlated_variability; one line is printed per pair.

    NOTE(review): `mapping_fp` is not defined in this function and is
    presumably a module-level name -- confirm.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    md, mh, _ = parse_mapping_file(open(mapping_fp))
    body_sites = ['Gut', 'Tongue', 'Palm', 'Forehead']

    # (section title, distance matrix path template) in output order.
    metrics = [
        ("Unweighted UniFrac",
         "/Users/caporaso/analysis/student-microbiome-project/beta-diversity/unweighted_unifrac_dm.%s_ts_only.txt.gz"),
        ("Weighted UniFrac",
         "/Users/caporaso/analysis/student-microbiome-project/beta-diversity/weighted_unifrac_dm.%s_ts_only.txt.gz"),
    ]

    for section_number, (title, dm_template) in enumerate(metrics):
        if section_number > 0:
            # separator between the two metric sections
            print("**")
        print(title)

        intraindividual_distances = []
        for site in body_sites:
            dm_fp = dm_template % site.lower()
            h, d = parse_distmat(qiime_open(dm_fp))
            intraindividual_distances.append(
                get_grouped_distances(h, d, mh, md, 'PersonalID'))

        # Every unordered pair of body sites, later site listed first.
        for i, later_site in enumerate(body_sites):
            for j in range(i):
                r = correlated_variability(intraindividual_distances[i],
                                           intraindividual_distances[j])
                print("%s/%s (n=%d): rho:%1.3f, p=%f" %
                      (later_site, body_sites[j], r[0], r[1][0], r[1][3]))
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.

    -joined_fp : file path to paired-end assembled fastq file
    -index_fp : file path to index / barcode reads fastq file

    Iterates through the joined reads file and the index file in step.
    Only index reads whose header equals the header of a joined read are
    written, to '<joined_fp without extension>_barcodes.fastq'.

    Returns the path of the file that was written.

    Raises StopIteration (with an explanatory message) if the index file
    is exhausted before a joined read has been matched.

    WARNING: Assumes reads are in the same order in both files,
             except for cases in which the corresponding
             read in the joined_fp file is missing (i.e. pairs
             failed to assemble).
    """
    from contextlib import closing

    # base new index file name on joined paired-end file name:
    j_path, _ = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'

    # qiime_open handles normal / gzipped data; closing() guarantees the
    # handles are released even when the files turn out to be out of sync.
    with closing(qiime_open(joined_fp)) as jh:
        with closing(qiime_open(index_fp)) as ih:
            with open(filtered_bc_outfile_path, 'w') as fbc_fh:
                index_fastq_iter = parse_fastq(ih, strict=False)

                for joined_label, _, _ in parse_fastq(jh, strict=False):
                    # Advance through the index reads until the header of
                    # the current joined read is found.  Every read of the
                    # index iterator is guarded so that running out of
                    # index reads always yields the descriptive error.
                    while True:
                        try:
                            index_label, index_seq, index_qual = \
                                next(index_fastq_iter)
                        except StopIteration:
                            raise StopIteration(
                                "\n\nReached end of index-reads file"
                                " before iterating through joined paired-end-reads file!"
                                " Except for missing paired-end reads that did not survive"
                                " assembly, your index and paired-end reads files must be in"
                                " the same order! Also, check that the index-reads and"
                                " paired-end reads have identical headers. The last joined"
                                " paired-end ID processed was:\n\'%s\'\n" % (joined_label))
                        if index_label == joined_label:
                            break

                    fbc_fh.write('@%s\n%s\n+\n%s\n'
                                 % (index_label, index_seq, index_qual))

    return filtered_bc_outfile_path
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.

    -joined_fp : file path to paired-end assembled fastq file
    -index_fp : file path to index / barcode reads fastq file

    Iterates through the joined reads file and the index file in step.
    Only index reads whose header equals the header of a joined read are
    written, to '<joined_fp without extension>_barcodes.fastq'.

    Returns the path of the file that was written.

    Raises StopIteration (with an explanatory message) if the index file
    is exhausted before a joined read has been matched.

    WARNING: Assumes reads are in the same order in both files,
             except for cases in which the corresponding
             read in the joined_fp file is missing (i.e. pairs
             failed to assemble).
    """
    from contextlib import closing

    # base new index file name on joined paired-end file name:
    j_path, _ = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'

    # qiime_open handles normal / gzipped data; closing() guarantees the
    # handles are released even when the files turn out to be out of sync.
    with closing(qiime_open(joined_fp)) as jh:
        with closing(qiime_open(index_fp)) as ih:
            with open(filtered_bc_outfile_path, 'w') as fbc_fh:
                index_fastq_iter = MinimalFastqParser(ih, strict=False)

                for joined_label, _, _ in MinimalFastqParser(jh, strict=False):
                    # Advance through the index reads until the header of
                    # the current joined read is found.  Every read of the
                    # index iterator is guarded so that running out of
                    # index reads always yields the descriptive error.
                    while True:
                        try:
                            index_label, index_seq, index_qual = \
                                next(index_fastq_iter)
                        except StopIteration:
                            raise StopIteration(
                                "\n\nReached end of index-reads file"
                                " before iterating through joined paired-end-reads file!"
                                " Except for missing paired-end reads that did not survive"
                                " assembly, your index and paired-end reads files must be in"
                                " the same order! Also, check that the index-reads and"
                                " paired-end reads have identical headers. The last joined"
                                " paired-end ID processed was:\n\'%s\'\n" % (joined_label))
                        if index_label == joined_label:
                            break

                    fbc_fh.write('@%s\n%s\n+\n%s\n'
                                 % (index_label, index_seq, index_qual))

    return filtered_bc_outfile_path
def main():
    """Compute per-sample novelty and write it as a tab-separated table.

    The OTU tables and the representative set are opened with qiime_open
    and handed to compute_sample_novelty; the rows it returns are written
    to opts.output_fp beneath a fixed three-column header.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_files = [qiime_open(fp) for fp in opts.otu_table_fps]
    rep_set_file = qiime_open(opts.rep_set_fp)
    sample_novelty_data = compute_sample_novelty(
        otu_table_files, rep_set_file, opts.verbose)

    column_names = ['SampleID', 'Number of novel OTUs',
                    'Percent novel sequences']
    with open(opts.output_fp, 'w') as out_f:
        table_writer = writer(out_f, delimiter='\t', lineterminator='\n')
        table_writer.writerow(column_names)
        table_writer.writerows(sample_novelty_data)
def main():
    """Compute per-sample novelty and write it as a tab-separated table.

    The OTU tables and the representative set are opened with qiime_open
    and handed to compute_sample_novelty; the rows it returns are written
    to opts.output_fp beneath a fixed three-column header.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_files = [qiime_open(fp) for fp in opts.otu_table_fps]
    rep_set_file = qiime_open(opts.rep_set_fp)
    sample_novelty_data = compute_sample_novelty(
        otu_table_files, rep_set_file, opts.verbose)

    column_names = ['SampleID', 'Number of novel OTUs',
                    'Percent novel sequences']
    with open(opts.output_fp, 'w') as out_f:
        table_writer = writer(out_f, delimiter='\t', lineterminator='\n')
        table_writer.writerow(column_names)
        table_writer.writerows(sample_novelty_data)
def extract_reads_from_interleaved(input_fp, forward_id, reverse_id,
                                   output_dir):
    """Split a single interleaved fastq file into forward and reverse files.

    input_fp: file path to the interleaved fastq input
    forward_id: substring that identifies a forward read label
    reverse_id: substring that identifies a reverse read label
    output_dir: directory in which "forward_reads.fastq" and
        "reverse_reads.fastq" are written

    Raises ValueError for a label that contains neither identifier.
    """
    forward_fp = join(output_dir, "forward_reads.fastq")
    reverse_fp = join(output_dir, "reverse_reads.fastq")

    # The with-blocks and the finally clause make sure all three handles are
    # closed even when parsing or formatting a record fails.
    with open(forward_fp, 'w') as ffp:
        with open(reverse_fp, 'w') as rfp:
            input_fh = qiime_open(input_fp)
            try:
                for label, seq, qual in parse_fastq(
                        input_fh, strict=False, enforce_qual_range=False):
                    fastq_string = format_fastq_record(label, seq, qual)
                    # forward_id is tested first, so a label containing
                    # both identifiers is written to the forward file.
                    if forward_id in label:
                        ffp.write(fastq_string)
                    elif reverse_id in label:
                        rfp.write(fastq_string)
                    else:
                        raise ValueError(
                            "One of the input sequences doesn't have either identifier "
                            "or it has both.\nLabel: %s\nForward: %s\n Reverse: %s"
                            % (label, forward_id, reverse_id))
            finally:
                input_fh.close()
def extract_reads_from_interleaved(
        input_fp, forward_id, reverse_id, output_dir):
    """Split a single interleaved fastq file into forward and reverse files.

    input_fp: file path to the interleaved fastq input
    forward_id: substring that identifies a forward read label
    reverse_id: substring that identifies a reverse read label
    output_dir: directory in which "forward_reads.fastq" and
        "reverse_reads.fastq" are written

    Raises ValueError for a label that contains neither identifier.
    """
    forward_fp = join(output_dir, "forward_reads.fastq")
    reverse_fp = join(output_dir, "reverse_reads.fastq")

    # The with-blocks and the finally clause make sure all three handles are
    # closed even when parsing or formatting a record fails.
    with open(forward_fp, 'w') as ffp:
        with open(reverse_fp, 'w') as rfp:
            input_fh = qiime_open(input_fp)
            try:
                for label, seq, qual in parse_fastq(input_fh, strict=False):
                    fastq_string = format_fastq_record(label, seq, qual)
                    # forward_id is tested first, so a label containing
                    # both identifiers is written to the forward file.
                    if forward_id in label:
                        ffp.write(fastq_string)
                    elif reverse_id in label:
                        rfp.write(fastq_string)
                    else:
                        raise ValueError(
                            "One of the input sequences doesn't have either identifier "
                            "or it has both.\nLabel: %s\nForward: %s\n Reverse: %s"
                            % (label, forward_id, reverse_id))
            finally:
                input_fh.close()
def get_biom_tables(otu_table_dir):
    """Parse every biom table found in a directory.

    otu_table_dir: directory containing biom tables, either .biom or
        .biom.gz

    Returns a list of biom table objects, one per matching file, in the
    order glob reports them.
    """
    # the "*biom*" pattern picks up both .biom and .biom.gz
    biom_paths = glob("%s/*biom*" % otu_table_dir)
    # qiime_open will open .biom and .biom.gz files
    return [parse_biom_table(qiime_open(biom_path))
            for biom_path in biom_paths]
def make_flow_txt(sff_fp, output_fp, use_sfftools=False):
    """Makes flowgram file from sff file.

    sff_fp: path to the input SFF file
    output_fp: path of the flowgram text file to write
    use_sfftools: if True, run the external 'sffinfo' program; otherwise
        use format_binary_sff

    Raises IOError("Could not parse SFF ...") if the non-sfftools
    conversion fails.
    """
    if use_sfftools:
        _fail_on_gzipped_sff(sff_fp)
        check_sffinfo()
        with open(output_fp, 'w') as output_fh:
            _check_call(['sffinfo', sff_fp], stdout=output_fh)
    else:
        # Exception (rather than a bare except) keeps KeyboardInterrupt and
        # SystemExit from being reported as a parse failure.
        try:
            sff_fh = qiime_open(sff_fp, 'rb')
            try:
                with open(output_fp, 'w') as output_fh:
                    format_binary_sff(sff_fh, output_fh)
            finally:
                sff_fh.close()
        except Exception:
            raise IOError("Could not parse SFF %s" % sff_fp)
def convert_Ti_to_FLX(sff_fp, output_fp, use_sfftools=False):
    """Converts Titanium SFF to FLX length reads.

    sff_fp: path to the input SFF file
    output_fp: path of the SFF file to write
    use_sfftools: if True, run the external 'sfffile -flx' program;
        otherwise adjust the reads with adjust_sff_cycles(..., 100) and
        write them with write_binary_sff
    """
    if use_sfftools:
        _fail_on_gzipped_sff(sff_fp)
        check_sfffile()
        with open(os.devnull, 'w') as devnull:
            _check_call(
                ['sfffile', '-flx', '-o', output_fp, sff_fp],
                stdout=devnull)
    else:
        sff_fh = qiime_open(sff_fp, 'rb')
        try:
            header, reads = adjust_sff_cycles(
                parse_binary_sff(sff_fh, True), 100)
            # The output is a binary SFF file, so it is opened in binary
            # mode; text mode would translate newline bytes on Windows.
            # The input stays open until writing is done in case `reads`
            # is produced lazily.
            with open(output_fp, 'wb') as output_fh:
                write_binary_sff(output_fh, header, reads)
        finally:
            sff_fh.close()
def convert_Ti_to_FLX(sff_fp, output_fp, use_sfftools=False):
    """Converts Titanium SFF to FLX length reads.

    sff_fp: path to the input SFF file
    output_fp: path of the SFF file to write
    use_sfftools: if True, run the external 'sfffile -flx' program;
        otherwise adjust the reads with adjust_sff_cycles(..., 100) and
        write them with write_binary_sff
    """
    if use_sfftools:
        _fail_on_gzipped_sff(sff_fp)
        check_sfffile()
        with open(os.devnull, 'w') as devnull:
            _check_call(
                ['sfffile', '-flx', '-o', output_fp, sff_fp],
                stdout=devnull)
    else:
        sff_fh = qiime_open(sff_fp, 'rb')
        try:
            header, reads = adjust_sff_cycles(
                parse_binary_sff(sff_fh, True), 100)
            # The output is a binary SFF file, so it is opened in binary
            # mode; text mode would translate newline bytes on Windows.
            # The input stays open until writing is done in case `reads`
            # is produced lazily.
            with open(output_fp, 'wb') as output_fh:
                write_binary_sff(output_fh, header, reads)
        finally:
            sff_fh.close()
def get_biom_tables(otu_table_dir):
    """Parse every biom table found in a directory.

    otu_table_dir: directory containing biom tables, either .biom or
        .biom.gz

    Returns a list of biom table objects, one per matching file, in the
    order glob reports them.
    """
    # the "*biom*" pattern picks up both .biom and .biom.gz
    biom_paths = glob("%s/*biom*" % otu_table_dir)
    # qiime_open will open .biom and .biom.gz files
    return [parse_biom_table(qiime_open(biom_path))
            for biom_path in biom_paths]
def test_adjust_sff_cycles(self):
    """adjust_sff_cycles(data, 2) gives the expected header and single read
    for both the plain and the gzipped SFF fixture."""
    expected_header = {
        'header_length': 48,
        'version': 1,
        'index_length': 0,
        'magic_number': 779314790,
        'number_of_flows_per_read': 8,
        'flowgram_format_code': 1,
        'flow_chars': 'TACGTACG',
        'index_offset': 0,
        'key_sequence': 'TCAG',
        'number_of_reads': 1,
        'key_length': 4,
    }
    expected_read = {
        'name_length': 14,
        'Name': 'FA6P1OK01CGMHQ',
        'flowgram_values':
        [1.04, 0.0, 1.01, 0.0, 0.0, 0.95999999999999996, 0.0, 1.02],
        'clip_adapter_left': 0,
        'read_header_length': 32,
        'Bases': 'TCAG',
        'number_of_bases': 4,
        'flow_index_per_base': (1, 2, 3, 2),
        'clip_qual_left': 4,
        'clip_adapter_right': 0,
        'clip_qual_right': 4,
        'quality_scores': (32, 32, 32, 32),
    }

    plain_data = parse_binary_sff(open(self.sff_fp))
    gzipped_data = parse_binary_sff(qiime_open(self.sff_gz_fp))
    plain_header, plain_reads = adjust_sff_cycles(plain_data, 2)
    gzipped_header, gzipped_reads = adjust_sff_cycles(gzipped_data, 2)

    self.assertEqual(plain_header, expected_header)
    self.assertEqual(gzipped_header, expected_header)

    # materialise the reads so they can be counted and indexed
    plain_reads = list(plain_reads)
    gzipped_reads = list(gzipped_reads)
    self.assertEqual(len(plain_reads), 1)
    self.assertEqual(len(gzipped_reads), 1)
    self.assertEqual(plain_reads[0], expected_read)
    self.assertEqual(gzipped_reads[0], expected_read)
def make_qual(sff_fp, output_fp, use_sfftools=False, no_trim=False):
    """Makes qual file from sff file.

    sff_fp: path to the input SFF file
    output_fp: path of the qual file to write
    use_sfftools: if True, run the external 'sffinfo -q' program;
        otherwise use format_binary_sff_as_fna(..., qual=True)
    no_trim: if True, pass '-notrim' to sffinfo (only consulted when
        use_sfftools is True)

    Raises IOError("Could not parse SFF ...") if the non-sfftools
    conversion fails.
    """
    if use_sfftools:
        _fail_on_gzipped_sff(sff_fp)
        check_sffinfo()
        if no_trim:
            command = ['sffinfo', '-notrim', '-q', sff_fp]
        else:
            command = ['sffinfo', '-q', sff_fp]
        with open(output_fp, 'w') as output_fh:
            _check_call(command, stdout=output_fh)
    else:
        # Exception (rather than a bare except) keeps KeyboardInterrupt and
        # SystemExit from being reported as a parse failure.
        try:
            sff_fh = qiime_open(sff_fp, 'rb')
            try:
                with open(output_fp, 'w') as output_fh:
                    format_binary_sff_as_fna(sff_fh, output_fh, qual=True)
            finally:
                sff_fh.close()
        except Exception:
            raise IOError("Could not parse SFF %s" % sff_fp)
def main():
    """Print, for each observation id, the samples in which it was seen.

    Every biom table matching opts.input_glob is parsed.  For each
    observation the ids of the samples with a count greater than zero are
    collected across all tables, and one tab-separated line
    (observation id followed by sample ids) is printed per observation.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    samples_by_observation = {}
    for table_fp in glob(opts.input_glob):
        table = parse_biom_table(qiime_open(table_fp))
        for counts, obs_id, _ in table.iterObservations():
            # an observation gets an entry even if all of its counts are 0
            seen_in = samples_by_observation.setdefault(obs_id, set())
            seen_in.update(table.SampleIds[idx]
                           for idx, count in enumerate(counts)
                           if count > 0)

    for obs_id, sample_ids in samples_by_observation.items():
        print('%s\t%s' % (obs_id, '\t'.join(sample_ids)))
def main():
    """Print, for each observation id, the samples in which it was seen.

    Every biom table matching opts.input_glob is parsed.  For each
    observation the ids of the samples with a count greater than zero are
    collected across all tables, and one tab-separated line
    (observation id followed by sample ids) is printed per observation.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    samples_by_observation = {}
    for table_fp in glob(opts.input_glob):
        table = parse_biom_table(qiime_open(table_fp))
        for counts, obs_id, _ in table.iterObservations():
            # an observation gets an entry even if all of its counts are 0
            seen_in = samples_by_observation.setdefault(obs_id, set())
            seen_in.update(table.SampleIds[idx]
                           for idx, count in enumerate(counts)
                           if count > 0)

    for obs_id, sample_ids in samples_by_observation.items():
        print('%s\t%s' % (obs_id, '\t'.join(sample_ids)))
def main():
    """Print per-sample count statistics for an OTU table.

    Prints the number of samples and OTUs, summary statistics of the
    per-sample counts, and one line per sample in ascending count order.
    If opts.mapping_fp is given, the mapping file is rewritten to
    opts.output_mapping_fp with an added "NumIndividuals" column holding
    each sample's count ("na" for samples absent from the table).

    Raises RuntimeError if a mapping file is supplied without
    opts.output_mapping_fp.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = parse_biom_table(qiime_open(opts.otu_table_fp))
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample = \
        compute_seqs_per_library_stats(otu_table, opts.num_otus)
    num_otus = len(otu_table.ObservationIds)

    counts_per_sample_values = counts_per_sample.values()
    med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0]
    even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values)
    num_samples = len(counts_per_sample)

    print("Num samples: %s" % str(num_samples))
    print("Num otus: %s" % str(num_otus))
    if not opts.num_otus:
        num_observations = sum(counts_per_sample_values)
        print("Num observations (sequences): %s" % str(num_observations))
        # TODO: report table density from a tested function; it must be the
        # count of non-zero cells (not the number of observations) divided
        # by num_samples * num_otus.
    print("")

    if opts.num_otus:
        print("OTUs/sample summary:")
    else:
        print("Seqs/sample summary:")
    print(" Min: %s" % str(min_counts))
    print(" Max: %s" % str(max_counts))
    print(" Median: %s" % str(median_counts))
    print(" Mean: %s" % str(mean_counts))
    print(" Std. dev.: %s" % (str(std(counts_per_sample_values))))
    print(" Median Absolute Deviation: %s" % str(med_abs_dev))
    print(" Default even sampling depth in\n core_qiime_analyses.py (just a suggestion): %s"
          % str(even_sampling_depth))
    print("")

    if opts.num_otus:
        print("OTUs/sample detail:")
    else:
        print("Seqs/sample detail:")
    # (count, sample id) pairs sort by count first, then by sample id
    for count, sample_id in sorted(
            (v, k) for k, v in counts_per_sample.items()):
        print(" %s: %s" % (sample_id, str(count)))

    if opts.mapping_fp:
        if not opts.output_mapping_fp:
            raise RuntimeError("input mapping file supplied, but no path to"
                               " output file")
        # with-blocks close the mapping files even if parsing or writing fails
        with open(opts.mapping_fp, "U") as f:
            mapping_lines, headers, comments = parse_mapping_file(f)

        if len(headers) == 1:
            endoffset = 0  # if we only have the sample id, this data -> last col
        else:
            endoffset = 1  # usually make this data the penultimate column.
        headers.insert(len(headers) - endoffset, "NumIndividuals")
        for map_line in mapping_lines:
            # map_line[0] is used as the sample id for the count lookup
            try:
                depth = str(counts_per_sample[map_line[0]])
            except KeyError:
                depth = "na"
            map_line.insert(len(map_line) - endoffset, depth)

        new_map_str = format_mapping_file(headers, mapping_lines, comments)
        with open(opts.output_mapping_fp, "w") as f:
            f.write(new_map_str)
def main():
    """Print per-sample count statistics for an OTU table.

    Prints the number of samples and OTUs, the table density, summary
    statistics of the per-sample counts, the sample and observation
    metadata categories, and one line per sample in ascending count order.
    If opts.mapping_fp is given, the mapping file is rewritten to
    opts.output_mapping_fp with an added 'SequenceCount' column holding
    each sample's count ('na' for samples absent from the table).

    Raises RuntimeError if a mapping file is supplied without
    opts.output_mapping_fp.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = parse_biom_table(qiime_open(opts.otu_table_fp))
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample = \
        compute_seqs_per_library_stats(otu_table, opts.num_otus)
    num_otus = len(otu_table.ObservationIds)

    counts_per_sample_values = counts_per_sample.values()
    med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0]
    even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values)

    # A TypeError here is taken to mean that the table carries no metadata.
    try:
        sample_md_keys = otu_table.SampleMetadata[0].keys()
    except TypeError:
        sample_md_keys = ["None provided"]
    try:
        observation_md_keys = otu_table.ObservationMetadata[0].keys()
    except TypeError:
        observation_md_keys = ["None provided"]

    num_samples = len(counts_per_sample)
    print('Num samples: %s' % str(num_samples))
    print('Num otus: %s' % str(num_otus))
    if not opts.num_otus:
        num_observations = sum(counts_per_sample_values)
        print('Num observations (sequences): %s' % str(num_observations))
        print('Table density (fraction of non-zero values): %1.4f' %
              otu_table.getTableDensity())
    print('')

    if opts.num_otus:
        print('OTUs/sample summary:')
    else:
        print('Seqs/sample summary:')
    print(' Min: %s' % str(min_counts))
    print(' Max: %s' % str(max_counts))
    print(' Median: %s' % str(median_counts))
    print(' Mean: %s' % str(mean_counts))
    print(' Std. dev.: %s' % (str(std(counts_per_sample_values))))
    print(' Median Absolute Deviation: %s' % str(med_abs_dev))
    print(' Default even sampling depth in\n core_qiime_analyses.py (just a suggestion): %s'
          % str(even_sampling_depth))
    print(' Sample Metadata Categories: %s' % '; '.join(sample_md_keys))
    print(' Observation Metadata Categories: %s'
          % '; '.join(observation_md_keys))
    print('')

    if opts.num_otus:
        print('OTUs/sample detail:')
    else:
        print('Seqs/sample detail:')
    # (count, sample id) pairs sort by count first, then by sample id
    for count, sample_id in sorted(
            (v, k) for k, v in counts_per_sample.items()):
        print(' %s: %s' % (sample_id, str(count)))

    if opts.mapping_fp:
        if not opts.output_mapping_fp:
            raise RuntimeError('input mapping file supplied, but no path to'
                               ' output file')
        # with-blocks close the mapping files even if parsing or writing fails
        with open(opts.mapping_fp, 'U') as f:
            mapping_lines, headers, comments = parse_mapping_file(f)

        if len(headers) == 1:
            endoffset = 0  # if we only have the sample id, this data -> last col
        else:
            endoffset = 1  # usually make this data the penultimate column.
        headers.insert(len(headers) - endoffset, 'SequenceCount')
        for map_line in mapping_lines:
            # map_line[0] is used as the sample id for the count lookup
            try:
                depth = str(counts_per_sample[map_line[0]])
            except KeyError:
                depth = 'na'
            map_line.insert(len(map_line) - endoffset, depth)

        new_map_str = format_mapping_file(headers, mapping_lines, comments)
        with open(opts.output_mapping_fp, 'w') as f:
            f.write(new_map_str)
def main():
    """Run the hard-wired disturbance analyses and print UniFrac scores.

    Optionally draws the PCoA plots (pcoa_analyses) and the adjacent
    UniFrac plots (adjacent_unifrac_analyses), then scores ranked adjacent
    UniFrac distances for each body-site time series with both distance
    matrices and prints one summary line per combination.

    NOTE(review): `wpc_fp`, `upc_fp`, `wdm_fp` and `udm_fp` are not defined
    in this function and are presumably module-level names -- confirm.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    m = parse_mapping_file_to_dict(open(opts.mapping_fp, "U"))[0]

    # switches for the two plotting stages
    adjacent_unifrac_analyses = False
    pcoa_analyses = True

    timeseries_fields = ["GutTimeseries", "TongueTimeseries",
                         "ForeheadTimeseries", "PalmTimeseries"]
    disturbance_fields = ["SampleAntibioticDisturbance",
                          "SampleMenstruationDisturbance",
                          "SampleSicknessDisturbance"]

    if pcoa_analyses:
        wpc_h, wpc, _, _ = parse_coords(qiime_open(wpc_fp))
        upc_h, upc, _, _ = parse_coords(qiime_open(upc_fp))
        for pc_header, pc_coords, pdf_name in [
                (wpc_h, wpc, "pc-weighted.pdf"),
                (upc_h, upc, "pc-unweighted.pdf")]:
            # each call receives its own copy of the field lists
            ugly_pc_function(
                m,
                pc_header,
                pc_coords,
                list(timeseries_fields),
                list(disturbance_fields),
                "CUB000",
                pdf_name,
            )

    wh, wdm = parse_distmat(qiime_open(wdm_fp, "U"))
    uh, udm = parse_distmat(qiime_open(udm_fp, "U"))

    if adjacent_unifrac_analyses:
        for dm_header, dm, pdf_name in [
                (uh, udm, "unweighted-unifrac.pdf"),
                (wh, wdm, "weighted-unifrac.pdf")]:
            plot_adjacent_unifracs(
                dm_header,
                dm,
                m,
                list(timeseries_fields),
                "Yes",
                list(disturbance_fields),
                "Yes",
                output_fp=pdf_name,
            )

    # (inclusion field, distance matrix, its header, report line) in the
    # order the results are printed.
    score_reports = [
        ("GutTimeseries", udm, uh,
         "Gut Unweighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) "),
        ("GutTimeseries", wdm, wh,
         "Gut Weighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) "),
        ("TongueTimeseries", udm, uh,
         "Tongue Unweighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) "),
        ("TongueTimeseries", wdm, wh,
         "Tongue Weighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) "),
        ("PalmTimeseries", udm, uh,
         "Palm Unweighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) "),
        ("PalmTimeseries", wdm, wh,
         "Palm Weighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) "),
        ("ForeheadTimeseries", udm, uh,
         "Forehead Unweighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) "),
        ("ForeheadTimeseries", wdm, wh,
         "Forehead Weighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) "),
    ]
    for inclusion_field, dm, dm_header, report in score_reports:
        r = score_ranked_adjacent_unifracs(
            m,
            dm,
            dm_header,
            inclusion_field=inclusion_field,
            inclusion_value="Yes",
            personal_id_field="PersonalID",
            disturbed_field="SampleAntibioticDisturbance",
            disturbed_value="Yes",
        )
        print(report % (r[1], len(r[2]), len(r[3])))
def main():
    """Print per-sample count statistics for an OTU table.

    Prints the number of samples and OTUs, summary statistics of the
    per-sample counts, and one line per sample in ascending count order.
    If opts.mapping_fp is given, the mapping file is rewritten to
    opts.output_mapping_fp with an added 'NumIndividuals' column holding
    each sample's count ('na' for samples absent from the table).

    Raises RuntimeError if a mapping file is supplied without
    opts.output_mapping_fp.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = parse_biom_table(qiime_open(opts.otu_table_fp))
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample = \
        compute_seqs_per_library_stats(otu_table, opts.num_otus)
    num_otus = len(otu_table.ObservationIds)

    counts_per_sample_values = counts_per_sample.values()
    med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0]
    even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values)
    num_samples = len(counts_per_sample)

    print('Num samples: %s' % str(num_samples))
    print('Num otus: %s' % str(num_otus))
    if not opts.num_otus:
        num_observations = sum(counts_per_sample_values)
        print('Num observations (sequences): %s' % str(num_observations))
        # TODO: report table density from a tested function; it must be the
        # count of non-zero cells (not the number of observations) divided
        # by num_samples * num_otus.
    print('')

    if opts.num_otus:
        print('OTUs/sample summary:')
    else:
        print('Seqs/sample summary:')
    print(' Min: %s' % str(min_counts))
    print(' Max: %s' % str(max_counts))
    print(' Median: %s' % str(median_counts))
    print(' Mean: %s' % str(mean_counts))
    print(' Std. dev.: %s' % (str(std(counts_per_sample_values))))
    print(' Median Absolute Deviation: %s' % str(med_abs_dev))
    print(' Default even sampling depth in\n core_qiime_analyses.py (just a suggestion): %s'
          % str(even_sampling_depth))
    print('')

    if opts.num_otus:
        print('OTUs/sample detail:')
    else:
        print('Seqs/sample detail:')
    # (count, sample id) pairs sort by count first, then by sample id
    for count, sample_id in sorted(
            (v, k) for k, v in counts_per_sample.items()):
        print(' %s: %s' % (sample_id, str(count)))

    if opts.mapping_fp:
        if not opts.output_mapping_fp:
            raise RuntimeError('input mapping file supplied, but no path to'
                               ' output file')
        # with-blocks close the mapping files even if parsing or writing fails
        with open(opts.mapping_fp, 'U') as f:
            mapping_lines, headers, comments = parse_mapping_file(f)

        if len(headers) == 1:
            endoffset = 0  # if we only have the sample id, this data -> last col
        else:
            endoffset = 1  # usually make this data the penultimate column.
        headers.insert(len(headers) - endoffset, 'NumIndividuals')
        for map_line in mapping_lines:
            # map_line[0] is used as the sample id for the count lookup
            try:
                depth = str(counts_per_sample[map_line[0]])
            except KeyError:
                depth = 'na'
            map_line.insert(len(map_line) - endoffset, depth)

        new_map_str = format_mapping_file(headers, mapping_lines, comments)
        with open(opts.output_mapping_fp, 'w') as f:
            f.write(new_map_str)