Example no. 1
0
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.attempt_read_reorientation:
        if not opts.mapping_fp:
            option_parser.error("To use --attempt_read_reorientation, one must "
                                "supply a mapping file that contains both LinkerPrimerSequence "
                                "and ReversePrimer columns.")
    if opts.input_type == "barcode_paired_end":
        if not opts.fastq2:
            option_parser.error("To use input_type of barcode_paired_end, "
                                "a second fastq file must be specified with --fastq2")

    if not opts.fastq2:
        disable_header_match = True
    else:
        disable_header_match = opts.disable_header_match

    fastq1 = qiime_open(opts.fastq1)
    if opts.fastq2:
        fastq2 = qiime_open(opts.fastq2)
    else:
        fastq2 = None
    create_dir(opts.output_dir)
    if opts.mapping_fp:
        map_fp = qiime_open(opts.mapping_fp)
    else:
        map_fp = None

    extract_barcodes(fastq1, fastq2, opts.output_dir, opts.input_type,
                     opts.bc1_len, opts.bc2_len, opts.rev_comp_bc1, opts.rev_comp_bc2,
                     opts.char_delineator, opts.switch_bc_order, map_fp,
                     opts.attempt_read_reorientation, disable_header_match)
Example no. 2
0
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.attempt_read_reorientation:
        if not opts.mapping_fp:
            option_parser.error("To use --attempt_read_reorientation, one must "
                                "supply a mapping file that contains both LinkerPrimerSequence "
                                "and ReversePrimer columns.")
    if opts.input_type == "barcode_paired_end":
        if not opts.fastq2:
            option_parser.error("To use input_type of barcode_paired_end, "
                                "a second fastq file must be specified with --fastq2")

    if not opts.fastq2:
        disable_header_match = True
    else:
        disable_header_match = opts.disable_header_match

    fastq1 = qiime_open(opts.fastq1)
    if opts.fastq2:
        fastq2 = qiime_open(opts.fastq2)
    else:
        fastq2 = None
    create_dir(opts.output_dir)
    if opts.mapping_fp:
        map_fp = qiime_open(opts.mapping_fp)
    else:
        map_fp = None

    extract_barcodes(fastq1, fastq2, opts.output_dir, opts.input_type,
                     opts.bc1_len, opts.bc2_len, opts.rev_comp_bc1, opts.rev_comp_bc2,
                     opts.char_delineator, opts.switch_bc_order, map_fp,
                     opts.attempt_read_reorientation, disable_header_match)
Example no. 3
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    md, mh, _ = parse_mapping_file(open(opts.mapping_fp, 'U'))
    
    body_sites = ['Gut','Tongue','Palm','Forehead']
    intraindividual_distances = []
    
    print "Unweighted UniFrac"
    for b in body_sites:
        dm_fp = "/Users/caporaso/analysis/student-microbiome-project/beta-diversity/unweighted_unifrac_dm.%s_ts_only.txt.gz" % b.lower()
        h, d = parse_distmat(qiime_open(dm_fp))
        intraindividual_distances.append(get_grouped_distances(h, d, mh, md, 'PersonalID'))
    
    for i in range(len(body_sites)):
        for j in range(i):
            r = correlated_variability(intraindividual_distances[i],intraindividual_distances[j])
            print "%s/%s (n=%d): rho:%1.3f, p=%f" % (body_sites[i],body_sites[j],r[0],r[1][0],r[1][3])
    
    intraindividual_distances = []
    print "**"
    print "Weighted UniFrac"
    for b in body_sites:
        dm_fp = "/Users/caporaso/analysis/student-microbiome-project/beta-diversity/weighted_unifrac_dm.%s_ts_only.txt.gz" % b.lower()
        h, d = parse_distmat(qiime_open(dm_fp))
        intraindividual_distances.append(get_grouped_distances(h, d, mh, md, 'PersonalID'))
    
    for i in range(len(body_sites)):
        for j in range(i):
            r = correlated_variability(intraindividual_distances[i],intraindividual_distances[j])
            print "%s/%s (n=%d): rho:%1.3f, p=%f" % (body_sites[i],body_sites[j],r[0],r[1][0],r[1][3])
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.
       -joined_fp : file path to paired-end assembled fastq file
       -index_fp : file path to index / barcode reads fastq file

       This function iterates through the joined reads file and index file.
       Only those index-reads within the file at index_fp, that have headers
       matching those within the joined-pairs at joined_fp, are written
       to file.

     WARNING: Assumes reads are in the same order in both files,
              except for cases in which the corresponding
              read in the joined_fp file is missing (i.e. pairs
              failed to assemble).

    """

    # open files (handles normal / gzipped data)
    jh = qiime_open(joined_fp)
    ih = qiime_open(index_fp)

    # base new index file name on joined paired-end file name:
    j_path, ext = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'
    fbc_fh = open(filtered_bc_outfile_path, 'w')

    # Set up iterators
    index_fastq_iter = parse_fastq(ih, strict=False)
    joined_fastq_iter = parse_fastq(jh, strict=False)
    # Write barcodes / index reads that we observed within
    # the joined paired-ends. Raise an error if the index and
    # joined data fall out of order.
    for joined_label, joined_seq, joined_qual in joined_fastq_iter:
        index_label, index_seq, index_qual = index_fastq_iter.next()
        while joined_label != index_label:
            try:
                index_label, index_seq, index_qual = index_fastq_iter.next()
            except StopIteration:
                raise StopIteration(
                    "\n\nReached end of index-reads file" +
                    " before iterating through joined paired-end-reads file!" +
                    " Except for missing paired-end reads that did not survive"
                    +
                    " assembly, your index and paired-end reads files must be in"
                    + " the same order! Also, check that the index-reads and" +
                    " paired-end reads have identical headers. The last joined"
                    + " paired-end ID processed was:\n\'%s\'\n" %
                    (joined_label))
        else:
            fastq_string = '@%s\n%s\n+\n%s\n'\
                % (index_label, index_seq, index_qual)
            fbc_fh.write(fastq_string)

    ih.close()
    jh.close()
    fbc_fh.close()

    return filtered_bc_outfile_path
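
The header-sync pattern above is easy to get wrong at the boundaries, so here is a minimal, dependency-free sketch of the same idea; the toy records below are hypothetical stand-ins for parse_fastq output.

def sync_records(joined_labels, index_records):
    # Walk the joined labels; for each one, advance the index iterator
    # until its label matches, as write_synced_barcodes_fastq does above.
    index_iter = iter(index_records)
    kept = []
    for joined_label in joined_labels:
        label, seq, qual = next(index_iter)
        while joined_label != label:
            try:
                label, seq, qual = next(index_iter)
            except StopIteration:
                raise ValueError("index records ended before %r was found"
                                 % joined_label)
        kept.append((label, seq, qual))
    return kept

index = [('r1', 'ACGT', 'IIII'), ('r2', 'TTGG', 'IIII'), ('r3', 'GGCC', 'IIII')]
joined = ['r1', 'r3']  # r2 failed to assemble, so it is absent
assert sync_records(joined, index) == [index[0], index[2]]
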
Example no. 5
0
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.
       -joined_fp : file path to paired-end assembled fastq file
       -index_fp : file path to index / barcode reads fastq file

       This function iterates through the joined reads file and index file. 
       Only those index-reads within the file at index_fp, that have headers
       matching those within the joined-pairs at joined_fp, are written 
       to file. 

     WARNING: Assumes reads are in the same order in both files,
              except for cases in which the corresponding
              read in the joined_fp file is missing (i.e. pairs 
              failed to assemble).

    """

    # open files (handles normal / gzipped data)
    jh = qiime_open(joined_fp)
    ih = qiime_open(index_fp)

    # base new index file name on joined paired-end file name:
    j_path,ext = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'
    fbc_fh = open(filtered_bc_outfile_path, 'w')


    # Set up iterators
    index_fastq_iter = MinimalFastqParser(ih, strict=False)
    joined_fastq_iter = MinimalFastqParser(jh, strict=False) 
    # Write barcodes / index reads that we observed within
    # the joined paired-ends. Raise an error if the index and
    # joined data fall out of order.
    for joined_label,joined_seq,joined_qual in joined_fastq_iter:
        index_label,index_seq,index_qual = index_fastq_iter.next()
        while joined_label != index_label:
            try:
                index_label,index_seq,index_qual = index_fastq_iter.next()
            except StopIteration:
                raise StopIteration, "\n\nReached end of index-reads file"+\
                 " before iterating through joined paired-end-reads file!"+\
                 " Except for missing paired-end reads that did not survive"+\
                 " assembly, your index and paired-end reads files must be in"+\
                 " the same order! Also, check that the index-reads and"+\
                 " paired-end reads have identical headers. The last joined"+\
                 " paired-end ID processed was:\n\'%s\'\n" %(joined_label)
        else:
            fastq_string = '@%s\n%s\n+\n%s\n'\
                            %(index_label,index_seq,index_qual)
            fbc_fh.write(fastq_string)
    
    ih.close()
    jh.close()
    fbc_fh.close()

    return filtered_bc_outfile_path
Example no. 6
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sample_novelty_data = compute_sample_novelty(
            [qiime_open(otu_table_fp) for otu_table_fp in opts.otu_table_fps],
            qiime_open(opts.rep_set_fp), opts.verbose)

    with open(opts.output_fp, 'w') as out_f:
        header = ['SampleID', 'Number of novel OTUs',
                  'Percent novel sequences']
        table_writer = writer(out_f, delimiter='\t', lineterminator='\n')
        table_writer.writerow(header)
        table_writer.writerows(sample_novelty_data)
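
Because the output is a plain tab-separated file, the writer setup above can be exercised in isolation; the rows below are hypothetical stand-ins for compute_sample_novelty's return value.

from csv import writer

rows = [('PC.354', 12, 3.1), ('PC.355', 4, 0.8)]  # hypothetical values
with open('novelty_summary.tsv', 'w') as out_f:
    table_writer = writer(out_f, delimiter='\t', lineterminator='\n')
    table_writer.writerow(['SampleID', 'Number of novel OTUs',
                           'Percent novel sequences'])
    table_writer.writerows(rows)
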
Example no. 7
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sample_novelty_data = compute_sample_novelty(
        [qiime_open(otu_table_fp) for otu_table_fp in opts.otu_table_fps],
        qiime_open(opts.rep_set_fp), opts.verbose)

    with open(opts.output_fp, 'w') as out_f:
        header = [
            'SampleID', 'Number of novel OTUs', 'Percent novel sequences'
        ]
        table_writer = writer(out_f, delimiter='\t', lineterminator='\n')
        table_writer.writerow(header)
        table_writer.writerows(sample_novelty_data)
Example no. 8
0
def extract_reads_from_interleaved(input_fp, forward_id, reverse_id,
                                   output_dir):
    """Parses a single fastq file and creates two new files: forward and reverse, based on
    the two values (comma separated) in read_direction_identifiers

    input_fp: file path to input
    read_direction_identifiers: comma separated values to identify forward and reverse reads
    output_folder: file path to the output folder
    """
    forward_fp = join(output_dir, "forward_reads.fastq")
    reverse_fp = join(output_dir, "reverse_reads.fastq")
    ffp = open(forward_fp, 'w')
    rfp = open(reverse_fp, 'w')

    for label, seq, qual in parse_fastq(qiime_open(input_fp),
                                        strict=False,
                                        enforce_qual_range=False):
        fastq_string = format_fastq_record(label, seq, qual)
        if forward_id in label and reverse_id not in label:
            ffp.write(fastq_string)
        elif reverse_id in label and forward_id not in label:
            rfp.write(fastq_string)
        else:
            ffp.close()
            rfp.close()
            raise ValueError(
                "An input sequence label matches neither or both of the "
                "direction identifiers.\nLabel: %s\nForward: %s\nReverse: %s" %
                (label, forward_id, reverse_id))
    ffp.close()
    rfp.close()
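
The classification logic reduces to a three-way check on the label; a dependency-free sketch, using hypothetical Illumina-style identifiers:

def classify_read(label, forward_id, reverse_id):
    # Exactly one of the two identifiers must occur in the label.
    has_f = forward_id in label
    has_r = reverse_id in label
    if has_f and not has_r:
        return 'forward'
    if has_r and not has_f:
        return 'reverse'
    raise ValueError("label %r matches neither or both identifiers" % label)

assert classify_read('M00176:17 1:N:0:0', ' 1:N:0', ' 2:N:0') == 'forward'
assert classify_read('M00176:17 2:N:0:0', ' 1:N:0', ' 2:N:0') == 'reverse'
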
Example no. 9
0
def extract_reads_from_interleaved(
        input_fp, forward_id, reverse_id, output_dir):
    """Parses a single fastq file and creates two new files: forward and reverse, based on
    the two values (comma separated) in read_direction_identifiers

    input_fp: file path to input
    read_direction_identifiers: comma separated values to identify forward and reverse reads
    output_folder: file path to the output folder
    """
    forward_fp = join(output_dir, "forward_reads.fastq")
    reverse_fp = join(output_dir, "reverse_reads.fastq")
    ffp = open(forward_fp, 'w')
    rfp = open(reverse_fp, 'w')

    for label, seq, qual in parse_fastq(qiime_open(input_fp), strict=False):
        fastq_string = format_fastq_record(label, seq, qual)
        if forward_id in label and reverse_id not in label:
            ffp.write(fastq_string)
        elif reverse_id in label and forward_id not in label:
            rfp.write(fastq_string)
        else:
            ffp.close()
            rfp.close()
            raise ValueError("An input sequence label matches neither or both "
                             "of the direction identifiers.\nLabel: %s\n"
                             "Forward: %s\nReverse: %s" %
                             (label, forward_id, reverse_id))
    ffp.close()
    rfp.close()
Example no. 10
0
def get_biom_tables(otu_table_dir):
    """Pass in a directory containing biom tables, either .biom or .biom.gz
    Returns a list of biom table objects"""

    otu_tables_fp = glob("%s/*biom*" % otu_table_dir) # look for both .biom and .biom.gz
    biom_table_objects = []
    for otu_table in otu_tables_fp:
        biom_table = parse_biom_table(qiime_open(otu_table)) # qiime_open will open .biom and .biom.gz files
        biom_table_objects.append(biom_table)
    return biom_table_objects
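
The comments above rely on qiime_open transparently handling gzipped input. A sketch of how such an opener can be written follows; this illustrates the idea and is not necessarily QIIME's actual implementation. It keys on the gzip magic number rather than the file extension.

import gzip

def open_maybe_gzipped(fp, mode='r'):
    # Gzip files start with the two magic bytes 0x1f 0x8b.
    with open(fp, 'rb') as f:
        magic = f.read(2)
    if magic == b'\x1f\x8b':
        return gzip.open(fp, mode)
    return open(fp, mode)
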
Example no. 11
0
def make_flow_txt(sff_fp, output_fp, use_sfftools=False):
    """Makes flowgram file from sff file."""
    if use_sfftools:
        _fail_on_gzipped_sff(sff_fp)
        check_sffinfo()
        _check_call(['sffinfo', sff_fp], stdout=open(output_fp, 'w'))
    else:
        try:
            format_binary_sff(qiime_open(sff_fp, 'rb'), open(output_fp, 'w'))
        except:
            raise IOError("Could not parse SFF %s" % sff_fp)
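
_check_call presumably wraps the standard library's subprocess.check_call; the sffinfo invocation above follows the usual stdout-redirection pattern, sketched here with a generic Unix command since the sffinfo binary may not be installed.

import subprocess

# Redirect the child's stdout into a file. Using a context manager also
# guarantees the handle is closed (the inline open() above leaks it).
with open('listing.txt', 'w') as out:
    subprocess.check_call(['ls', '-l'], stdout=out)
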
Example no. 12
0
def convert_Ti_to_FLX(sff_fp, output_fp, use_sfftools=False):
    """Converts Titanium SFF to FLX length reads."""
    if use_sfftools:
        _fail_on_gzipped_sff(sff_fp)
        check_sfffile()
        _check_call(['sfffile', '-flx', '-o', output_fp, sff_fp],
                    stdout=open(os.devnull, 'w'))
    else:
        header, reads = adjust_sff_cycles(
            parse_binary_sff(qiime_open(sff_fp, 'rb'), True), 100)
        write_binary_sff(open(output_fp, 'w'), header, reads)
Example no. 13
0
def make_flow_txt(sff_fp, output_fp, use_sfftools=False):
    """Makes flowgram file from sff file."""
    if use_sfftools:
        _fail_on_gzipped_sff(sff_fp)
        check_sffinfo()
        _check_call(['sffinfo', sff_fp], stdout=open(output_fp, 'w'))
    else:
        try:
            format_binary_sff(qiime_open(sff_fp, 'rb'), open(output_fp, 'w'))
        except:
            raise IOError("Could not parse SFF %s" % sff_fp)
Example no. 14
0
def convert_Ti_to_FLX(sff_fp, output_fp, use_sfftools=False):
    """Converts Titanium SFF to FLX length reads."""
    if use_sfftools:
        _fail_on_gzipped_sff(sff_fp)
        check_sfffile()
        _check_call(
            ['sfffile', '-flx', '-o', output_fp, sff_fp],
            stdout=open(os.devnull, 'w'))
    else:
        header, reads = adjust_sff_cycles(parse_binary_sff(qiime_open(sff_fp, 'rb'),
                                          True), 100)
        write_binary_sff(open(output_fp, 'w'), header, reads)
Example no. 15
0
def get_biom_tables(otu_table_dir):
    """Pass in a directory containing biom tables, either .biom or .biom.gz
    Returns a list of biom table objects"""

    otu_tables_fp = glob("%s/*biom*" %
                         otu_table_dir)  # look for both .biom and .biom.gz
    biom_table_objects = []
    for otu_table in otu_tables_fp:
        biom_table = parse_biom_table(qiime_open(
            otu_table))  # qiime_open will open .biom and .biom.gz files
        biom_table_objects.append(biom_table)
    return biom_table_objects
Example no. 16
0
    def test_adjust_sff_cycles(self):
        sff_data = parse_binary_sff(open(self.sff_fp))
        sff_gz_data = parse_binary_sff(qiime_open(self.sff_gz_fp))
        header, reads = adjust_sff_cycles(sff_data, 2)
        header_gz, reads_gz = adjust_sff_cycles(sff_gz_data, 2)
        expected_header = {
            'header_length': 48,
            'version': 1,
            'index_length': 0,
            'magic_number': 779314790,
            'number_of_flows_per_read': 8,
            'flowgram_format_code': 1,
            'flow_chars': 'TACGTACG',
            'index_offset': 0,
            'key_sequence': 'TCAG',
            'number_of_reads': 1,
            'key_length': 4,
        }
        self.assertEqual(header, expected_header)
        self.assertEqual(header_gz, expected_header)

        expected_read = {
            'name_length': 14,
            'Name': 'FA6P1OK01CGMHQ',
            'flowgram_values':
            [1.04, 0.0, 1.01, 0.0, 0.0, 0.95999999999999996, 0.0, 1.02],
            'clip_adapter_left': 0,
            'read_header_length': 32,
            'Bases': 'TCAG',
            'number_of_bases': 4,
            'flow_index_per_base': (1, 2, 3, 2),
            'clip_qual_left': 4,
            'clip_adapter_right': 0,
            'clip_qual_right': 4,
            'quality_scores': (32, 32, 32, 32),
        }
        reads = list(reads)
        reads_gz = list(reads_gz)
        self.assertEqual(len(reads), 1)
        self.assertEqual(len(reads_gz), 1)
        self.assertEqual(reads[0], expected_read)
        self.assertEqual(reads_gz[0], expected_read)
Example no. 17
0
def make_qual(sff_fp, output_fp, use_sfftools=False, no_trim=False):
    """Makes qual file from sff file."""
    if use_sfftools:
        _fail_on_gzipped_sff(sff_fp)
        check_sffinfo()
        if no_trim:
            _check_call(['sffinfo', '-notrim', '-q', sff_fp],
                        stdout=open(output_fp, 'w'))
        else:
            _check_call(['sffinfo', '-q', sff_fp], stdout=open(output_fp, 'w'))
    else:
        try:
            format_binary_sff_as_fna(qiime_open(sff_fp, 'rb'),
                                     open(output_fp, 'w'), qual=True)
        except:
            raise IOError("Could not parse SFF %s" % sff_fp)
Example no. 18
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_fps = glob(opts.input_glob)
    d = {}
    for input_fp in input_fps:
        t = parse_biom_table(qiime_open(input_fp))
        for obs_values, obs_id, _ in t.iterObservations():
            if obs_id not in d:
                d[obs_id] = set()
            for i,c in enumerate(obs_values):
                if c > 0:
                    d[obs_id].add(t.SampleIds[i])
    
    for k,v in d.items():
        print '%s\t%s' % (k,'\t'.join(v))
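
The loop above builds an observation-to-samples presence map; the same idea on a plain matrix, with hypothetical sample IDs and counts in place of the BIOM table:

sample_ids = ['S1', 'S2', 'S3']
observations = {'otu1': [0, 2, 0], 'otu2': [1, 0, 3]}  # hypothetical counts

d = {}
for obs_id, obs_values in observations.items():
    # Record every sample in which this observation has a non-zero count.
    d[obs_id] = set(sample_ids[i] for i, c in enumerate(obs_values) if c > 0)

assert d['otu1'] == set(['S2'])
assert d['otu2'] == set(['S1', 'S3'])
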
Example no. 19
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_fps = glob(opts.input_glob)
    d = {}
    for input_fp in input_fps:
        t = parse_biom_table(qiime_open(input_fp))
        for obs_values, obs_id, _ in t.iterObservations():
            if obs_id not in d:
                d[obs_id] = set()
            for i, c in enumerate(obs_values):
                if c > 0:
                    d[obs_id].add(t.SampleIds[i])

    for k, v in d.items():
        print '%s\t%s' % (k, '\t'.join(v))
Example no. 20
0
    def test_adjust_sff_cycles(self):
        sff_data = parse_binary_sff(open(self.sff_fp))
        sff_gz_data = parse_binary_sff(qiime_open(self.sff_gz_fp))
        header, reads = adjust_sff_cycles(sff_data, 2)
        header_gz, reads_gz = adjust_sff_cycles(sff_gz_data, 2)
        expected_header = {
            'header_length': 48,
            'version': 1,
            'index_length': 0,
            'magic_number': 779314790,
            'number_of_flows_per_read': 8,
            'flowgram_format_code': 1,
            'flow_chars': 'TACGTACG',
            'index_offset': 0,
            'key_sequence': 'TCAG',
            'number_of_reads': 1,
            'key_length': 4,
        }
        self.assertEqual(header, expected_header)
        self.assertEqual(header_gz, expected_header)

        expected_read = {
            'name_length': 14,
            'Name': 'FA6P1OK01CGMHQ',
            'flowgram_values':
            [1.04, 0.0, 1.01, 0.0, 0.0, 0.95999999999999996, 0.0, 1.02],
            'clip_adapter_left': 0,
            'read_header_length': 32,
            'Bases': 'TCAG',
            'number_of_bases': 4,
            'flow_index_per_base': (1, 2, 3, 2),
            'clip_qual_left': 4,
            'clip_adapter_right': 0,
            'clip_qual_right': 4,
            'quality_scores': (32, 32, 32, 32),
        }
        reads = list(reads)
        reads_gz = list(reads_gz)
        self.assertEqual(len(reads), 1)
        self.assertEqual(len(reads_gz), 1)
        self.assertEqual(reads[0], expected_read)
        self.assertEqual(reads_gz[0], expected_read)
Example no. 21
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    otu_table = parse_biom_table(qiime_open(otu_table_fp))
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample = compute_seqs_per_library_stats(
        otu_table, opts.num_otus
    )
    num_otus = len(otu_table.ObservationIds)

    counts_per_sample_values = counts_per_sample.values()
    med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0]
    even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values)

    num_samples = len(counts_per_sample)
    print "Num samples: %s" % str(num_samples)
    print "Num otus: %s" % str(num_otus)
    if not opts.num_otus:
        num_observations = sum(counts_per_sample_values)
        print "Num observations (sequences): %s" % str(num_observations)
        # TODO: port density functionality to a tested function. The line
        # below is broken (the numerator should be the count of non-zero
        # cells, not the number of observations):
        # print 'Table density (fraction of non-zero values): %1.4f' % (num_observations/(num_samples * num_otus))
    print

    if opts.num_otus:
        print "OTUs/sample summary:"
    else:
        print "Seqs/sample summary:"
    print " Min: %s" % str(min_counts)
    print " Max: %s" % str(max_counts)
    print " Median: %s" % str(median_counts)
    print " Mean: %s" % str(mean_counts)
    print " Std. dev.: %s" % (str(std(counts_per_sample_values)))
    print " Median Absolute Deviation: %s" % str(med_abs_dev)
    print " Default even sampling depth in\n  core_qiime_analyses.py (just a suggestion): %s" % str(even_sampling_depth)
    print ""
    if opts.num_otus:
        print "OTUs/sample detail:"
    else:
        print "Seqs/sample detail:"
    sorted_counts_per_sample = [(v, k) for k, v in counts_per_sample.items()]
    sorted_counts_per_sample.sort()
    total_count = 0
    for v, k in sorted_counts_per_sample:
        total_count += v
        print " %s: %s" % (k, str(v))

    if opts.mapping_fp:
        if not opts.output_mapping_fp:
            raise RuntimeError("input mapping file supplied, but no path to" + " output file")
        f = open(opts.mapping_fp, "U")
        mapping_lines, headers, comments = parse_mapping_file(f)
        f.close()
        if len(headers) == 1:
            endoffset = 0  # if we only have the sample id, this data -> last col
        else:
            endoffset = 1  # usually make this data the penultimate column.
        headers.insert(len(headers) - endoffset, "NumIndividuals")
        for map_line in mapping_lines:
            sample_id = map_line
            try:
                depth = str(counts_per_sample[map_line[0]])
            except KeyError:
                depth = "na"
            map_line.insert(len(map_line) - endoffset, depth)

        new_map_str = format_mapping_file(headers, mapping_lines, comments)
        f = open(opts.output_mapping_fp, "w")
        f.write(new_map_str)
        f.close()
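
The endoffset logic above inserts the new column second-to-last whenever the mapping file has more than the SampleID column, since QIIME mapping files conventionally end with a Description column. A small self-contained sketch, with hypothetical header and row values:

headers = ['SampleID', 'Treatment', 'Description']
row = ['PC.354', 'Control', 'first sample']  # hypothetical mapping row

endoffset = 0 if len(headers) == 1 else 1
headers.insert(len(headers) - endoffset, 'NumIndividuals')
row.insert(len(row) - endoffset, '1547')

assert headers == ['SampleID', 'Treatment', 'NumIndividuals', 'Description']
assert row == ['PC.354', 'Control', '1547', 'first sample']
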
Example no. 22
0
def main():
    option_parser, opts,args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    otu_table = parse_biom_table(qiime_open(otu_table_fp))
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
     compute_seqs_per_library_stats(otu_table, opts.num_otus)
    num_otus = len(otu_table.ObservationIds)
    
    counts_per_sample_values = counts_per_sample.values()
    med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0]
    even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values)
    
    try:
        sample_md_keys = otu_table.SampleMetadata[0].keys()
    except TypeError:
        sample_md_keys = ["None provided"]
    try:
        observation_md_keys = otu_table.ObservationMetadata[0].keys()
    except TypeError:
        observation_md_keys = ["None provided"]
    
    num_samples = len(counts_per_sample)
    print 'Num samples: %s' % str(num_samples)
    print 'Num otus: %s' % str(num_otus)
    if not opts.num_otus:
        num_observations = sum(counts_per_sample_values)
        print 'Num observations (sequences): %s' % str(num_observations)
        print 'Table density (fraction of non-zero values): %1.4f' % \
              otu_table.getTableDensity()
    print

    if opts.num_otus:
        print 'OTUs/sample summary:'
    else:
        print 'Seqs/sample summary:' 
    print ' Min: %s' % str(min_counts)
    print ' Max: %s' % str(max_counts)
    print ' Median: %s' % str(median_counts)
    print ' Mean: %s' % str(mean_counts)
    print ' Std. dev.: %s' % (str(std(counts_per_sample_values)))
    print ' Median Absolute Deviation: %s' % str(med_abs_dev)
    print ' Default even sampling depth in\n  core_qiime_analyses.py (just a suggestion): %s' %\
     str(even_sampling_depth)
    print ' Sample Metadata Categories: %s' % '; '.join(sample_md_keys)
    print ' Observation Metadata Categories: %s' % '; '.join(observation_md_keys)
     
    print ''
    if opts.num_otus:
        print 'OTUs/sample detail:'
    else:
        print 'Seqs/sample detail:'
    sorted_counts_per_sample = [(v,k) for k,v in counts_per_sample.items()]
    sorted_counts_per_sample.sort()
    total_count = 0
    for v,k in sorted_counts_per_sample:
        total_count += v
        print ' %s: %s' % (k,str(v))

    if opts.mapping_fp:
        if not opts.output_mapping_fp:
            raise RuntimeError('input mapping file supplied, but no path to'+\
             ' output file')
        f = open(opts.mapping_fp,'U')
        mapping_lines, headers, comments = parse_mapping_file(f)
        f.close()
        if len(headers)==1:
            endoffset = 0 # if we only have the sample id, this data -> last col
        else:
            endoffset = 1 # usually make this data the penultimate column.
        headers.insert(len(headers)-endoffset,'SequenceCount')
        for map_line in mapping_lines:
            sample_id = map_line
            try:
                depth = str(counts_per_sample[map_line[0]])
            except KeyError:
                depth = 'na'
            map_line.insert(len(map_line)-endoffset,depth)

        new_map_str = format_mapping_file(headers, mapping_lines, comments)
        f = open(opts.output_mapping_fp, 'w')
        f.write(new_map_str)
        f.close()
Example no. 23
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    m = parse_mapping_file_to_dict(open(opts.mapping_fp, "U"))[0]

    adjacent_unifrac_analyses = False
    pcoa_analyses = True

    if pcoa_analyses:
        # wpc_fp / upc_fp (and wdm_fp / udm_fp below) are not defined in this
        # function; they are presumably module-level paths. Note that uh/udm
        # and wh/wdm are only parsed when pcoa_analyses is True, but are used
        # in the adjacent_unifrac_analyses block below.
        wpc_h, wpc, _, _ = parse_coords(qiime_open(wpc_fp))
        upc_h, upc, _, _ = parse_coords(qiime_open(upc_fp))
        ugly_pc_function(
            m,
            wpc_h,
            wpc,
            ["GutTimeseries", "TongueTimeseries", "ForeheadTimeseries", "PalmTimeseries"],
            ["SampleAntibioticDisturbance", "SampleMenstruationDisturbance", "SampleSicknessDisturbance"],
            "CUB000",
            "pc-weighted.pdf",
        )
        ugly_pc_function(
            m,
            upc_h,
            upc,
            ["GutTimeseries", "TongueTimeseries", "ForeheadTimeseries", "PalmTimeseries"],
            ["SampleAntibioticDisturbance", "SampleMenstruationDisturbance", "SampleSicknessDisturbance"],
            "CUB000",
            "pc-unweighted.pdf",
        )

        wh, wdm = parse_distmat(qiime_open(wdm_fp, "U"))
        uh, udm = parse_distmat(qiime_open(udm_fp, "U"))

    if adjacent_unifrac_analyses:
        plot_adjacent_unifracs(
            uh,
            udm,
            m,
            ["GutTimeseries", "TongueTimeseries", "ForeheadTimeseries", "PalmTimeseries"],
            "Yes",
            ["SampleAntibioticDisturbance", "SampleMenstruationDisturbance", "SampleSicknessDisturbance"],
            "Yes",
            output_fp="unweighted-unifrac.pdf",
        )
        plot_adjacent_unifracs(
            wh,
            wdm,
            m,
            ["GutTimeseries", "TongueTimeseries", "ForeheadTimeseries", "PalmTimeseries"],
            "Yes",
            ["SampleAntibioticDisturbance", "SampleMenstruationDisturbance", "SampleSicknessDisturbance"],
            "Yes",
            output_fp="weighted-unifrac.pdf",
        )

        r = score_ranked_adjacent_unifracs(
            m,
            udm,
            uh,
            inclusion_field="GutTimeseries",
            inclusion_value="Yes",
            personal_id_field="PersonalID",
            disturbed_field="SampleAntibioticDisturbance",
            disturbed_value="Yes",
        )
        print "Gut Unweighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) " % (
            r[1],
            len(r[2]),
            len(r[3]),
        )
        r = score_ranked_adjacent_unifracs(
            m,
            wdm,
            wh,
            inclusion_field="GutTimeseries",
            inclusion_value="Yes",
            personal_id_field="PersonalID",
            disturbed_field="SampleAntibioticDisturbance",
            disturbed_value="Yes",
        )
        print "Gut Weighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) " % (
            r[1],
            len(r[2]),
            len(r[3]),
        )

        r = score_ranked_adjacent_unifracs(
            m,
            udm,
            uh,
            inclusion_field="TongueTimeseries",
            inclusion_value="Yes",
            personal_id_field="PersonalID",
            disturbed_field="SampleAntibioticDisturbance",
            disturbed_value="Yes",
        )
        print "Tongue Unweighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) " % (
            r[1],
            len(r[2]),
            len(r[3]),
        )
        r = score_ranked_adjacent_unifracs(
            m,
            wdm,
            wh,
            inclusion_field="TongueTimeseries",
            inclusion_value="Yes",
            personal_id_field="PersonalID",
            disturbed_field="SampleAntibioticDisturbance",
            disturbed_value="Yes",
        )
        print "Tongue Weighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) " % (
            r[1],
            len(r[2]),
            len(r[3]),
        )

        r = score_ranked_adjacent_unifracs(
            m,
            udm,
            uh,
            inclusion_field="PalmTimeseries",
            inclusion_value="Yes",
            personal_id_field="PersonalID",
            disturbed_field="SampleAntibioticDisturbance",
            disturbed_value="Yes",
        )
        print "Palm Unweighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) " % (
            r[1],
            len(r[2]),
            len(r[3]),
        )
        r = score_ranked_adjacent_unifracs(
            m,
            wdm,
            wh,
            inclusion_field="PalmTimeseries",
            inclusion_value="Yes",
            personal_id_field="PersonalID",
            disturbed_field="SampleAntibioticDisturbance",
            disturbed_value="Yes",
        )
        print "Palm Weighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) " % (
            r[1],
            len(r[2]),
            len(r[3]),
        )

        r = score_ranked_adjacent_unifracs(
            m,
            udm,
            uh,
            inclusion_field="ForeheadTimeseries",
            inclusion_value="Yes",
            personal_id_field="PersonalID",
            disturbed_field="SampleAntibioticDisturbance",
            disturbed_value="Yes",
        )
        print "Forehead Unweighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) " % (
            r[1],
            len(r[2]),
            len(r[3]),
        )
        r = score_ranked_adjacent_unifracs(
            m,
            wdm,
            wh,
            inclusion_field="ForeheadTimeseries",
            inclusion_value="Yes",
            personal_id_field="PersonalID",
            disturbed_field="SampleAntibioticDisturbance",
            disturbed_value="Yes",
        )
        print "Forehead Weighted UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) " % (
            r[1],
            len(r[2]),
            len(r[3]),
        )
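
The eight per-site, per-metric blocks above differ only in the inclusion field, the distance matrix, and the printed label; a sketch of the same calls collapsed into one loop, reusing the in-scope names m, udm, uh, wdm, and wh from the code above:

for site in ['Gut', 'Tongue', 'Palm', 'Forehead']:
    for metric, dm, h in [('Unweighted', udm, uh), ('Weighted', wdm, wh)]:
        r = score_ranked_adjacent_unifracs(
            m, dm, h,
            inclusion_field='%sTimeseries' % site,
            inclusion_value='Yes',
            personal_id_field='PersonalID',
            disturbed_field='SampleAntibioticDisturbance',
            disturbed_value='Yes')
        print "%s %s UniFrac: %1.3f (n-disturbed samples: %d, n-undisturbed samples: %d) " % (
            site, metric, r[1], len(r[2]), len(r[3]))
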
Example no. 24
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    otu_table = parse_biom_table(qiime_open(otu_table_fp))
    min_counts, max_counts, median_counts, mean_counts, counts_per_sample =\
     compute_seqs_per_library_stats(otu_table, opts.num_otus)
    num_otus = len(otu_table.ObservationIds)

    counts_per_sample_values = counts_per_sample.values()
    med_abs_dev = median_absolute_deviation(counts_per_sample_values)[0]
    even_sampling_depth = guess_even_sampling_depth(counts_per_sample_values)

    num_samples = len(counts_per_sample)
    print 'Num samples: %s' % str(num_samples)
    print 'Num otus: %s' % str(num_otus)
    if not opts.num_otus:
        num_observations = sum(counts_per_sample_values)
        print 'Num observations (sequences): %s' % str(num_observations)
        # TODO: port density functionality to a tested function. The line
        # below is broken (the numerator should be the count of non-zero
        # cells, not the number of observations):
        #print 'Table density (fraction of non-zero values): %1.4f' % (num_observations/(num_samples * num_otus))
    print

    if opts.num_otus:
        print 'OTUs/sample summary:'
    else:
        print 'Seqs/sample summary:'
    print ' Min: %s' % str(min_counts)
    print ' Max: %s' % str(max_counts)
    print ' Median: %s' % str(median_counts)
    print ' Mean: %s' % str(mean_counts)
    print ' Std. dev.: %s' % (str(std(counts_per_sample_values)))
    print ' Median Absolute Deviation: %s' % str(med_abs_dev)
    print ' Default even sampling depth in\n  core_qiime_analyses.py (just a suggestion): %s' %\
     str(even_sampling_depth)
    print ''
    if opts.num_otus:
        print 'OTUs/sample detail:'
    else:
        print 'Seqs/sample detail:'
    sorted_counts_per_sample = [(v, k) for k, v in counts_per_sample.items()]
    sorted_counts_per_sample.sort()
    total_count = 0
    for v, k in sorted_counts_per_sample:
        total_count += v
        print ' %s: %s' % (k, str(v))

    if opts.mapping_fp:
        if not opts.output_mapping_fp:
            raise RuntimeError('input mapping file supplied, but no path to'+\
             ' output file')
        f = open(opts.mapping_fp, 'U')
        mapping_lines, headers, comments = parse_mapping_file(f)
        f.close()
        if len(headers) == 1:
            endoffset = 0  # if we only have the sample id, this data -> last col
        else:
            endoffset = 1  # usually make this data the penultimate column.
        headers.insert(len(headers) - endoffset, 'NumIndividuals')
        for map_line in mapping_lines:
            sample_id = map_line
            try:
                depth = str(counts_per_sample[map_line[0]])
            except KeyError:
                depth = 'na'
            map_line.insert(len(map_line) - endoffset, depth)

        new_map_str = format_mapping_file(headers, mapping_lines, comments)
        f = open(opts.output_mapping_fp, 'w')
        f.write(new_map_str)
        f.close()