Beispiel #1
0
def main():
    """Print pairwise correlated-variability statistics between body sites.

    Runs the same analysis twice, first on the unweighted and then on the
    weighted UniFrac distance matrices: one matrix is loaded per body site,
    passed to get_grouped_distances with the 'PersonalID' column, and
    correlated_variability is reported for every pair of body sites.
    """
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    # NOTE(review): mapping_fp is not assigned anywhere in this function;
    # presumably it is a module-level name -- confirm.
    with open(mapping_fp) as mapping_f:
        md, mh, _ = parse_mapping_file(mapping_f)

    body_sites = ['Gut','Tongue','Palm','Forehead']
    dm_fp_template = ("/Users/caporaso/analysis/student-microbiome-project/"
                      "beta-diversity/%s_unifrac_dm.%s_ts_only.txt.gz")
    # (title printed to stdout, file name prefix of the distance matrices)
    metrics = [("Unweighted UniFrac", "unweighted"),
               ("Weighted UniFrac", "weighted")]

    for metric_num, (title, dm_prefix) in enumerate(metrics):
        if metric_num > 0:
            # Separator between the reports of consecutive metrics.
            print("**")
        print(title)

        intraindividual_distances = []
        for b in body_sites:
            dm_fp = dm_fp_template % (dm_prefix, b.lower())
            h, d = parse_distmat(qiime_open(dm_fp))
            intraindividual_distances.append(
                get_grouped_distances(h, d, mh, md, 'PersonalID'))

        # Every unordered pair of body sites, each reported exactly once.
        for i in range(len(body_sites)):
            for j in range(i):
                r = correlated_variability(intraindividual_distances[i],
                                           intraindividual_distances[j])
                print("%s/%s (n=%d): rho:%1.3f, p=%f" %
                      (body_sites[i],body_sites[j],r[0],r[1][0],r[1][3]))
def main():
    """Run a Mantel test on every pair of input distance matrices.

    Writes one tab-separated line per pair to opts.output_fp: both
    filepaths, the number of shared entries, and either the formatted
    p-value or 'Too few samples' when fewer than two labels are shared.
    """
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    def load_distmat(fp):
        # Parse and close right away so handles do not pile up over the
        # pairwise loop below.
        with open(fp, 'U') as dm_f:
            return parse_distmat(dm_f)

    sample_id_map = None
    if opts.sample_id_map_fp:
        with open(opts.sample_id_map_fp, "U") as map_f:
            # Keep only the first field of each entry as the mapped id.
            sample_id_map = dict(
                (k, v[0]) for k, v in fields_to_dict(map_f).items())

    input_dm_fps = opts.input_dms.split(',')
    num_iterations = opts.num_iterations

    with open(opts.output_fp, 'w') as output_f:
        output_f.write(comment)
        output_f.write('DM1\tDM2\tNumber of entries\tMantel p-value\n')
        for i, fp1 in enumerate(input_dm_fps):
            for fp2 in input_dm_fps[i+1:]:
                (dm1_labels, dm1), (dm2_labels, dm2) =\
                 make_compatible_distance_matrices(load_distmat(fp1),
                                                   load_distmat(fp2),
                                                   lookup=sample_id_map)
                if len(dm1_labels) < 2:
                    output_f.write('%s\t%s\t%d\tToo few samples\n' %
                                   (fp1, fp2, len(dm1_labels)))
                    continue
                p = mantel(dm1, dm2, n=num_iterations)
                p_str = format_p_value_for_num_iters(p, num_iterations)
                output_f.write('%s\t%s\t%d\t%s\n' %
                               (fp1, fp2, len(dm1_labels), p_str))
    def setUp(self):
        """Create the distance matrices, filepaths and parameters shared by the tests."""
        # Raw tab-delimited matrices. dm1-dm3 use the s1..s3 labels, dm4 uses
        # z1..z3 with the same values as dm3.
        s_header = "\ts1\ts2\ts3"
        self.dm1_str = [s_header, "s1\t0\t0.5\t0.2", "s2\t0.5\t0\t0.3",
                        "s3\t0.2\t0.3\t0"]
        self.dm2_str = [s_header, "s1\t0\t0.8\t0.25", "s2\t0.8\t0\t0.4",
                        "s3\t0.25\t0.4\t0"]
        self.dm3_str = [s_header, "s1\t0\t0.1\t0.2", "s2\t0.1\t0\t0.9",
                        "s3\t0.2\t0.9\t0"]
        self.dm4_str = ["\tz1\tz2\tz3", "z1\t0\t0.1\t0.2", "z2\t0.1\t0\t0.9",
                        "z3\t0.2\t0.9\t0"]

        # Parsed counterparts, in the same order as above.
        self.dm1 = parse_distmat(self.dm1_str)
        self.dm2 = parse_distmat(self.dm2_str)
        self.dm3 = parse_distmat(self.dm3_str)
        self.dm4 = parse_distmat(self.dm4_str)
        self.distmats = [self.dm1, self.dm2, self.dm3]

        # Filepaths are only used as strings in the results; nothing is
        # created or modified on disk.
        self.fp1, self.fp2, self.fp3 = "foo.txt", "bar.txt", "baz.txt"
        self.fps = [self.fp1, self.fp2, self.fp3]

        # Parameters reused by many of the tests.
        self.num_perms = 999
        self.comment = "# A sample comment.\n"
        self.alpha = 0.01
        self.tail_type = 'greater'
        self.sample_id_map = {
            'z1': 's1', 'z2': 's2', 'z3': 's3',
            's1': 's1', 's2': 's2', 's3': 's3',
        }
def main():
    """Write the two input distance matrices restricted to compatible samples.

    Exactly two input and two output filepaths (comma-separated) are
    required. The inputs are passed through
    make_compatible_distance_matrices and each result is written to the
    corresponding output filepath.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dm_fps = opts.input_dms.split(',')
    output_dm_fps = opts.output_dms.split(',')
    if len(input_dm_fps) != 2 or len(output_dm_fps) != 2:
        option_parser.error("You must provide exactly two input and output "
            "distance matrix filepaths.")

    # Parse each input once (the files used to be parsed twice, with the
    # first result discarded) and close the handle as soon as it is read.
    parsed_dms = []
    for dm_fp in input_dm_fps:
        with open(dm_fp, 'U') as dm_f:
            parsed_dms.append(parse_distmat(dm_f))

    (dm1_labels, dm1), (dm2_labels, dm2) = make_compatible_distance_matrices(
        parsed_dms[0], parsed_dms[1])
    assert (dm1_labels == dm2_labels), "The order of sample IDs is not the " +\
        "same for the two matrices."

    for out_fp, labels, dm in ((output_dm_fps[0], dm1_labels, dm1),
                               (output_dm_fps[1], dm2_labels, dm2)):
        with open(out_fp, 'w') as out_f:
            out_f.write(format_distance_matrix(labels, dm))
 def test_filter_samples_from_distance_matrix(self):
     """filter_samples_from_distance_matrix gives the expected matrices"""
     # First sample-id list is checked against expected_dm1a.
     parsed = parse_distmat(self.input_dm1)
     observed = filter_samples_from_distance_matrix(parsed,
                                                    ["GHI blah","XYZ"])
     self.assertEqual(observed, expected_dm1a)

     # Second sample-id list is checked against expected_dm1b; the input is
     # re-parsed so both calls start from a fresh matrix.
     parsed = parse_distmat(self.input_dm1)
     observed = filter_samples_from_distance_matrix(parsed, ["GHI","DEF"])
     self.assertEqual(observed, expected_dm1b)
def main():
    """Run the selected Mantel-family test and write its results.

    opts.method selects 'mantel', 'partial_mantel' or 'mantel_corr'. The
    result text is written to a method-specific file inside
    opts.output_dir; for 'mantel_corr' each returned correlogram is also
    saved as an image in that directory.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except Exception:
        # Not a bare except: Ctrl-C and SystemExit must still propagate.
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    def load_distmat(fp):
        # Parse and close promptly instead of leaking the handle.
        with open(fp, 'U') as dm_f:
            return parse_distmat(dm_f)

    sample_id_map = None
    if opts.sample_id_map_fp:
        with open(opts.sample_id_map_fp, "U") as map_f:
            sample_id_map = dict(
                (k, v[0]) for k, v in fields_to_dict(map_f).items())
    input_dm_fps = opts.input_dms
    distmats = [load_distmat(dm_fp) for dm_fp in input_dm_fps]

    # Only the correlogram method produces figures.
    correlogram_fps, correlograms = [], []
    if opts.method == 'mantel':
        results_fn = 'mantel_results.txt'
        result_str = run_mantel_test('mantel',
                                     input_dm_fps,
                                     distmats,
                                     opts.num_permutations,
                                     opts.tail_type,
                                     comment_mantel_pmantel,
                                     sample_id_map=sample_id_map)
    elif opts.method == 'partial_mantel':
        results_fn = 'partial_mantel_results.txt'
        result_str = run_mantel_test('partial_mantel',
                                     input_dm_fps,
                                     distmats,
                                     opts.num_permutations,
                                     opts.tail_type,
                                     comment_mantel_pmantel,
                                     control_dm_fp=opts.control_dm,
                                     control_dm=load_distmat(opts.control_dm),
                                     sample_id_map=sample_id_map)
    elif opts.method == 'mantel_corr':
        results_fn = 'mantel_correlogram_results.txt'
        result_str, correlogram_fps, correlograms = run_mantel_correlogram(
            input_dm_fps,
            distmats,
            opts.num_permutations,
            comment_corr,
            opts.alpha,
            sample_id_map=sample_id_map,
            variable_size_distance_classes=opts.variable_size_distance_classes)
    else:
        # Previously this fell through to a NameError on the unbound
        # output file handle.
        option_parser.error("Unrecognized method '%s'." % opts.method)

    with open(path.join(opts.output_dir, results_fn), 'w') as output_f:
        output_f.write(result_str)

    for corr_fp, corr in zip(correlogram_fps, correlograms):
        corr.savefig(path.join(opts.output_dir, corr_fp + opts.image_type),
                     format=opts.image_type)
 def test_filter_samples_from_distance_matrix_negate(self):
     """filter_samples_from_distance_matrix gives expected matrices w negate=True"""
     # With negate=True this id list must yield expected_dm1a.
     parsed = parse_distmat(self.input_dm1)
     observed = filter_samples_from_distance_matrix(
         parsed, ["ABC blah","DEF"], negate=True)
     self.assertEqual(observed, expected_dm1a)

     # With negate=True this id list must yield expected_dm1b.
     parsed = parse_distmat(self.input_dm1)
     observed = filter_samples_from_distance_matrix(
         parsed, ["ABC","XYZ"], negate=True)
     self.assertEqual(observed, expected_dm1b)
def main():
    """Run the selected Mantel-family test and write its results.

    opts.method selects "mantel", "partial_mantel" or "mantel_corr". The
    result text is written to a method-specific file inside
    opts.output_dir; for "mantel_corr" each returned correlogram is also
    saved as an image in that directory.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except Exception:
        # Not a bare except: Ctrl-C and SystemExit must still propagate.
        option_parser.error("Could not create or access output directory " "specified with the -o option.")

    def load_distmat(fp):
        # Parse and close promptly instead of leaking the handle.
        with open(fp, "U") as dm_f:
            return parse_distmat(dm_f)

    sample_id_map = None
    if opts.sample_id_map_fp:
        with open(opts.sample_id_map_fp, "U") as map_f:
            sample_id_map = dict((k, v[0]) for k, v in fields_to_dict(map_f).items())
    input_dm_fps = opts.input_dms
    distmats = [load_distmat(dm_fp) for dm_fp in input_dm_fps]

    # Only the correlogram method produces figures.
    correlogram_fps, correlograms = [], []
    if opts.method == "mantel":
        results_fn = "mantel_results.txt"
        result_str = run_mantel_test(
            "mantel",
            input_dm_fps,
            distmats,
            opts.num_permutations,
            opts.tail_type,
            comment_mantel_pmantel,
            sample_id_map=sample_id_map,
        )
    elif opts.method == "partial_mantel":
        results_fn = "partial_mantel_results.txt"
        result_str = run_mantel_test(
            "partial_mantel",
            input_dm_fps,
            distmats,
            opts.num_permutations,
            opts.tail_type,
            comment_mantel_pmantel,
            control_dm_fp=opts.control_dm,
            control_dm=load_distmat(opts.control_dm),
            sample_id_map=sample_id_map,
        )
    elif opts.method == "mantel_corr":
        results_fn = "mantel_correlogram_results.txt"
        result_str, correlogram_fps, correlograms = run_mantel_correlogram(
            input_dm_fps, distmats, opts.num_permutations, comment_corr, opts.alpha, sample_id_map=sample_id_map
        )
    else:
        # Previously this fell through to a NameError on the unbound
        # output file handle.
        option_parser.error("Unrecognized method '%s'." % opts.method)

    with open(path.join(opts.output_dir, results_fn), "w") as output_f:
        output_f.write(result_str)

    for corr_fp, corr in zip(correlogram_fps, correlograms):
        corr.savefig(path.join(opts.output_dir, corr_fp + opts.image_type), format=opts.image_type)
def main():
    """Fit and plot a semivariogram from two distance matrices.

    opts.binning, when given, must look like
    [increment1,top_limit1][increment2,top_limit2]; it is parsed into a
    list of [increment, top_limit] float pairs passed to fit_semivariogram.
    The fitted curve and the raw points are plotted and saved to
    opts.output_path.

    Raises:
        ValueError: if the binning string is malformed or a range fails
            validation.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ranges = []
    if opts.binning is not None:
        binning = opts.binning
        # simple ranges format validation: one '[', one ']' and one ','
        # per range
        n_open = binning.count('[')
        if n_open != binning.count(']') or n_open != binning.count(','):
            raise ValueError(
                "The binning input has an error: '%s'; " % binning +
                "\nthe format should be [increment1,top_limit1][increment2,top_limit2]")
        # splitting in ranges, then removing the left [ and the right ]
        rgn_txt = binning.split('][')
        rgn_txt[0] = rgn_txt[0][1:]
        rgn_txt[-1] = rgn_txt[-1][:-1]

        prev_top_limit = 0
        last_idx = len(rgn_txt) - 1
        for i, r in enumerate(rgn_txt):
            # A real list (not map()) so len() and indexing always work.
            values = [float(v) for v in r.split(',')]
            if len(values) != 2:
                raise ValueError("All ranges must have only 2 values: [%s]" % r)
            # As before, the final range is exempt from the checks below.
            # NOTE(review): presumably so its top limit can be a sentinel
            # value -- confirm.
            if i != last_idx:
                if values[0] > values[1]:
                    raise ValueError("The bin value can't be greater than the max value: [%s]" % r)
                # The old code compared the whole list with 0, which never
                # triggered. Since values[0] <= values[1] here, checking
                # the first value covers both.
                if values[0] < 0:
                    raise ValueError("This value can not be negative: [%s]" % r)
                if prev_top_limit > values[1]:
                    raise ValueError("This value can not smaller than the previous one: [%s]" % r)
                prev_top_limit = values[1]

            ranges.append(values)

    with open(opts.input_path_x, 'U') as x_f:
        x_samples, x_distmtx = parse_distmat(x_f)
    with open(opts.input_path_y, 'U') as y_f:
        y_samples, y_distmtx = parse_distmat(y_f)
    (x_val, y_val, x_fit, y_fit) = fit_semivariogram(x_distmtx, y_distmtx, opts.model, ranges)

    plot(x_val, y_val, 'o', color="white")
    plot(x_fit, y_fit, linewidth=2.0, color="blue")

    xlabel('Distance (m)')
    ylabel('Community Dissimilarity')
    title('Semivariogram (%s)' % opts.model)

    savefig(opts.output_path)
Beispiel #10
0
def main():
    """Report the between/within dissimilarity ratio for a mapping category.

    Writes to opts.output_path when given, otherwise to stdout. With
    opts.short only the ratio is written; otherwise the ratio is followed
    by mean, standard deviation and count of the between- and
    within-cluster dissimilarities.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    with open(opts.input_path,'U') as dm_f:
        dists = parse_distmat(dm_f)
    with open(opts.map,'U') as map_f:
        map_data = parse_mapping_file_to_dict(map_f)
    diff_dists, same_dists = clust_qual_ratio(dists, map_data, opts.category)

    ratio = numpy.mean(diff_dists)/numpy.mean(same_dists)
    if opts.short:
        report_lines = [str(ratio)]
    else:
        report_lines = [
            "dissimilarity ratio between/within (large for clustered data):",
            str(ratio),
            "dissimilarities between clusters: mean, std, num:",
            '\t'.join(map(str,[numpy.mean(diff_dists), numpy.std(diff_dists),
             len(diff_dists)])),
            "dissimilarities within clusters: mean, std, num:",
            '\t'.join(map(str,[numpy.mean(same_dists), numpy.std(same_dists),
             len(same_dists)])),
        ]
    report = '\n'.join(report_lines) + '\n'

    # The output file is opened only once the report exists, so a failure
    # above no longer leaves a truncated file behind. stdout is never closed.
    if opts.output_path is not None:
        with open(opts.output_path,'w') as outf:
            outf.write(report)
    else:
        sys.stdout.write(report)
Beispiel #11
0
    def test_get_adjacent_distances(self):
        """get_adjacent_distances returns distances between consecutive sample ids"""
        dm_str = ["\ts1\ts2\ts3", "s1\t0\t2\t4", "s2\t2\t0\t3.2", "s3\t4\t3.2\t0"]
        header, matrix = parse_distmat(dm_str)

        # error cases: fewer than 2 valid sample ids
        for too_few_ids in ([], ["s1"], ["s0", "s1"], ["s1", "s4"]):
            self.assertRaises(ValueError, get_adjacent_distances, header, matrix, too_few_ids)

        # (sample ids, expected (distances, id pairs)), checked in order
        cases = [
            # one pair of valid distances
            (["s1", "s2"], ([2], [("s1", "s2")])),
            (["s1", "s1"], ([0], [("s1", "s1")])),
            (["s1", "s3"], ([4], [("s1", "s3")])),
            (["s2", "s3"], ([3.2], [("s2", "s3")])),
            # multiple valid distances
            (["s1", "s2", "s3"], ([2, 3.2], [("s1", "s2"), ("s2", "s3")])),
            (["s1", "s3", "s2", "s1"], ([4, 3.2, 2], [("s1", "s3"), ("s3", "s2"), ("s2", "s1")])),
            # mixed valid and invalid ids: the invalid ones are ignored
            (
                ["s1", "s3", "s4", "s5", "s6", "s2", "s1"],
                ([4, 3.2, 2], [("s1", "s3"), ("s3", "s2"), ("s2", "s1")]),
            ),
        ]
        for sample_ids, expected in cases:
            self.assertEqual(get_adjacent_distances(header, matrix, sample_ids), expected)

        # strict=True results in missing sample ids raising an error
        ids_with_missing = ["s1", "s3", "s4", "s5", "s6", "s2", "s1"]
        self.assertRaises(ValueError, get_adjacent_distances, header, matrix, ids_with_missing, strict=True)
def main():
    """Filter a distance matrix down to a chosen set of samples.

    The sample ids come from an OTU table, a sample id file, or a mapping
    file plus valid states, in that order of precedence. The filtered
    matrix is written to opts.output_distance_matrix.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.otu_table_fp:
        otu_table = load_table(opts.otu_table_fp)
        samples_to_keep = otu_table.ids()
    elif opts.sample_id_fp:
        with open(opts.sample_id_fp, 'U') as sample_id_f:
            samples_to_keep = \
                get_seqs_to_keep_lookup_from_seq_id_file(sample_id_f)
    elif opts.mapping_fp and opts.valid_states:
        try:
            with open(opts.mapping_fp, 'U') as mapping_f:
                samples_to_keep = sample_ids_from_metadata_description(
                    mapping_f, opts.valid_states)
        except ValueError as e:
            # str(e) rather than the deprecated e.message attribute.
            option_parser.error(str(e))
    else:
        option_parser.error('must pass either --sample_id_fp, -t, or -m and '
                            '-s')
    # note that negate gets a little weird here. The function we're calling
    # removes the specified samples from the distance matrix, but the other
    # QIIME filter scripts keep these samples specified.  So, the interface of
    # this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    with open(opts.input_distance_matrix, 'U') as input_f:
        d = filter_samples_from_distance_matrix(
            parse_distmat(input_f),
            samples_to_keep,
            negate=not opts.negate)

    # Open the output only after the options are validated and the result
    # is computed, so a bad invocation does not truncate an existing file.
    with open(opts.output_distance_matrix, 'w') as output_f:
        output_f.write(d)
Beispiel #13
0
def nmds(file, dimensions=2):
    """Run NMDS on a distance matrix and return the formatted coordinates.

    file: distance matrix lines, handed to parse_distmat.
    dimensions: forwarded to nmds_module.NMDS as `dimension`.
    Returns whatever format_nmds_coords builds from the sample ids, the
    fitted points and the stress.
    """
    sample_ids, dist_matrix = parse_distmat(file)
    fit = nmds_module.NMDS(dist_matrix, verbosity=0, dimension=dimensions)
    points = fit.getPoints()
    return format_nmds_coords(sample_ids, points, fit.getStress())
Beispiel #14
0
def main():
    """Report the between/within dissimilarity ratio for a mapping category.

    Writes to opts.output_path when given, otherwise to stdout. With
    opts.short only the ratio is written; otherwise the ratio is followed
    by mean, standard deviation and count of the between- and
    within-cluster dissimilarities.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    with open(opts.input_path, 'U') as dm_f:
        dists = parse_distmat(dm_f)
    with open(opts.map, 'U') as map_f:
        map_data = parse_mapping_file_to_dict(map_f)
    diff_dists, same_dists = clust_qual_ratio(dists, map_data, opts.category)

    def summary(values):
        # mean, std and count of one group, tab-separated
        return '\t'.join(
            map(str, [numpy.mean(values),
                      numpy.std(values),
                      len(values)]))

    ratio = numpy.mean(diff_dists) / numpy.mean(same_dists)
    if opts.short:
        report_lines = [str(ratio)]
    else:
        report_lines = [
            "dissimilarity ratio between/within (large for clustered data):",
            str(ratio),
            "dissimilarities between clusters: mean, std, num:",
            summary(diff_dists),
            "dissimilarities within clusters: mean, std, num:",
            summary(same_dists),
        ]
    report = '\n'.join(report_lines) + '\n'

    # The output file is opened only once the report exists, so a failure
    # above no longer leaves a truncated file behind. stdout is never closed.
    if opts.output_path is not None:
        with open(opts.output_path, 'w') as outf:
            outf.write(report)
    else:
        sys.stdout.write(report)
Beispiel #15
0
def nmds(file,dimensions=2):
    """Compute NMDS coordinates for the distance matrix in `file`.

    The matrix is parsed with parse_distmat, fitted with nmds_module.NMDS
    using `dimensions` dimensions, and the sample ids, points and stress
    are passed to format_nmds_coords, whose result is returned.
    """
    labels, matrix = parse_distmat(file)
    model = nmds_module.NMDS(matrix, verbosity=0, dimension=dimensions)
    coords = model.getPoints()
    stress_value = model.getStress()
    return format_nmds_coords(labels, coords, stress_value)
Beispiel #16
0
def main():
    """Filter a distance matrix down to a chosen set of samples.

    The sample ids come from an OTU table, a sample id file, or a mapping
    file plus valid states, in that order of precedence. The filtered
    matrix is written to opts.output_distance_matrix.
    """
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.otu_table_fp:
        with open(opts.otu_table_fp, 'U') as otu_table_f:
            otu_table = parse_biom_table(otu_table_f)
        samples_to_keep = otu_table.SampleIds
    elif opts.sample_id_fp:
        with open(opts.sample_id_fp, 'U') as sample_id_f:
            samples_to_keep = \
             get_seqs_to_keep_lookup_from_seq_id_file(sample_id_f)
    elif opts.mapping_fp and opts.valid_states:
        with open(opts.mapping_fp, 'U') as mapping_f:
            samples_to_keep = sample_ids_from_metadata_description(
                mapping_f, opts.valid_states)
    else:
        option_parser.error(
            'must pass either --sample_id_fp, -t, or -m and -s')
    # note that negate gets a little weird here. The function we're calling removes the specified
    # samples from the distance matrix, but the other QIIME filter scripts keep these samples specified.
    # So, the interface of this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    with open(opts.input_distance_matrix, 'U') as input_f:
        d = filter_samples_from_distance_matrix(parse_distmat(input_f),
                                                samples_to_keep,
                                                negate=not opts.negate)

    # Open the output only after the options are validated and the result
    # is computed, so a bad invocation does not truncate an existing file.
    with open(opts.output_distance_matrix, 'w') as output_f:
        output_f.write(d)
Beispiel #17
0
    def test_shuffle_dm(self):
        """shuffle_dm keeps the matrix data and reorders the labels."""
        expected_labels, expected_dm = parse_distmat(self.dm_f1)

        # Shuffling is random, so try repeatedly and require that at least
        # one attempt passes the permutation check on the labels.
        saw_permutation = False
        for _ in range(20):
            shuffled_lines = shuffle_dm(self.dm_f1).split('\n')
            observed_labels, observed_dm = parse_distmat(shuffled_lines)
            self.assertFloatEqual(observed_dm, expected_dm)

            try:
                self.assertIsPermutation(observed_labels, expected_labels)
                saw_permutation = True
            except AssertionError:
                # This attempt did not pass the check; keep trying.
                pass

        self.assertTrue(saw_permutation)
Beispiel #18
0
    def test_subset_dm(self):
        """subset_dm picks the requested number of samples."""
        full = parse_distmat(self.dm_f1)
        full_labels = full[0]

        # Asking for 3 samples must leave the matrix unchanged.
        unchanged = parse_distmat(subset_dm(self.dm_f1, 3).split('\n'))
        self.assertFloatEqual(unchanged, full)

        # Smaller subsets: right size, and every label comes from the
        # original matrix.
        for size in (1, 2):
            picked_labels, picked_dm = parse_distmat(
                    subset_dm(self.dm_f1, size).split('\n'))
            self.assertEqual(len(picked_labels), size)
            for label in picked_labels:
                self.assertTrue(label in full_labels)

        # Asking for 4 samples is an error.
        self.assertRaises(ValueError, subset_dm, self.dm_f1, 4)
Beispiel #19
0
    def test_subset_groups(self):
        """subset_groups limits the number of samples kept per group."""
        full = parse_distmat(self.dm_f1)

        # Group sizes of 2 and 3 must not filter anything out.
        for group_size in (2, 3):
            subset_text = subset_groups(
                    self.dm_f1, self.map_f1, 'Category', group_size)
            self.assertFloatEqual(parse_distmat(subset_text.split('\n')), full)

        # Pick groups of size 1.
        subset_text = subset_groups(self.dm_f1, self.map_f1, 'Category', 1)
        kept_labels, kept_dm = parse_distmat(subset_text.split('\n'))
        self.assertTrue('S2' in kept_labels)

        # XOR: either S1 or S3 should be kept, but not both.
        has_s1 = 'S1' in kept_labels
        has_s3 = 'S3' in kept_labels
        self.assertTrue(has_s1 != has_s3)
def main():
    """Shuffle the labels of a distance matrix and write the result.

    Only the labels are shuffled; the matrix data is written back as parsed.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sample_labels, matrix = parse_distmat(
        open(opts.input_distance_matrix, 'U'))
    # In-place shuffle of the label list.
    shuffle(sample_labels)

    out_f = open(opts.output_distance_matrix, 'w')
    shuffled_text = format_distance_matrix(sample_labels, matrix)
    out_f.write(shuffled_text)
    out_f.close()
Beispiel #21
0
    def test_parse_distmat(self):
        """parse_distmat should read distmat correctly"""
        lines = """\ta\tb\tc
a\t0\t1\t2
b\t1\t0\t3.5
c\t1\t3.5\t0
""".splitlines()
        exp = (['a','b','c'], array([[0,1,2],[1,0,3.5],[1,3.5,0]]))
        obs = parse_distmat(lines)
        self.assertEqual(obs, exp)
Beispiel #22
0
    def test_parse_distmat(self):
        """parse_distmat should read distmat correctly"""
        lines = """\ta\tb\tc
a\t0\t1\t2
b\t1\t0\t3.5
c\t1\t3.5\t0
""".splitlines()
        exp = (['a', 'b', 'c'], array([[0, 1, 2], [1, 0, 3.5], [1, 3.5, 0]]))
        obs = parse_distmat(lines)
        self.assertEqual(obs, exp)
Beispiel #23
0
def group_distances(mapping_file,
                    dmatrix_file,
                    fields,
                    dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    mapping_file, dmatrix_file: filepaths of the mapping file and the
        distance matrix.
    fields: mapping fields to group by; must not be an empty list.
    dir_prefix, subdir_prefix: forwarded to write_distance_files; the
        subdirectory prefix gets '_single' or '_pairs' appended.

    Returns (single_field, paired_field, distance_matrix), where the two
    dicts are keyed by field name and by 'fieldi_to_fieldj'.

    Raises ValueError if fields is an empty list.
    """
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, _ = parse_mapping_file(mapping_f)
    # The grouping helpers receive the mapping with its header as first row.
    header = [header]
    header.extend(mapping)
    mapping = header

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    if fields == []:
        raise ValueError(
            'Since no fields were defined and the values within your fields are either all the same or all unique, a field was not chosen for analysis. Please define a field to analyse.'
        )

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        field_name = field.replace('#', '')
        single_field[field_name] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    paired_field_for_writing = defaultdict(dict)
    for i in range(len(fields)):
        for j in range(i, len(fields)):
            fieldi = fields[i]
            fieldj = fields[j]
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            pair_name = fieldi + '_to_' + fieldj
            paired_field[pair_name] = data
            # This used to be keyed with the stale `field` variable left over
            # from the loop above, so every pair sharing fieldi overwrote the
            # previous one under the name of the last field.
            paired_field_for_writing[pair_name] = data

    write_distance_files(group_distance_dict=paired_field_for_writing,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def pcoa(file):
    """Run principal coordinates analysis on a distance matrix.

    file: distance matrix lines, handed to parse_distmat.
    Returns the output of format_coords for the sample ids, coordinates,
    eigenvalues and percentages, with axes ordered by descending percentage.
    """
    sample_ids, dist_matrix = parse_distmat(file)
    # axis_coords: each row is an axis
    axis_coords, eigenvalues = ms.principal_coordinates_analysis(dist_matrix)

    # Share of each axis, as a percentage of the summed absolute eigenvalues.
    percents = (numpy.abs(eigenvalues) / sum(numpy.abs(eigenvalues))) * 100

    # Reorder everything from the largest percentage to the smallest.
    order = percents.argsort()[::-1]
    axis_coords, eigenvalues, percents = (
        axis_coords[order], eigenvalues[order], percents[order])

    return format_coords(sample_ids, axis_coords.T, eigenvalues, percents)
def isMatrix(fstr):
    """Return True if `fstr` parses as a distance matrix, False otherwise.

    fstr: the candidate matrix as a single string; it is split into lines
        and handed to parse_distmat.

    Any ordinary exception raised while parsing counts as "not a matrix",
    as does a parse result whose first element is None.
    """
    try:
        labels = parse_distmat(fstr.splitlines())[0]
    except Exception:
        # Deliberately broad: any parse failure means the input is not a
        # matrix. Unlike a bare except, Ctrl-C and SystemExit still propagate.
        return False

    # Identity check: `== None` can be overridden by the compared object.
    return labels is not None
Beispiel #26
0
def pcoa(file):
    """Compute PCoA coordinates for the distance matrix in `file`.

    The matrix is parsed with parse_distmat and passed to
    ms.principal_coordinates_analysis; axes are then sorted by their
    percentage of the summed absolute eigenvalues, largest first, before
    being handed to format_coords, whose result is returned.
    """
    labels, matrix = parse_distmat(file)
    # Each row of `axes` is one axis.
    axes, eigs = ms.principal_coordinates_analysis(matrix)

    percent_by_axis = (numpy.abs(eigs) / sum(numpy.abs(eigs))) * 100
    descending = percent_by_axis.argsort()[::-1]

    sorted_axes = axes[descending]
    sorted_eigs = eigs[descending]
    sorted_percents = percent_by_axis[descending]

    return format_coords(labels, sorted_axes.T, sorted_eigs, sorted_percents)
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    mapping_file, dmatrix_file: filepaths of the mapping file and the
        distance matrix.
    fields: mapping fields to group by; must not be an empty list.
    dir_prefix, subdir_prefix: forwarded to write_distance_files; the
        subdirectory prefix gets '_single' or '_pairs' appended.

    Returns (single_field, paired_field, distance_matrix), where the two
    dicts are keyed by field name and by 'fieldi_to_fieldj'.

    Raises ValueError if fields is an empty list.
    """
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, _ = parse_mapping_file(mapping_f)
    # The grouping helpers receive the mapping with its header as first row.
    header = [header]
    header.extend(mapping)
    mapping = header

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    if fields == []:
        raise ValueError(
            'Since no fields were defined and the values within your fields are either all the same or all unique, a field was not chosen for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        field_name = field.replace('#', '')
        single_field[field_name] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix, subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    paired_field_for_writing = defaultdict(dict)
    for i in range(len(fields)):
        for j in range(i, len(fields)):
            fieldi = fields[i]
            fieldj = fields[j]
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(
                distance_header,
                distance_matrix,
                groups)
            pair_name = fieldi + '_to_' + fieldj
            paired_field[pair_name] = data
            # This used to be keyed with the stale `field` variable left over
            # from the loop above, so every pair sharing fieldi overwrote the
            # previous one under the name of the last field.
            paired_field_for_writing[pair_name] = data

    write_distance_files(group_distance_dict=paired_field_for_writing,
                         dir_prefix=dir_prefix, subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
Beispiel #28
0
 def test_get_adjacent_distances(self):
     """get_adjacent_distances returns distances between consecutive sample ids"""
     dm_str = ["\ts1\ts2\ts3", "s1\t0\t2\t4", "s2\t2\t0\t3.2",
                     "s3\t4\t3.2\t0"]
     header, matrix = parse_distmat(dm_str)

     # error cases: fewer than 2 valid sample ids
     for too_few_ids in ([], ['s1'], ['s0','s1'], ['s1','s4']):
         self.assertRaises(ValueError,
                           get_adjacent_distances, header, matrix,
                           too_few_ids)

     # (sample ids, expected (distances, id pairs)), checked in order
     cases = [
         # one pair of valid distances
         (['s1','s2'], ([2],[('s1','s2')])),
         (['s1','s1'], ([0],[('s1','s1')])),
         (['s1','s3'], ([4],[('s1','s3')])),
         (['s2','s3'], ([3.2],[('s2','s3')])),
         # multiple valid distances
         (['s1','s2','s3'], ([2,3.2],[('s1','s2'),('s2','s3')])),
         (['s1','s3','s2','s1'],
          ([4,3.2,2],[('s1','s3'),('s3','s2'),('s2','s1')])),
         # mixed valid and invalid ids: the invalid ones are ignored
         (['s1','s3','s4','s5','s6','s2','s1'],
          ([4,3.2,2],[('s1','s3'),('s3','s2'),('s2','s1')])),
     ]
     for sample_ids, expected in cases:
         observed = get_adjacent_distances(header, matrix, sample_ids)
         self.assertEqual(observed, expected)

     # strict=True results in missing sample ids raising an error
     self.assertRaises(ValueError, get_adjacent_distances,
                       header,
                       matrix,
                       ['s1','s3','s4','s5','s6','s2','s1'],
                       strict=True)
def main():
    """Print a per-subject percentage table to stdout.

    Reads the distance matrix and the mapping file named by the command-line
    options, passes them to ratios_for_category together with the subject
    column name, and prints one tab-separated "subject, percent" row per
    entry of the returned mapping. The percentage is the first element of
    each value divided by the second, times 100.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    subject_column = opts.subject
    dm_path = opts.matrix_fp
    map_path = opts.mapping_fp

    dm_labels, dm_values = parse_distmat(open(dm_path, 'U'))
    map_rows, map_columns, _ = parse_mapping_file(open(map_path, 'U'))

    ratios = ratios_for_category(dm_labels, dm_values, map_columns, map_rows,
                                 subject_column)

    print('Subject\tPercent')
    for subject, pair in ratios.iteritems():
        numerator, denominator = pair[0], pair[1]
        print('%s\t%f' % (subject, (numerator / denominator) * 100))
Beispiel #30
0
def choose_gradient_subsets(dm_f, map_f, gradient, subset_sizes, num_subsets):
    """Randomly pick sample-ID subsets spread along a numeric gradient.

    dm_f: distance matrix input, handed to parse_distmat
    map_f: mapping file input, handed to parse_mapping_file_to_dict
    gradient: mapping-file column whose values are converted with float()
    subset_sizes: iterable of subset sizes to generate
    num_subsets: how many subsets to draw for each size

    Samples present in both inputs are ordered by gradient value and cut
    into one bin per requested sample; each subset takes one random sample
    from each bin. Returns a flat list of subsets (lists of sample IDs),
    num_subsets per entry of subset_sizes, in that order.
    """
    metadata_by_id, _ = parse_mapping_file_to_dict(map_f)
    dm_labels, dm_data = parse_distmat(dm_f)

    # (sample ID, gradient value) pairs for IDs found in both inputs,
    # ordered by gradient value.
    ordered = sorted(
        ((sid, float(md[gradient]))
         for sid, md in metadata_by_id.items() if sid in dm_labels),
        key=lambda pair: pair[1])
    n_samples = len(ordered)

    chosen_subsets = []
    for size in subset_sizes:
        # Adapted from http://stackoverflow.com/a/9873935
        # size + 1 edges give `size` intervals to draw from.
        n_edges = size + 1
        edges = [int(ceil(k * n_samples / n_edges)) for k in range(n_edges)]
        last_bin = len(edges) - 2

        for _ in range(num_subsets):
            picks = []
            for bin_no, (lo, hi) in enumerate(zip(edges, edges[1:])):
                if bin_no != last_bin:
                    # The original author notes randint is inclusive on both
                    # ends, so stop one short of the next bin's first index
                    # to avoid drawing the same sample from two bins.
                    upper = hi - 1
                elif hi < n_samples:
                    # Last bin: its upper edge itself may be drawn.
                    upper = hi
                else:
                    upper = hi - 1
                picks.append(ordered[randint(lo, upper)][0])

            assert len(picks) == size, "%d != %d" % (len(picks), size)
            chosen_subsets.append(picks)

    return chosen_subsets
def compare_treatment_dists(chosen_samples, category, mf, bt, m, tr):
    """Calculate avg between, within, and to-all distances for chosen_samples.

    Notes:
     The chosen samples are grouped by treatment_covering (presumably by
     their value under category in the mapping file) and the groups are
     then compared. "Between" covers each pair of groups, "within" a single
     group, and "to-all" a group against the rest of the distance matrix.
    Inputs:
     chosen_samples - list of ids. e.g. [sam1,sam7,sam3,sam6,..]
     category - str, field in mf.
     mf - parsed mapping file, dict of sample_id:metadata.
     bt - biom table containing at least all samples contained in the mf.
     m - str, metric to use for the beta diversity calculation.
     tr - tree object, containing at least all nodes in bt.
    Output:
     output_marginals - the group keys, i.e. ['HF','LF'].
     bt_wi_m - 2d array; entry [i][j] with i < j holds the value returned by
      between_treatments_dist for groups i and j, the diagonal holds the
      within-group value, and the lower triangle stays zero.
     bt_wi_se - second value of the same calls, laid out like bt_wi_m.
     ta_m_se - 2d array with one row per group and 2 cols holding the two
      values returned by treatment_dist for that group.
    """
    # Sample-by-sample distance matrix, parsed into labels and data.
    samples, data = parse_distmat(single_object_beta(bt, m, tr))
    groups = treatment_covering(chosen_samples, category, mf)
    output_marginals = groups.keys()
    n_groups = len(output_marginals)

    # Output arrays: between/within means, between/within ses, to-all stats.
    bt_wi_m = zeros((n_groups, n_groups))
    bt_wi_se = zeros((n_groups, n_groups))
    ta_m_se = zeros((n_groups, 2))

    # Within-group and to-all values, one row per group.
    for row, treatment in enumerate(output_marginals):
        members = groups[treatment]
        ta_m_se[row][0], ta_m_se[row][1] = treatment_dist(members, samples,
                                                          data)
        bt_wi_m[row][row], bt_wi_se[row][row] = within_treatment_dist(
            members, samples, data)

    # Between-group values fill the upper triangle.
    for (i, first), (j, second) in combinations(enumerate(output_marginals),
                                                2):
        bt_wi_m[i][j], bt_wi_se[i][j] = between_treatments_dist(
            groups[first], groups[second], samples, data)

    return output_marginals, bt_wi_m, bt_wi_se, ta_m_se
def filter_samples_from_distance_matrix(dm, samples_to_discard, negate=False):
    """ Remove specified samples from distance matrix

        dm: (sample_ids, dm_data) tuple, as returned from
         qiime.parse.parse_distmat; or a file handle that can be passed
         to qiime.parse.parse_distmat
        samples_to_discard: iterable of strings; only the first
         whitespace-delimited token of each entry is used as the sample id
        negate: if True, keep only the listed samples instead of
         discarding them

        Returns (new_sample_ids, new_dm_data): the surviving ids in their
         original order and a numpy array holding the matching rows and
         columns of the input matrix.
    """
    # Imported after the docstring so the string above really is the
    # function's docstring.
    from numpy import array

    try:
        sample_ids, dm_data = dm
    except ValueError:
        # input was provided as something parse_distmat must read
        sample_ids, dm_data = parse_distmat(dm)

    listed = set(e.split()[0] for e in samples_to_discard)

    def keep_sample(s):
        return (s in listed) if negate else (s not in listed)

    # First pass: drop unwanted rows.
    kept_rows = []
    new_sample_ids = []
    for row, sample_id in zip(dm_data, sample_ids):
        if keep_sample(sample_id):
            kept_rows.append(row)
            new_sample_ids.append(sample_id)

    # Second pass: after transposing, columns are rows, so the same
    # predicate drops the unwanted columns.
    columns = array(kept_rows).transpose()
    kept_columns = [col for col, sample_id in zip(columns, sample_ids)
                    if keep_sample(sample_id)]

    return (new_sample_ids, array(kept_columns).transpose())
def single_file_nj(input_file, output_file):
    """Build a tree with nj() from one distance matrix file.

    input_file: path of the distance matrix, read with parse_distmat
    output_file: path that receives the tree's Newick string (with
     distances)
    """
    # read in dist matrix; `with` closes the handle even if parsing fails
    with open(input_file, 'U') as f:
        headers, data = parse_distmat(f)

    # do nj
    # Both loops span every header, so each ordered pair -- (a, b) as well
    # as (b, a) -- ends up in the dict.
    distdict = {}
    for i, row_label in enumerate(headers):
        for j, col_label in enumerate(headers):
            distdict[(row_label, col_label)] = data[i, j]

    tree = nj(distdict)

    # write output
    with open(output_file, 'w') as f:
        f.write(tree.getNewick(with_distances=True))
Beispiel #34
0
def single_file_upgma(input_file, output_file):
    """Cluster one distance matrix file with UPGMA_cluster.

    input_file: path of the distance matrix, read with parse_distmat
    output_file: path that receives the resulting tree's Newick string
     (with distances)
    """
    # read in dist matrix; `with` closes the handle even if parsing fails
    with open(input_file, 'U') as f:
        headers, data = parse_distmat(f)

    # do upgma
    # A real list (not a lazy map object) so the nodes can be indexed and
    # reused regardless of interpreter version.
    nodes = [PhyloNode(h) for h in headers]
    BIG = 1e305
    # Work on a copy with the diagonal set to BIG, the same value passed to
    # UPGMA_cluster -- presumably so self-distances are never picked as the
    # closest pair; confirm against UPGMA_cluster.
    U = data.copy()
    for i in range(len(U)):
        U[i, i] = BIG
    c = UPGMA_cluster(U, nodes, BIG)

    # write output
    with open(output_file, 'w') as f:
        f.write(c.getNewick(with_distances=True))
def single_file_nj(input_file, output_file):
    """Read a distance matrix file, run nj() on it and save the tree.

    input_file: path of the distance matrix, read with parse_distmat
    output_file: path the Newick string (with distances) is written to
    """
    # read in dist matrix (handle is closed even when parse_distmat raises)
    with open(input_file, 'U') as dm_f:
        headers, data = parse_distmat(dm_f)

    # do nj
    # i and j both cover the full index range, so the (j, i) entry for every
    # (i, j) is filled in as well.
    distdict = {}
    for i in range(len(headers)):
        for j in range(len(headers)):
            distdict[(headers[i], headers[j])] = data[i, j]

    tree = nj(distdict)

    # write output
    with open(output_file, 'w') as tree_f:
        tree_f.write(tree.getNewick(with_distances=True))
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    mapping_file: path of the mapping file
    dmatrix_file: path of the distance matrix file
    fields: list of mapping-file column names to group by
    dir_prefix, subdir_prefix: passed to write_distance_files; the
     subdirectory prefix gets '_single' or '_pairs' appended

    Writes the single-field and paired-field groupings with
    write_distance_files and returns
    (single_field, paired_field, distance_matrix).

    Raises ValueError if fields is an empty list.
    """
    with open(mapping_file, 'U') as map_f:
        mapping, header, comments = parse_mapping_file(map_f)
    # Put the header row back in front of the data rows before handing the
    # table to group_by_field / group_by_fields.
    header = [header]
    header.extend(mapping)
    mapping = header

    with open(dmatrix_file, 'U') as dm_f:
        distance_header, distance_matrix = parse_distmat(dm_f)

    if fields == []:
        raise ValueError(
            'Since no fields were defined and the values within your fields '
            'are either all the same or all unique, a field was not chosen '
            'for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        field_name = field.replace('#', '')
        single_field[field_name] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    # Every unordered pair of fields, including a field paired with itself.
    # The written dict is keyed '<fieldi>_to_<fieldj>' exactly like the
    # returned one; keying it on the leftover single-field loop variable
    # made different pairs overwrite each other under the wrong name.
    paired_field = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            paired_field[fieldi + '_to_' + fieldj] = data

    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
Beispiel #37
0
def subset_groups(dm_f, map_f, category, max_group_size):
    """Randomly cap the number of samples kept per category value.

    dm_f: distance matrix input, handed to parse_distmat
    map_f: mapping file input, handed to MetadataMap.parseMetadataMap
    category: mapping-file category used to group the samples
    max_group_size: maximum number of samples kept for each category value

    Returns whatever filter_samples_from_distance_matrix produces when told
    (negate=True) to keep only the randomly retained sample IDs.
    """
    dm_labels, dm_data = parse_distmat(dm_f)
    metadata_map = MetadataMap.parseMetadataMap(map_f)

    ids_by_value = defaultdict(list)
    for sid in metadata_map.SampleIds:
        # The mapping file may list samples missing from the distance
        # matrix (samples get dropped when OTU tables are rarefied).
        if sid not in dm_labels:
            continue
        value = metadata_map.getCategoryValue(sid, category)
        ids_by_value[value].append(sid)

    retained = []
    for members in ids_by_value.values():
        quota = min(max_group_size, len(members))
        retained.extend(sample(members, quota))

    return filter_samples_from_distance_matrix((dm_labels, dm_data),
                                               retained, negate=True)
Beispiel #38
0
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    mapping_file: path of the mapping file
    dmatrix_file: path of the distance matrix file
    fields: list of mapping-file column names; None selects the first
     column of the mapping header
    dir_prefix, subdir_prefix: passed to write_distance_files; the
     subdirectory prefix gets '_single' or '_pairs' appended

    Returns (single_field, paired_field, distance_matrix).
    """
    rows, column_names, comments = parse_mapping_file(
        open(mapping_file, 'U'))
    # Header row first, then the data rows.
    mapping = [column_names]
    mapping.extend(rows)

    distance_header, distance_matrix = parse_distmat(open(dmatrix_file, 'U'))

    if fields is None:
        fields = [mapping[0][0]]

    single_field = defaultdict(dict)
    for field in fields:
        by_value = group_by_field(mapping, field)
        # Pound signs are stripped from the field name used as key.
        single_field[field.replace('#', '')] = distances_by_groups(
            distance_header, distance_matrix, by_value)

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    # Each field against itself and against every later field.
    paired_field = defaultdict(dict)
    for start, fieldi in enumerate(fields):
        for fieldj in fields[start:]:
            by_values = group_by_fields(mapping, [fieldi, fieldj])
            paired_field[fieldi + '_to_' + fieldj] = distances_by_groups(
                distance_header, distance_matrix, by_values)

    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
Beispiel #39
0
def filter_samples_from_distance_matrix(dm, samples_to_discard, negate=False):
    """ Drop (or, with negate, keep only) the given samples of a matrix

        dm: (sample_ids, dm_data) tuple, as returned from
         qiime.parse.parse_distmat; or a file handle that can be passed
         to qiime.parse.parse_distmat
        samples_to_discard: strings whose first whitespace-delimited token
         is the sample id
        negate: if True, the listed samples are the ones that are kept

        Returns the result of format_distance_matrix applied to the
         surviving sample ids and the filtered matrix.
    """
    try:
        sample_ids, dm_data = dm
    except ValueError:
        # not a 2-tuple: let parse_distmat read it
        sample_ids, dm_data = parse_distmat(dm)

    listed = dict.fromkeys(e.split()[0] for e in samples_to_discard)

    def keep_sample(s):
        if negate:
            return s in listed
        return s not in listed

    # Filter rows, remembering which ids survive.
    kept_rows = []
    new_sample_ids = []
    for row, sample_id in zip(dm_data, sample_ids):
        if not keep_sample(sample_id):
            continue
        kept_rows.append(row)
        new_sample_ids.append(sample_id)

    # Transpose so the columns can be filtered the same way, then flip back.
    by_column = array(kept_rows).transpose()
    kept_columns = [col for col, sample_id in zip(by_column, sample_ids)
                    if keep_sample(sample_id)]
    new_dm_data = array(kept_columns).transpose()

    return format_distance_matrix(new_sample_ids, new_dm_data)
def cogent_dist_to_qiime_dist(dist_tuple_dict):
    """
    Convert a {(k1, k2): distance} dict, such as is output by the
    getDistances() method of a PhyloNode object, to a QIIME-style distance
    matrix: whatever parse_distmat returns (per the original author, a list
    of samples in [0] and a numpy array of the distance matrix in [1]).

    EDITED AND UPDATED 2013-07-09 Aaron Behr
    """
    # Nested dict: nested[k1][k2] = distance. Each k1 row is created with a
    # 0.0 self-distance the first time k1 appears.
    nested = {}
    for pair, distance in dist_tuple_dict.iteritems():
        first, second = pair[0], pair[1]
        if first not in nested:
            nested[first] = {first: 0.0}
        nested[first][second] = distance

    # Sorted first-keys serve as both the row and the column order.
    headers = sorted(nested)
    dict2d = Dict2D(nested, headers, headers)

    # reflect so that the matrix is no longer sparse
    dict2d.reflect(largest)

    # Delimited text of the table; the first character is dropped before
    # parsing (presumably a leading delimiter -- confirm against Dict2D).
    delimited = dict2d.toDelimited()
    return parse_distmat(StringIO(delimited[1:]))
def single_file_upgma(input_file, output_file):
    """Cluster one distance matrix file with UPGMA_cluster.

    input_file: path of the distance matrix, read with parse_distmat
    output_file: path that receives the resulting tree's Newick string
     (with distances)

    Raises RuntimeError when UPGMA_cluster returned None (no tree could be
    built); any other AttributeError from writing the tree propagates.
    """
    # read in dist matrix; `with` closes the handle even if parsing fails
    with open(input_file, 'U') as f:
        headers, data = parse_distmat(f)

    # do upgma
    # A real list (not a lazy map object) so the nodes can be indexed and
    # reused regardless of interpreter version.
    nodes = [PhyloNode(h) for h in headers]
    BIG = 1e305
    # Work on a copy with the diagonal set to BIG, the same value passed to
    # UPGMA_cluster -- presumably so self-distances are never picked as the
    # closest pair; confirm against UPGMA_cluster.
    U = data.copy()
    for i in range(len(U)):
        U[i, i] = BIG
    c = UPGMA_cluster(U, nodes, BIG)

    # write output; the `with` block closes the file on the error path too
    with open(output_file, 'w') as f:
        try:
            f.write(c.getNewick(with_distances=True))
        except AttributeError:
            # Identity test: a tree object may define its own `==`.
            if c is None:
                raise RuntimeError(
                    "input file %s did not make a UPGMA tree.\n"
                    " Ensure it has more than one sample present"
                    % (str(input_file), ))
            raise
Beispiel #42
0
def generate_data_make_html(dm_lines):
    """Build the plot-description dictionary for a distance matrix

    Inputs:
        dm_lines: distance matrix open file object

    Return dict of:
        {
            LD_NAME: plot name ("Distance matrix"),
            LD_HEADERS: {LD_HEADERS_VER: header, LD_HEADERS_HOR: header},
            LD_MATRIX: result of get_upper_triangle on the parsed matrix
            LD_TRANSFORM_VALUES: result of generate_trans_values_dict on
                that matrix; per the original author a dictionary
                {(val1, val2): (plot_value, label)} with a (None, None)
                key, used to turn the continuous matrix values into
                discrete values to plot
            LD_TABLE_TITLE: table title ("Distance matrix")
        }
        Contains all the needed information to generate the html file.
    """
    header, full_matrix = parse_distmat(dm_lines)
    # Distance matrices are symmetric, so only the upper triangle is used.
    upper = get_upper_triangle(full_matrix)

    return {
        LD_NAME: "Distance matrix",
        # Rows and columns carry the same labels.
        LD_HEADERS: {LD_HEADERS_HOR: header, LD_HEADERS_VER: header},
        LD_MATRIX: upper,
        LD_TRANSFORM_VALUES: generate_trans_values_dict(upper),
        LD_TABLE_TITLE: "Distance matrix",
    }
def main():
    """Write summary matrices for a directory of distance matrices.

    Every non-hidden file in the input directory is parsed with
    parse_distmat; the collected headers and matrices go through
    matrix_stats, and the three returned matrices are written to
    means.txt, medians.txt and stdevs.txt in the output directory (created
    if missing).
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    indir = opts.input_dir
    outdir = opts.output_dir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # input: skip dot-files
    visible = [name for name in os.listdir(indir)
               if not name.startswith('.')]

    distmats = []
    headers_list = []
    for name in visible:
        handle = open(os.path.join(indir, name), 'U')
        labels, matrix = parse_distmat(handle)
        handle.close()
        distmats.append(matrix)
        headers_list.append(labels)

    # calcs
    headers, means, medians, stdevs = matrix_stats(headers_list, distmats)

    # output: one file per statistic
    outputs = (('means.txt', means),
               ('medians.txt', medians),
               ('stdevs.txt', stdevs))
    for out_name, stat in outputs:
        out = open(os.path.join(outdir, out_name), 'w')
        out.write(format_distance_matrix(headers, stat))
        out.close()
Beispiel #44
0
def main():
    """Summarise a directory of distance matrices.

    Parses each non-hidden file of the input directory with parse_distmat,
    feeds all headers and matrices to matrix_stats, and writes the returned
    matrices to means.txt, medians.txt and stdevs.txt inside the output
    directory, which is created when it does not exist.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    indir = opts.input_dir
    outdir = opts.output_dir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # input
    distmats = []
    headers_list = []
    for entry in os.listdir(indir):
        if entry.startswith('.'):
            # hidden file
            continue
        source = open(os.path.join(indir, entry), 'U')
        labels, matrix = parse_distmat(source)
        source.close()
        distmats.append(matrix)
        headers_list.append(labels)

    # calcs
    headers, means, medians, stdevs = matrix_stats(headers_list, distmats)

    # output
    def write_matrix(file_name, matrix):
        target = open(os.path.join(outdir, file_name), 'w')
        target.write(format_distance_matrix(headers, matrix))
        target.close()

    write_matrix('means.txt', means)
    write_matrix('medians.txt', medians)
    write_matrix('stdevs.txt', stdevs)
def single_file_upgma(input_file, output_file):
    """Read a distance matrix file, run UPGMA_cluster and save the tree.

    input_file: path of the distance matrix, read with parse_distmat
    output_file: path the Newick string (with distances) is written to

    Raises RuntimeError when UPGMA_cluster returned None (no tree could be
    built); any other AttributeError from writing the tree propagates.
    """
    # read in dist matrix (handle is closed even when parse_distmat raises)
    with open(input_file, 'U') as dm_f:
        headers, data = parse_distmat(dm_f)

    # do upgma
    # Build the node list eagerly so it is a concrete list on any
    # interpreter version.
    nodes = [PhyloNode(label) for label in headers]
    BIG = 1e305
    # Copy of the matrix with BIG on the diagonal; BIG is also passed to
    # UPGMA_cluster (presumably so self-distances are never chosen as the
    # closest pair -- confirm against UPGMA_cluster).
    U = data.copy()
    for i in range(len(U)):
        U[i, i] = BIG
    c = UPGMA_cluster(U, nodes, BIG)

    # write output; leaving the `with` block closes the file even when the
    # write fails
    with open(output_file, 'w') as tree_f:
        try:
            tree_f.write(c.getNewick(with_distances=True))
        except AttributeError:
            # `is None` rather than `== None`: identity cannot be affected
            # by a custom equality on the tree object.
            if c is None:
                raise RuntimeError(
                    "input file %s did not make a UPGMA tree.\n"
                    " Ensure it has more than one sample present"
                    % (str(input_file),))
            raise
Beispiel #46
0
def main():
    """Fit and plot a semivariogram from two distance matrices.

    Reads the x and y distance matrices named by the options, optionally
    restricts both to their shared samples (--ignore_missing_samples), fits
    the requested model with fit_semivariogram and saves the figure to
    opts.output_path. When a metadata category and mapping file are given,
    one fit is plotted per category value and a legend is produced with
    make_legend.

    The binning option has the form
    [increment1,top_limit1][increment2,top_limit2]...; every range except
    the last is validated.

    Raises ValueError for malformed binning input, or when the matrices
    differ in shape and missing samples are not being ignored.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    category = opts.category
    mapping_fp = opts.mapping_fp

    colors_used = []

    if (category and mapping_fp is None) or (category is None and mapping_fp):
        option_parser.error('If coloring by a metadata category, both the '
                            'category and the mapping file must be supplied.')
    elif mapping_fp and category:
        with open(mapping_fp, 'U') as map_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(map_f)
        if category not in mapping_headers:
            option_parser.error("The category supplied must exist in the "
                                "metadata mapping file, '%s' does not exist." %
                                category)
        index = mapping_headers.index(category)
        categories = list(set([line[index] for line in mapping_data]))

    if opts.binning is None:
        ranges = []
    else:
        binning = opts.binning
        # simple ranges format validation: as many '[' as ']' and as ','
        n_open = binning.count('[')
        if n_open != binning.count(']') or n_open != binning.count(','):
            # The option value is interpolated into the message; applying
            # '%' to a unary-plus string made this line raise TypeError
            # instead of the intended ValueError.
            raise ValueError(
                "The binning input has an error: '%s'; " % binning +
                "\nthe format should be "
                "[increment1,top_limit1][increment2,top_limit2]")
        # splitting in ranges
        rgn_txt = binning.split('][')
        # removing left [ and right ]
        rgn_txt[0] = rgn_txt[0][1:]
        rgn_txt[-1] = rgn_txt[-1][:-1]
        # converting into floats
        ranges = []
        previous_top = 0

        for i, r in enumerate(rgn_txt):
            try:
                values = [float(v) for v in r.split(',')]
            except ValueError:
                raise ValueError(
                    "Not a valid format for binning %s" % opts.binning)
            if len(values) != 2:
                raise ValueError(
                    "All ranges must have only 2 values: [%s]" % r)
            elif i + 1 != len(rgn_txt):
                # These checks apply to every range but the last one.
                if values[0] > values[1]:
                    raise ValueError(
                        "The bin value can't be greater than the max "
                        "value: [%s]" % r)
                elif values[0] < 0:
                    # values[0] <= values[1] holds here, so testing the
                    # smaller one covers both numbers. Comparing the whole
                    # list against 0 never triggered this check.
                    raise ValueError(
                        "This value can not be negative: [%s]" % r)
                elif previous_top > values[1]:
                    raise ValueError(
                        "This value can not smaller than the previous "
                        "one: [%s]" % r)
                else:
                    previous_top = values[1]

            ranges.append(values)

    with open(opts.input_path_x, 'U') as x_f:
        x_samples, x_distmtx = parse_distmat(x_f)
    with open(opts.input_path_y, 'U') as y_f:
        y_samples, y_distmtx = parse_distmat(y_f)

    if opts.ignore_missing_samples:
        ignoring_from_x = list(set(x_samples) - set(y_samples))
        ignoring_from_y = list(set(y_samples) - set(x_samples))

        if opts.verbose:
            print('\nFrom %s we are ignoring: %s\n' % (opts.input_path_x,
                                                       ignoring_from_x))
            print('\nFrom %s we are ignoring: %s\n' % (opts.input_path_y,
                                                       ignoring_from_y))
            print('\nOnly using: %s\n' % (
                list(set(x_samples) & set(y_samples))))

        # The filter result is wrapped in StringIO and re-parsed, so it is
        # expected to be the formatted matrix text here.
        x_file = StringIO(
            filter_samples_from_distance_matrix((x_samples, x_distmtx),
                                                ignoring_from_x))
        x_samples, x_distmtx = parse_distmat(x_file)

        y_file = StringIO(
            filter_samples_from_distance_matrix((y_samples, y_distmtx),
                                                ignoring_from_y))
        y_samples, y_distmtx = parse_distmat(y_file)
    else:
        if x_distmtx.shape != y_distmtx.shape:
            raise ValueError(
                'The distance matrices have different sizes. '
                'You can cancel this error by passing '
                '--ignore_missing_samples')

    figure()
    if category is None:
        x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
            (x_samples, x_distmtx), (y_samples, y_distmtx), opts.model, ranges)

        plot(x_val,
             y_val,
             color=opts.dot_color,
             marker=opts.dot_marker,
             linestyle="None",
             alpha=opts.dot_alpha)
        plot(x_fit,
             y_fit,
             linewidth=2.0,
             color=opts.line_color,
             alpha=opts.line_alpha)
    else:
        for index, single_category in enumerate(categories):
            with open(mapping_fp) as map_f:
                good_sample_ids = sample_ids_from_metadata_description(
                    map_f, '%s:%s' % (category, single_category))

            # negate=True keeps only the samples of this category value.
            _y_samples, _y_distmtx = parse_distmat(
                StringIO(
                    filter_samples_from_distance_matrix((y_samples, y_distmtx),
                                                        good_sample_ids,
                                                        negate=True)))
            _x_samples, _x_distmtx = parse_distmat(
                StringIO(
                    filter_samples_from_distance_matrix((x_samples, x_distmtx),
                                                        good_sample_ids,
                                                        negate=True)))

            x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
                (_x_samples, _x_distmtx), (_y_samples, _y_distmtx), opts.model,
                ranges)

            # retrieve one of the "QIIME" colors and add it to the list of
            # used colors for the creation of the legends in the plot
            color_only = get_qiime_hex_string_color(index)
            colors_used.append(color_only)

            plot(x_val,
                 y_val,
                 color=color_only,
                 marker=opts.dot_marker,
                 linestyle="None",
                 alpha=opts.dot_alpha)
            plot(x_fit,
                 y_fit,
                 linewidth=2.0,
                 color=color_only,
                 alpha=opts.line_alpha,
                 label=single_category)

    # Axis limits are only applied when both ends were supplied.
    if opts.x_min is not None and opts.x_max is not None:
        xlim([opts.x_min, opts.x_max])
    if opts.y_min is not None and opts.y_max is not None:
        ylim([opts.y_min, opts.y_max])

    x_label = opts.x_label
    y_label = opts.y_label
    fig_title = '%s (%s)' % (opts.fig_title, opts.model)

    xlabel(x_label)
    ylabel(y_label)
    if opts.print_model:
        # func_text comes from the last fit performed above.
        title(fig_title + ' ' + func_text)
    else:
        title(fig_title)

    savefig(opts.output_path)

    # print the legends after the figure is exported to avoid conflicts
    if category:
        # if there's a desired format, use that, else default it to png
        _, extension = splitext(opts.output_path)

        # remove the dot, else, make_legend will add it to the filename
        extension = extension.replace('.', '')

        if extension == '':
            extension = 'png'
        make_legend(categories, colors_used, 0, 0, 'black', 'white',
                    opts.output_path, extension, 80)
def main():
    """Plot the minimum pairwise distance against the number of samples.

    For every distance matrix in opts.input_path and every iteration, a
    random subset of n+1 samples is drawn for each n from 1 up to the number
    of samples minus one, and the smallest distance between two members of
    the subset is recorded. The per-n mean and standard deviation over the
    iterations are plotted as error bars, one series per input file,
    labelled from the comma-separated opts.labels. In verbose mode the
    means and standard deviations are also appended to
    <output_path>.txt.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_path = opts.input_path
    output_path = opts.output_path
    iterations = opts.iterations
    verbose = opts.verbose
    y_max = opts.y_max
    labels = opts.labels.split(',')

    results = {}
    for input_file in input_path:
        if verbose:
            print(input_file)

        # Reading the distance matrix
        with open(input_file, 'U') as dm_f:
            samples, distmat = parse_distmat(dm_f)
        possible_samples = range(len(distmat[0]))

        result_iteration = []
        for iteration in range(iterations):
            iter_vals = []
            for n in possible_samples:
                if n < 1:
                    # a single sample has no pairwise distance
                    continue
                curr_samples = sample(possible_samples, n+1)

                # smallest distance over all unordered pairs of the subset
                iter_vals.append(min(
                    distmat[i][j]
                    for pos, i in enumerate(curr_samples)
                    for j in curr_samples[pos+1:]))
            result_iteration.append(iter_vals)

        results[input_file] = [mean(result_iteration, axis=0),
                               std(result_iteration, axis=0)]

        if verbose:
            with open(output_path + '.txt', 'a') as f:
                f.write('\t'.join(map(str, results[input_file][0])))
                f.write('\n')
                f.write('\t'.join(map(str, results[input_file][1])))
                f.write('\n')

    # generating plot, some parts taken from
    # http://stackoverflow.com/questions/4700614
    figure()
    ax = subplot(111)

    max_x, max_y = -inf, -inf
    for i, (label, input_file) in enumerate(zip(labels, input_path)):
        means, stdevs = results[input_file][0], results[input_file][1]
        len_x = len(means)
        highest_mean = max(means)
        if max_x < len_x:
            max_x = len_x
        if max_y < highest_mean:
            max_y = highest_mean
        # alternate between two colors
        if i % 2 == 0:
            coloring = (215/255.0, 48/255.0, 39/255.0)
        else:
            coloring = (69/255.0, 177/255.0, 180/255.0)

        ax.errorbar(range(1, len_x+1), means, yerr=stdevs, fmt='o',
                    color=coloring, label=label)

    # A user-supplied y_max wins; otherwise fall back to the largest mean.
    # (These two branches were swapped, which ignored the option when given
    # and passed it on when it was empty.)
    if y_max:
        axis([0, max_x, 0, y_max])
    else:
        axis([0, max_x, 0, max_y])

    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    title(opts.title)
    xlabel('Samples')
    ylabel(opts.ylabel)
    grid(True)
    savefig(output_path)
def main():
    """Plot a minimum-distance curve for each input distance matrix.

    For every matrix in opts.input_path and every iteration, a random
    subset of n+1 samples is drawn for each n >= 1 and the minimum of the
    unmasked matrix entries is stored (position 0 stays 0). The per-n mean
    and standard deviation over the iterations are plotted as error bars,
    one series per input file, labelled from the comma-separated
    opts.labels. In verbose mode the two vectors are also appended to
    <output_path>.txt.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_path = opts.input_path
    output_path = opts.output_path
    iterations = opts.iterations
    verbose = opts.verbose
    y_max = opts.y_max
    labels = opts.labels.split(',')

    results = {}
    for input_file in input_path:
        if verbose:
            print(input_file)

        # Read the distance matrix.
        samples, distmat = parse_distmat(open(input_file, 'U'))
        possible_samples = range(len(distmat[0]))
        mask = np.ones(distmat.shape)

        n_possible_samples = len(possible_samples)
        result_iteration = np.zeros((iterations, n_possible_samples))

        for iter_idx in range(iterations):
            # Row view: assignments land directly in result_iteration.
            row = result_iteration[iter_idx]
            for pos, n in enumerate(possible_samples):
                if n >= 1:
                    curr_samples = sample(possible_samples, n+1)

                    # In a masked array a mask value of 0 means "keep".
                    # Everything is masked first; then every entry in a
                    # selected row or a selected column is unmasked and the
                    # diagonal is masked again.
                    # NOTE(review): this exposes the union of the selected
                    # rows and columns, not only pairs where both samples
                    # were selected -- confirm that is intended.
                    mask.fill(1)
                    mask[curr_samples] = 0
                    mask[:, curr_samples] = 0
                    np.fill_diagonal(mask, 1)
                    row[pos] = np.ma.array(distmat, mask=mask).min()

        results[input_file] = [mean(result_iteration, axis=0),
                               std(result_iteration, axis=0)]

        if verbose:
            report = open(output_path + '.txt', 'a')
            for vector in results[input_file]:
                report.write('\t'.join(map(str, vector)))
                report.write('\n')
            report.close()

    # generating plot, some parts taken from
    # http://stackoverflow.com/questions/4700614
    figure()
    ax = subplot(111)

    max_x, max_y = -inf, -inf
    for i, (label, input_file) in enumerate(zip(labels, input_path)):
        series_means, series_stdevs = results[input_file]
        len_x = len(series_means)
        top_mean = max(series_means)
        if max_x < len_x:
            max_x = len_x
        if max_y < top_mean:
            max_y = top_mean
        # two alternating colors
        if i % 2:
            coloring = (69/255.0, 177/255.0, 180/255.0)
        else:
            coloring = (215/255.0, 48/255.0, 39/255.0)

        ax.errorbar(range(1, len_x+1), series_means, yerr=series_stdevs,
                    fmt='o', color=coloring, label=label)

    # A supplied y_max overrides the largest observed mean.
    upper_y = y_max if y_max else max_y
    axis([0, max_x, 0, upper_y])

    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    title(opts.title)
    xlabel('Samples')
    ylabel(opts.ylabel)
    grid(True)
    savefig(output_path)
Beispiel #49
0
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs, \
    dir_prefix = '', subdir_prefix='monte_carlo_group_distances',\
    default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    One tab-delimited file named 'group_distances_<field>.txt' is written
    per field into the directory <dir_prefix>/<subdir_prefix>; that directory
    is created (single level, via mkdir) when it does not exist.

    mapping_file: path to the metadata mapping file.
    dmatrix_file: path to the distance matrix file.
    prefs: prefs dict or None. When it is None or lacks the
        'MONTE_CARLO_GROUP_DISTANCES' key, it is replaced by the result of
        build_monte_carlo_prefs(fields, default_iters).
    dir_prefix, subdir_prefix: joined to form the output directory.
    default_iters: iteration count handed to build_monte_carlo_prefs.
    fields: list of mapping fields; defaults to the first header column.
        A field containing '&&' is split on it and grouped on all parts.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.
    """
    # Context managers make sure the input handles are released as soon as
    # the parsers are done with them.
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
    # The grouping helpers receive the header as the first row of the table.
    mapping = [header] + list(mapping)

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    path_prefix = path.join(dir_prefix, subdir_prefix)

    #if dir doesn't exist
    if not path.isdir(path_prefix):
        # make directory
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)

        out_fp = path.join(path_prefix, 'group_distances_' + field + '.txt')
        # The 'with' block closes (and flushes) the per-field output file
        # even when one of the statistics helpers raises.
        with open(out_fp, 'w') as outfile:
            outfile.write('\t'.join(['Category_1a','Category_1b','Avg',\
                'Category_2a','Category_2b','Avg','t','p',\
                'p_greater','p_less','Iterations\n']))
            real_dists = distances_by_groups(distance_header,
                                             distance_matrix, groups)

            #iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                enumerate(real_dists[:-1]):

                real_dist_1 = average(distances_g1)

                #then for each other pair (not including same group)
                for j in range(i + 1, len(real_dists)):
                    first_g2, second_g2, distances_g2 = real_dists[j]

                    real_dist_2 = average(distances_g2)

                    # permute distances just within these groups!
                    rand_dists_1, rand_dists_2 = \
                            permute_between_groups(distances_g1,
                                                   distances_g2,
                                                   num_iters)

                    # t statistic of every permuted pair of groups
                    ttests = array(
                        [t_two_sample(rand_dists_1[n].flatten(),
                                      rand_dists_2[n].flatten())[0]
                         for n in range(num_iters)])
                    real_ttest = t_two_sample(distances_g1.flatten(),
                                              distances_g2.flatten())

                    # Fraction of permutations whose t lies strictly above
                    # / strictly below the observed t.
                    p_greater = (ttests > real_ttest[0]).sum() / \
                        float(num_iters)
                    p_less = (ttests < real_ttest[0]).sum() / \
                        float(num_iters)

                    curr_line = [first_g1, second_g1, real_dist_1,
                                 first_g2, second_g2, real_dist_2,
                                 real_ttest[0], real_ttest[1],
                                 p_greater, p_less, num_iters]
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')
def main():
    """Validate the command line options and write distance histograms.

    Steps, in order:
    - parse and validate the distance matrix, mapping file, background
      color and (optional) prefs file, reporting problems through
      option_parser.error
    - work out which mapping fields to use (the -f option, else the prefs
      'FIELDS' entry, else get_interesting_mapping_fields)
    - compute grouped distances with group_distances
    - unless HTML output is suppressed, draw the histograms, write the main
      HTML page and copy histograms.js into <dir_path>/js
    - when monte_carlo_iters > 0, run both Monte Carlo analyses
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    #Some code for error checking of input args:

    #Check if distance_matrix_file is valid:
    # 'except Exception' keeps the catch-all behaviour for parse/IO problems
    # but no longer swallows KeyboardInterrupt or SystemExit.
    try:
        with open(opts.distance_matrix_file, 'U') as dmatrix_f:
            d_header, d_mat = parse_distmat(dmatrix_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix file.  Please supply a valid distance matrix file using the -d option."
        )

    if not is_symmetric_and_hollow(d_mat):
        option_parser.error("The distance matrix must be symmetric and "
                            "hollow.")

    #Check if map_fname is valid:
    try:
        with open(opts.map_fname, 'U') as mapping_f:
            mapping, m_header, m_comments = parse_mapping_file(mapping_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping file.  Please supply a valid mapping file using the -m option."
        )

    #make sure background_color is valid
    if opts.background_color not in ['black', 'white']:
        option_parser.error(
            "'%s' is not a valid background color.  Please pass in either 'black' or 'white' using the -k option."
            % (opts.background_color))

    #make sure prefs file is valid if it exists
    prefs = None
    if opts.prefs_path is not None:
        try:
            with open(opts.prefs_path, 'U') as prefs_f:
                prefs_file = prefs_f.read()
        except IOError:
            option_parser.error(
                "Provided prefs file, '%s', does not exist.  Please pass in a valid prefs file with the -p option."
                % (opts.prefs_path))
        prefs = parse_prefs_file(prefs_file)

    color_prefs, color_data, background_color, label_color, ball_scale,\
     arrow_colors=sample_color_prefs_and_map_data_from_options(opts)

    #list of labelname, groups, colors, data_colors, data_color_order
    groups_and_colors=list(iter_color_groups(mapping=color_data['map'],\
        prefs=color_prefs))

    #dict mapping labelname to list of: [groups, colors, data_colors,
    # data_color_order]
    field_to_colors = dict((color_info[0], color_info[1:])
                           for color_info in groups_and_colors)

    qiime_dir = get_qiime_project_dir() + '/qiime/support_files/'

    fields = opts.fields
    if fields is not None:
        # Split the comma-separated list, then drop surrounding whitespace
        # and any quoting the shell left around each field name.
        fields = [field.strip().strip('"').strip("'")
                  for field in fields.split(',')]
    elif prefs is not None:
        fields = prefs.get('FIELDS', None)
    else:
        fields = get_interesting_mapping_fields(mapping, m_header)

    #Check that all provided fields are valid:
    if fields is not None:
        for field in fields:
            if field not in m_header:
                option_parser.error(
                    "The field, %s, is not in the provided mapping file.  Please supply correct fields (using the -f option or providing a 'FIELDS' list in the prefs file) corresponding to fields in mapping file."
                    % (field))

    within_distances, between_distances, dmat = \
        group_distances(mapping_file=opts.map_fname,\
        dmatrix_file=opts.distance_matrix_file,\
        fields=fields,\
        dir_prefix=get_random_directory_name(output_dir=opts.dir_path,\
            prefix='distances'))

    if not opts.suppress_html_output:
        #histograms output path
        histograms_path = path.join(opts.dir_path, 'histograms')
        try:
            mkdir(histograms_path)
        except OSError:  #raised if dir exists
            pass

        #draw all histograms
        distances_dict, label_to_histogram_filename = \
            draw_all_histograms(single_field=within_distances, \
                paired_field=between_distances, \
                dmat=dmat,\
                histogram_dir=histograms_path,\
                field_to_color_prefs=field_to_colors,\
                background_color=background_color)

        #Get relative path to histogram files.
        label_to_histogram_filename_relative = \
            _make_relative_paths(label_to_histogram_filename, opts.dir_path)

        dm_fname = path.split(opts.distance_matrix_file)[-1]
        basename = path.splitext(dm_fname)[0]
        outfile_name = basename + '_distance_histograms.html'
        make_main_html(distances_dict=distances_dict,\
            label_to_histogram_filename=label_to_histogram_filename_relative,\
            root_outdir=opts.dir_path, \
            outfile_name = outfile_name, \
            title='Distance Histograms')

        #Handle saving web resources locally.
        #javascript file
        javascript_path = path.join(opts.dir_path, 'js')
        try:
            mkdir(javascript_path)
        except OSError:  #raised if dir exists
            pass
        # Read the bundled script first so that a missing source file does
        # not leave an empty histograms.js behind; both handles are closed
        # by their 'with' blocks.
        with open(qiime_dir + 'js/histograms.js') as js_in:
            js_source = js_in.read()
        with open(javascript_path + '/histograms.js', 'w') as js_out:
            js_out.write(js_source)

    monte_carlo_iters = opts.monte_carlo_iters
    if monte_carlo_iters > 0:
        #Do Monte Carlo for all fields
        monte_carlo_group_distances(mapping_file=opts.map_fname,\
            dmatrix_file=opts.distance_matrix_file,\
            prefs=prefs, \
            dir_prefix = opts.dir_path,\
            fields=fields,\
            default_iters=monte_carlo_iters)

        #Do Monte Carlo for within and between fields
        monte_carlo_group_distances_within_between(\
            single_field=within_distances,\
            paired_field=between_distances, dmat=dmat, \
            dir_prefix = opts.dir_path,\
            num_iters=monte_carlo_iters)
def main():
    """Plot distances between each field state and the comparison groups.

    Steps, in order:
    - create the output directory and parse the distance matrix and the
      mapping file, reporting problems through option_parser.error
    - interpret --y_min / --y_max (a number, or 'auto' which becomes None)
    - collect the distance groupings with get_field_state_comparisons and
      order the field states (numerically when both compared values parse
      as numbers, lexically otherwise)
    - draw the figure with generate_comparative_plots and save it as
      '<field>_Distance_Comparisons.<imagetype>'
    - unless suppressed, write all_pairs_t_test results to
      '<field>_Stats.txt'
    - when requested, write the raw distances to
      '<field>_Distance_Comparisons.txt'
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    # 'except Exception' keeps the catch-all behaviour without swallowing
    # KeyboardInterrupt or SystemExit.
    try:
        create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        with open(opts.distance_matrix_fp, 'U') as dist_matrix_f:
            dist_matrix_header, dist_matrix = parse_distmat(dist_matrix_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")
    try:
        with open(opts.mapping_fp, 'U') as mapping_f:
            mapping, mapping_header, mapping_comments = parse_mapping_file(
                mapping_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Parse the field states that will be compared to every other field state.
    # The None check has to come before the split: splitting a missing option
    # would raise AttributeError and the friendly message was never shown.
    comparison_field_states = opts.comparison_groups
    if comparison_field_states is None:
        option_parser.error("You must provide at least one field state to "
                            "compare (using the -c option).")
    comparison_field_states = [
        field_state.strip().strip('"').strip("'")
        for field_state in comparison_field_states.split(',')
    ]

    # Get distance comparisons between each field state and each of the
    # comparison field states.
    field = opts.field
    comparison_groupings = get_field_state_comparisons(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        comparison_field_states)

    # Grab a list of all field states that had the comparison field states
    # compared against them. These will be plotted along the x-axis.
    field_states = comparison_groupings.keys()

    def custom_comparator(x, y):
        """Return -1, 0 or 1: numeric order if both parse, else lexical."""
        try:
            key_x = float(x)
            key_y = float(y)
        except (TypeError, ValueError):
            key_x, key_y = x, y
        else:
            # NaN never orders against anything; fall back to comparing the
            # original values in that case.
            if key_x != key_x or key_y != key_y:
                key_x, key_y = x, y
        # Compare explicitly instead of truncating the difference with
        # int(): int(0.5 - 0.2) is 0 and would make 0.5 and 0.2 "equal".
        if key_x < key_y:
            return -1
        elif key_x > key_y:
            return 1
        else:
            return 0

    # Sort the field states as numbers if the elements are numbers, else sort
    # them lexically.
    field_states.sort(custom_comparator)

    # If the label type is numeric, get a list of all field states in sorted
    # numeric order. These will be used to determine the spacing of the
    # field state 'points' along the x-axis.
    x_spacing = None
    if opts.label_type == "numeric":
        try:
            x_spacing = sorted(map(float, field_states))
        except (TypeError, ValueError):
            option_parser.error("The 'numeric' label type is invalid because "
                                "not all field states could be converted into "
                                "numbers. Please specify a different label "
                                "type.")

    # Accumulate the data for each field state 'point' along the x-axis: one
    # entry per field state, holding one distribution per comparison group.
    plot_data = [
        [comparison_groupings[field_state][comp_field_state]
         for comp_field_state in comparison_field_states]
        for field_state in field_states
    ]
    plot_x_axis_labels = list(field_states)

    # Plot the data and labels.
    plot_title = "Distance Comparisons"
    plot_x_label = field
    plot_y_label = "Distance"

    # If we are creating a bar chart or box plot, grab a list of good data
    # colors to use.
    plot_type = opts.plot_type
    plot_colors = None
    if plot_type == "bar" or plot_type == "box":
        plot_colors = [matplotlib_rgb_color(data_colors[color].toRGB()) \
                       for color in data_color_order]

    assert plot_data, "Error: there is no data to plot!"

    width = opts.width
    height = opts.height
    if width <= 0 or height <= 0:
        option_parser.error("The specified width and height of the image must "
                            "be greater than zero.")

    plot_figure = generate_comparative_plots(
        opts.plot_type,
        plot_data,
        x_values=x_spacing,
        data_point_labels=plot_x_axis_labels,
        distribution_labels=comparison_field_states,
        distribution_markers=plot_colors,
        x_label=plot_x_label,
        y_label=plot_y_label,
        title=plot_title,
        x_tick_labels_orientation=opts.x_tick_labels_orientation,
        y_min=y_min,
        y_max=y_max,
        whisker_length=opts.whisker_length,
        error_bar_type=opts.error_bar_type,
        distribution_width=opts.distribution_width,
        figure_width=width,
        figure_height=height)

    # Save the plot in the specified format.
    output_plot_fp = join(
        opts.output_dir,
        "%s_Distance_Comparisons.%s" % (field, opts.imagetype))
    plot_figure.savefig(output_plot_fp,
                        format=opts.imagetype,
                        transparent=opts.transparent)

    if not opts.suppress_significance_tests:
        # Rearrange the plot data into a format suitable for all_pairs_t_test.
        sig_tests_labels = []
        sig_tests_data = []
        for data_point, data_point_label in zip(plot_data, plot_x_axis_labels):
            for dist, comp_field in zip(data_point, comparison_field_states):
                sig_tests_labels.append('%s vs %s' %
                                        (data_point_label, comp_field))
                sig_tests_data.append(dist)

        sig_tests_results = all_pairs_t_test(
            sig_tests_labels,
            sig_tests_data,
            tail_type=opts.tail_type,
            num_permutations=opts.num_permutations)
        # The file is only created once the results exist, and the 'with'
        # block closes it even if the write fails.
        with open(join(opts.output_dir, "%s_Stats.txt" % field),
                  'w') as sig_tests_f:
            sig_tests_f.write(sig_tests_results)

    if opts.save_raw_data:
        # Write the raw plot data into a tab-delimited file, where each line
        # has the distances between a comparison group and another field state
        # 'point' along the x-axis.
        assert (len(plot_x_axis_labels) == len(plot_data)), "The number of " +\
                "labels do not match the number of points along the x-axis."
        raw_data_fp = join(opts.output_dir,
                           "%s_Distance_Comparisons.txt" % field)
        with open(raw_data_fp, 'w') as raw_data_f:
            raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n")
            for label, data in zip(plot_x_axis_labels, plot_data):
                assert (len(comparison_field_states) == len(data)), "The " +\
                        "number of specified comparison groups does not " +\
                        "match the number of groups found at the current " +\
                        "point along the x-axis."
                for comp_field_state, comp_grp_data in zip(
                        comparison_field_states, data):
                    raw_data_f.write(comp_field_state + "\t" + label + "\t" +
                                     "\t".join(map(str, comp_grp_data)) + "\n")
Beispiel #52
0
def main():
    """Draw box plots of grouped distances for every requested field.

    Steps, in order:
    - create the output directory and parse the distance matrix and the
      mapping file, reporting problems through option_parser.error
    - validate the -f fields against the mapping header
    - interpret --y_min / --y_max (a number, or 'auto' which becomes None)
    - for each field: gather the distributions that were not suppressed,
      optionally sort them by median, save the box plot as
      '<field>_Distances.<imagetype>', and, depending on the options, write
      '<field>_Stats.xls' (all_pairs_t_test results) and
      '<field>_Distances.xls' (raw distances)
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    # 'except Exception' keeps the catch-all behaviour without swallowing
    # KeyboardInterrupt or SystemExit.
    try:
        create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        with open(opts.distance_matrix_fp, 'U') as dist_matrix_f:
            dist_matrix_header, dist_matrix = parse_distmat(dist_matrix_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")

    try:
        with open(opts.mapping_fp, 'U') as mapping_f:
            mapping, mapping_header, mapping_comments = parse_mapping_file(
                mapping_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # The None check has to come before the split: splitting a missing option
    # would raise AttributeError and the friendly message was never shown.
    fields = opts.fields
    if fields is None:
        option_parser.error("You must provide at least one field using the -f "
                            "option.")
    fields = [field.strip().strip('"').strip("'")
              for field in fields.split(',')]

    # Make sure each field is in the mapping file.
    for field in fields:
        if field not in mapping_header:
            option_parser.error(
                "The field '%s' is not in the provided "
                "mapping file. Please supply correct fields (using the -f "
                "option) corresponding to fields in the mapping file." % field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Generate the various boxplots, depending on what the user wanted
    # suppressed. Add them all to one encompassing plot.
    for field in fields:
        plot_data = []
        plot_labels = []

        if not opts.suppress_all_within:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header,
                                          dist_matrix,
                                          mapping_header,
                                          mapping,
                                          field,
                                          within=True))
            plot_labels.append("All within %s" % field)
        if not opts.suppress_all_between:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header,
                                          dist_matrix,
                                          mapping_header,
                                          mapping,
                                          field,
                                          within=False))
            plot_labels.append("All between %s" % field)
        if not opts.suppress_individual_within:
            within_dists = get_grouped_distances(dist_matrix_header,
                                                 dist_matrix,
                                                 mapping_header,
                                                 mapping,
                                                 field,
                                                 within=True)
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
        if not opts.suppress_individual_between:
            between_dists = get_grouped_distances(dist_matrix_header,
                                                  dist_matrix,
                                                  mapping_header,
                                                  mapping,
                                                  field,
                                                  within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        # We now have our data and labels ready, so plot them!
        assert (len(plot_data) == len(plot_labels)), "The number " +\
                "of boxplot labels does not match the number of " +\
                "boxplots."
        if plot_data:
            if opts.sort:
                # Sort our plot data in order of increasing median. The sort
                # is stable, so equal medians keep their original order.
                by_median = sorted(zip(plot_labels, plot_data),
                                   key=lambda pair: median(pair[1]))
                plot_labels = [label for label, _ in by_median]
                plot_data = [distribution for _, distribution in by_median]

            width = opts.width
            height = opts.height
            if width is None:
                # No explicit width: scale with the number of boxes.
                width = len(plot_data) * opts.box_width + 2
            if width <= 0 or height <= 0:
                option_parser.error("The specified width and height of the "
                                    "image must be greater than zero.")

            plot_figure = generate_box_plots(
                plot_data,
                x_tick_labels=plot_labels,
                title="%s Distances" % field,
                x_label="Grouping",
                y_label="Distance",
                x_tick_labels_orientation='vertical',
                y_min=y_min,
                y_max=y_max,
                whisker_length=opts.whisker_length,
                box_width=opts.box_width,
                box_color=opts.box_color,
                figure_width=width,
                figure_height=height)

            output_plot_fp = join(opts.output_dir,
                                  "%s_Distances.%s" % (field, opts.imagetype))
            plot_figure.savefig(output_plot_fp,
                                format=opts.imagetype,
                                transparent=opts.transparent)
        else:
            option_parser.error("You have chosen to suppress all plots. At "
                                "least one type of plot must be unsuppressed.")

        if not opts.suppress_significance_tests:
            sig_tests_results = all_pairs_t_test(
                plot_labels,
                plot_data,
                tail_type=opts.tail_type,
                num_permutations=opts.num_permutations)
            # The file is only created once the results exist, and the
            # 'with' block closes it even if the write fails.
            with open(join(opts.output_dir, "%s_Stats.xls" % field),
                      'w') as sig_tests_f:
                sig_tests_f.write(sig_tests_results)

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert (len(plot_labels) == len(plot_data))
            raw_data_fp = join(opts.output_dir, "%s_Distances.xls" % field)
            with open(raw_data_fp, 'w') as raw_data_f:
                for label, data in zip(plot_labels, plot_data):
                    raw_data_f.write(label.replace(" ", "_") + "\t")
                    raw_data_f.write("\t".join(map(str, data)))
                    raw_data_f.write("\n")
Beispiel #53
0
    def single_object_beta(self,
                           otu_table,
                           metric,
                           tree_string,
                           missing_sams=None):
        """Check single_object_beta row output against the full matrix.

        For every known non-phylogenetic and phylogenetic metric the full
        distance matrix is computed once; each sample not listed in
        missing_sams is then recomputed on its own and compared, value by
        value, with the matching entries of the full matrix. Metrics whose
        name does not contain 'full_tree' are additionally re-run with
        full_tree=True and compared against the same full matrix.

        NOTE: the `metric` argument is never consulted; the method always
        walks the complete list of known metrics.
        """
        skipped = [] if missing_sams is None else missing_sams

        all_metrics = list_known_nonphylogenetic_metrics()
        all_metrics.extend(list_known_phylogenetic_metrics())

        # new metrics that don't trivially parallelize must be dealt with
        # carefully, so their warnings are silenced up front
        ignored_messages = (
            'dissimilarity binary_dist_chisq is not parallelized, '
            'calculating the whole matrix...',
            'dissimilarity dist_chisq is not parallelized, '
            'calculating the whole matrix...',
            'dissimilarity dist_gower is not parallelized, '
            'calculating the whole matrix...',
            'dissimilarity dist_hellinger is not parallelized, '
            'calculating the whole matrix...',
            'unifrac had no information for sample M*',
        )
        for message in ignored_messages:
            warnings.filterwarnings('ignore', message)

        def check_against_full(raw_rows, row_ids, full_ids, full_dm):
            # Parse a row-wise result and compare every value with the
            # corresponding entry of the full matrix.
            col_ids, row_names, row_dm = parse_matrix(raw_rows)
            n_rows = len(row_ids.split(','))
            self.assertEqual(row_dm.shape, (n_rows, len(full_ids)))
            for j in range(n_rows):
                for k in range(len(full_ids)):
                    observed = row_dm[j, k]
                    expected = full_dm[full_ids.index(row_names[j]),
                                       full_ids.index(col_ids[k])]
                    npt.assert_almost_equal(observed, expected)

        for current_metric in all_metrics:
            # reference: the whole matrix, computed without the full tree
            reference_out = single_object_beta(otu_table,
                                               current_metric,
                                               tree_string,
                                               rowids=None,
                                               full_tree=False)
            sams, dmtx = parse_distmat(reference_out)

            # one sample at a time, still without the full tree
            for sample_id in sams:
                if sample_id in skipped:
                    continue
                by_row_out = single_object_beta(otu_table,
                                                current_metric,
                                                tree_string,
                                                rowids=sample_id,
                                                full_tree=False)
                check_against_full(by_row_out, sample_id, sams, dmtx)

            # metrics that already are full-tree variants stop here
            if 'full_tree' in str(current_metric).lower():
                continue

            # same per-sample comparison, with the call made on the full
            # tree (the call itself is issued with rowids=None)
            for sample_id in sams:
                if sample_id in skipped:
                    continue
                full_tree_out = single_object_beta(otu_table,
                                                   current_metric,
                                                   tree_string,
                                                   rowids=None,
                                                   full_tree=True)
                check_against_full(full_tree_out, sample_id, sams, dmtx)

            # finally the whole matrix on the full tree must match as well
            full_tree_out = single_object_beta(otu_table,
                                               current_metric,
                                               tree_string,
                                               rowids=None,
                                               full_tree=True)
            sams_ft, dmtx_ft = parse_distmat(full_tree_out)
            self.assertEqual(sams_ft, sams)
            npt.assert_almost_equal(dmtx_ft, dmtx)
Beispiel #54
0
    def single_file_beta(self,
                         otu_table_string,
                         tree_string,
                         missing_sams=None,
                         use_metric_list=False):
        """ running single_file_beta should give same result using --rows"""
        if missing_sams is None:
            missing_sams = []
        # setup
        fd, input_path = mkstemp(suffix='.txt')
        os.close(fd)
        in_fname = os.path.split(input_path)[1]
        f = open(input_path, 'w')
        f.write(otu_table_string)
        f.close()
        fd, tree_path = mkstemp(suffix='.tre')
        os.close(fd)
        f = open(tree_path, 'w')
        f.write(tree_string)
        f.close()
        metrics = list_known_nonphylogenetic_metrics()
        metrics.extend(list_known_phylogenetic_metrics())
        output_dir = mkdtemp()

        # new metrics that don't trivially parallelize must be dealt with
        # carefully
        warnings.filterwarnings(
            'ignore', 'dissimilarity binary_dist_chisq is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_chisq is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_gower is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_hellinger is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore', 'unifrac had no information for\
 sample M*')

        self.files_to_remove.extend([input_path, tree_path])
        self.folders_to_remove.append(output_dir)
        os.mkdir(output_dir + '/ft/')

        for metric in metrics:
            # do it
            if use_metric_list:
                single_file_beta(input_path, [metric],
                                 tree_path,
                                 output_dir,
                                 rowids=None)
            else:
                single_file_beta(input_path,
                                 metric,
                                 tree_path,
                                 output_dir,
                                 rowids=None)
            sams, dmtx = parse_distmat(
                open(output_dir + '/' + metric + '_' + in_fname))

            # do it by rows
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]
                row_outname = output_dir + '/' + metric + '_' +\
                    in_fname
                if use_metric_list:
                    single_file_beta(input_path, [metric],
                                     tree_path,
                                     output_dir,
                                     rowids=rows)
                else:
                    single_file_beta(input_path,
                                     metric,
                                     tree_path,
                                     output_dir,
                                     rowids=rows)
                col_sams, row_sams, row_dmtx = parse_matrix(open(row_outname))

                self.assertEqual(row_dmtx.shape,
                                 (len(rows.split(',')), len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        npt.assert_almost_equal(row_v1, full_v1)

            # full tree run:
            if 'full_tree' in str(metric).lower():
                continue
            # do it by rows with full tree
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]

                row_outname = output_dir + '/ft/' + metric + '_' +\
                    in_fname
                if use_metric_list:
                    single_file_beta(input_path, [metric],
                                     tree_path,
                                     output_dir + '/ft/',
                                     rowids=rows,
                                     full_tree=True)
                else:
                    single_file_beta(input_path,
                                     metric,
                                     tree_path,
                                     output_dir + '/ft/',
                                     rowids=rows,
                                     full_tree=True)
                col_sams, row_sams, row_dmtx = parse_matrix(open(row_outname))

                self.assertEqual(row_dmtx.shape,
                                 (len(rows.split(',')), len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        npt.assert_almost_equal(row_v1, full_v1)

            # do it with full tree
            if use_metric_list:
                single_file_beta(input_path, [metric],
                                 tree_path,
                                 output_dir + '/ft/',
                                 rowids=None,
                                 full_tree=True)
            else:
                single_file_beta(input_path,
                                 metric,
                                 tree_path,
                                 output_dir + '/ft/',
                                 rowids=None,
                                 full_tree=True)
            sams_ft, dmtx_ft = parse_distmat(
                open(output_dir + '/ft/' + metric + '_' + in_fname))
            self.assertEqual(sams_ft, sams)
            npt.assert_almost_equal(dmtx_ft, dmtx)
Beispiel #55
0
    def setUp(self):
        """Build the mapping file/distance matrix fixtures used by the tests.

        Three fixture sets are prepared from in-memory lists of lines: the
        nine-sample overview tutorial data, a single-sample "tiny" set and a
        two-sample "small" set. For each set the raw lines, the parsed mapping
        file, the sample groups for the field under test and the parsed
        distance matrix are stored on the test case.
        """
        # Overview tutorial distance matrix. The backslash continuations sit
        # inside the string literals, so the indentation of every continued
        # line is part of the data handed to parse_distmat.
        self.dist_matrix_string = [
            "\tPC.354\tPC.355\tPC.356\tPC.481\tPC.593\
                                    \tPC.607\tPC.634\tPC.635\tPC.636",
            "PC.354\t0.0\t0.625\t0.623\t0.61\t0.577\
                                    \t0.729\t0.8\t0.721\t0.765",
            "PC.355\t0.625\t0.0\t0.615\t0.642\t0.673\
                                    \t0.776\t0.744\t0.749\t0.677",
            "PC.356\t0.623\t0.615\t0.0\t0.682\t0.737\
                                    \t0.734\t0.777\t0.733\t0.724",
            "PC.481\t0.61\t0.642\t0.682\t0.0\t0.704\
                                    \t0.696\t0.675\t0.654\t0.696",
            "PC.593\t0.577\t0.673\t0.737\t0.704\t0.0\
                                    \t0.731\t0.758\t0.738\t0.737",
            "PC.607\t0.729\t0.776\t0.734\t0.696\t0.731\
                                    \t0.0\t0.718\t0.666\t0.727",
            "PC.634\t0.8\t0.744\t0.777\t0.675\t0.758\
                                    \t0.718\t0.0\t0.6\t0.578",
            "PC.635\t0.721\t0.749\t0.733\t0.654\t0.738\
                                    \t0.666\t0.6\t0.0\t0.623",
            "PC.636\t0.765\t0.677\t0.724\t0.696\t0.737\
                                    \t0.727\t0.578\t0.623\t0.0"
        ]

        # Overview tutorial mapping file: five "Control" samples followed by
        # four "Fast" samples.
        self.mapping_string = [
            "#SampleID\tBarcodeSequence\tTreatment\tDOB",
            "PC.354\tAGCACGAGCCTA\tControl\t20061218",
            "PC.355\tAACTCGTCGATG\tControl\t20061218",
            "PC.356\tACAGACCACTCA\tControl\t20061126",
            "PC.481\tACCAGCGACTAG\tControl\t20070314",
            "PC.593\tAGCAGCACTTGT\tControl\t20071210",
            "PC.607\tAACTGTGCGTAC\tFast\t20071112",
            "PC.634\tACAGAGTCGGCT\tFast\t20080116",
            "PC.635\tACCGCAGAGTCA\tFast\t20080116",
            "PC.636\tACGGTGAGTGTC\tFast\t20080116"
        ]

        # Mapping file column the tutorial samples are grouped by.
        self.field = 'Treatment'

        # Degenerate fixture: one sample, hence a 1x1 distance matrix.
        self.tiny_dist_matrix_string = [
            "\tSamp.1",
            "Samp.1\t0"
        ]
        self.tiny_mapping_string = [
            "#SampleID\tBarcodeSequence\tSampleField",
            "Samp.1\tAGCACGAGCCTA\tSampleFieldState1"
        ]
        self.tiny_field = 'SampleField'

        # Minimal fixture: two samples, each in its own field state.
        self.small_dist_matrix_string = [
            "\tSamp.1\tSamp.2",
            "Samp.1\t0\t0.5",
            "Samp.2\t0.5\t0"
        ]
        self.small_mapping_string = [
            "#SampleID\tBarcodeSequence\tSampleField",
            "Samp.1\tAGCACGAGCCTA\tSampleFieldState1",
            "Samp.2\tAGCACGAGCCTG\tSampleFieldState2"
        ]
        self.small_field = 'SampleField'

        # Parse the mapping "files" (lists of lines stand in for real files)
        # and group the samples by field. group_by_field receives the header
        # row followed by the data rows.
        parsed_map = parse_mapping_file(self.mapping_string)
        self.mapping, self.mapping_header, self.comments = parsed_map
        self.groups = group_by_field(
            [self.mapping_header] + list(self.mapping), self.field)

        parsed_tiny_map = parse_mapping_file(self.tiny_mapping_string)
        self.tiny_mapping, self.tiny_mapping_header, self.tiny_comments = \
            parsed_tiny_map
        self.tiny_groups = group_by_field(
            [self.tiny_mapping_header] + list(self.tiny_mapping),
            self.tiny_field)

        parsed_small_map = parse_mapping_file(self.small_mapping_string)
        self.small_mapping, self.small_mapping_header, self.small_comments = \
            parsed_small_map
        self.small_groups = group_by_field(
            [self.small_mapping_header] + list(self.small_mapping),
            self.small_field)

        # Parse the distance matrix "files" in the same faked fashion.
        parsed_dm = parse_distmat(self.dist_matrix_string)
        self.dist_matrix_header, self.dist_matrix = parsed_dm

        parsed_tiny_dm = parse_distmat(self.tiny_dist_matrix_string)
        self.tiny_dist_matrix_header, self.tiny_dist_matrix = parsed_tiny_dm

        parsed_small_dm = parse_distmat(self.small_dist_matrix_string)
        self.small_dist_matrix_header, self.small_dist_matrix = \
            parsed_small_dm
Beispiel #56
0
    if opts.otu_table_fp:
        otu_table = parse_biom_table(open(opts.otu_table_fp, 'U'))
        samples_to_keep = otu_table.SampleIds
        #samples_to_keep = \
        # sample_ids_from_otu_table(open(opts.otu_table_fp,'U'))
    elif opts.sample_id_fp:
        samples_to_keep = \
         get_seqs_to_keep_lookup_from_seq_id_file(open(opts.sample_id_fp,'U'))
    elif opts.mapping_fp and opts.valid_states:
        try:
            samples_to_keep = sample_ids_from_metadata_description(
                open(opts.mapping_fp, 'U'), opts.valid_states)
        except ValueError, e:
            option_parser.error(e.message)
    else:
        option_parser.error(
            'must pass either --sample_id_fp, -t, or -m and -s')
    # note that negate gets a little weird here. The function we're calling removes the specified
    # samples from the distance matrix, but the other QIIME filter scripts keep these samples specified.
    # So, the interface of this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    d = filter_samples_from_distance_matrix(parse_distmat(
        open(opts.input_distance_matrix, 'U')),
                                            samples_to_keep,
                                            negate=not opts.negate)
    output_f.write(d)
    output_f.close()


# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
    def setUp(self):
        """Write the fixture files and build the expected data structures.

        Creates the working and histogram directories, writes the distance
        matrix, mapping, prefs and distances fixtures into the working
        directory, and parses the distance matrix and mapping file back in.
        Every file is handled inside a ``with`` block so its handle is closed
        even when writing or parsing raises.
        """
        self.working_dir = '/tmp/distance_histogram_tests/'
        self.histogram_dir = self.working_dir + 'histograms/'
        for directory in (self.working_dir, self.histogram_dir):
            try:
                mkdir(directory)
            except OSError:
                # Typically raised because the directory already exists (for
                # example from an earlier run); it is reused as-is.
                pass

        # Create the distance matrix file, then parse it back in.
        self.dmat_file = self.working_dir + 'dmat.txt'
        with open(self.dmat_file, 'w') as dmat_out:
            dmat_out.write(DISTANCE_MATRIX_STRING)
        with open(self.dmat_file, 'U') as dmat_in:
            self.distance_header, self.dmat = parse_distmat(dmat_in)

        # Create the mapping file, then parse it back in.
        self.map_file = self.working_dir + 'map.txt'
        with open(self.map_file, 'w') as map_out:
            map_out.write(MAPPING_STRING)
        with open(self.map_file, 'U') as map_in:
            mapping, header, _ = parse_mapping_file(map_in)

        # self.mapping is the header row, with '#' put in front of its first
        # column name, followed by the data rows.
        header[0] = '#' + header[0]
        self.mapping = [header]
        self.mapping.extend(mapping)

        # Create the prefs file.
        self.prefs_file = self.working_dir + 'prefs.txt'
        with open(self.prefs_file, 'w') as prefs_out:
            prefs_out.write(str(PREFS))

        # Build the single field dict for the 'Treatment' field.
        self.single_field_treatment = defaultdict(dict)
        self.treatment_groups = group_by_field(self.mapping, 'Treatment')
        self.single_field_treatment['Treatment'] = distances_by_groups(
            self.distance_header, self.dmat, self.treatment_groups)

        # Paired-field data for 'Treatment': one entry per pair of groups,
        # each holding the two group labels and the distances between them.
        self.paired_field_treatment = {'Treatment_to_Treatment': [
            [('Control', 'Control'), ('Fast', 'Fast'),
             array([[0.729, 0.8, 0.721, 0.765],
                    [0.776, 0.744, 0.749, 0.677],
                    [0.734, 0.777, 0.733, 0.724],
                    [0.696, 0.675, 0.654, 0.696],
                    [0.731, 0.758, 0.738, 0.737]])],
            [('Control', 'Control'), ('Control', 'Control'),
             array([0.625, 0.623, 0.61, 0.577, 0.615,
                    0.642, 0.673, 0.682, 0.737, 0.704])],
            [('Fast', 'Fast'), ('Fast', 'Fast'),
             array([0.718, 0.666, 0.727, 0.6, 0.578, 0.623])]
        ]}

        # Create the distances output file.
        self.distances_file = self.working_dir + 'distances_out.txt'
        with open(self.distances_file, 'w') as dist_out:
            dist_out.write(DISTANCES_OUT)
def make_distance_boxplots(dm_f,
                           map_f,
                           fields,
                           width=None,
                           height=6.0,
                           suppress_all_within=False,
                           suppress_all_between=False,
                           suppress_individual_within=False,
                           suppress_individual_between=False,
                           y_min=0.0,
                           y_max=1.0,
                           whisker_length=1.5,
                           box_width=0.5,
                           box_color=None,
                           color_individual_within_by_field=None,
                           sort=None):
    """Generates various types of boxplots for distance comparisons.

    Returns a list of tuples, one for each field. Each tuple contains the
    following:
        1) the name of the field (string)
        2) a matplotlib.figure.Figure object containing the boxplots
        3) a list of lists containing the raw plot data that was passed to mpl
        4) a list of labels for each of the boxplots (string)
        5) a list of mpl-compatible colors (one for each boxplot)

    The Figure can be saved, and the raw data and labels can be useful (for
    example) performing statistical tests or writing the raw data to disk.

    The input arguments are exactly derived from the make_distance_boxplots.py
    script (see the script options for details). To avoid duplicated effort,
    their descriptions are not reproduced here, apart from these notes:

    - dm_f and map_f are handed to parse_distmat and parse_mapping_file.
    - width: when None, each field's figure width is computed from that
      field's own number of boxplots (len(plot_data) * box_width + 2). An
      explicit width is applied unchanged to every field.

    Raises ValueError if no fields are given, if a field is not a mapping
    file column, if the figure width or height is not greater than zero, or
    if no boxplot data was collected for a field (e.g. every plot type was
    suppressed).
    """
    # Parse data files and do some preliminary error checking.
    dm_header, dm_data = parse_distmat(dm_f)
    map_data, map_header, _ = parse_mapping_file(map_f)

    if fields is None or len(fields) < 1:
        raise ValueError("You must provide at least one field to analyze.")

    for field in fields:
        if field not in map_header:
            raise ValueError("The field '%s' is not in the provided mapping "
                             "file. Please supply correct fields "
                             "corresponding to fields in the mapping file." %
                             field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = _cast_y_axis_extrema(y_min)
    y_max = _cast_y_axis_extrema(y_max)

    # Every box except the individual within boxes gets no explicit color
    # when coloring by field was requested, and box_color otherwise. This does
    # not depend on the field, so it is decided once up front.
    color_by_field = color_individual_within_by_field is not None
    default_color = None if color_by_field else box_color

    # Collate the distributions of distances that will comprise each boxplot.
    # Suppress the generation of the indicated types of boxplots.
    results = []
    for field in fields:
        plot_data = []
        plot_labels = []
        plot_colors = []
        legend = None

        if not suppress_all_within:
            plot_data.append(
                get_all_grouped_distances(dm_header,
                                          dm_data,
                                          map_header,
                                          map_data,
                                          field,
                                          within=True))
            plot_labels.append("All within %s" % field)
            plot_colors.append(default_color)

        if not suppress_all_between:
            plot_data.append(
                get_all_grouped_distances(dm_header,
                                          dm_data,
                                          map_header,
                                          map_data,
                                          field,
                                          within=False))
            plot_labels.append("All between %s" % field)
            plot_colors.append(default_color)

        if not suppress_individual_within:
            within_dists = get_grouped_distances(dm_header,
                                                 dm_data,
                                                 map_header,
                                                 map_data,
                                                 field,
                                                 within=True)
            field_states = []
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                field_states.append(grouping[0])

            # If we need to color these boxplots by a field, build up a
            # list of colors and a legend.
            if color_by_field:
                colors, color_mapping = _color_field_states(
                    format_mapping_file(map_header,
                                        map_data).split('\n'), dm_header,
                    field, field_states, color_individual_within_by_field)
                plot_colors.extend(colors)
                legend = (color_mapping.values(), color_mapping.keys())
            else:
                plot_colors.extend([box_color] * len(field_states))

        if not suppress_individual_between:
            between_dists = get_grouped_distances(dm_header,
                                                  dm_data,
                                                  map_header,
                                                  map_data,
                                                  field,
                                                  within=False)

            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                plot_colors.append(default_color)

        # Internal invariant: one label and one color per boxplot.
        assert len(plot_data) == len(plot_labels) == len(plot_colors), \
            "The number " +\
            "of boxplot labels and colors do not match the number of " +\
            "boxplots."

        if not plot_data:
            raise ValueError("The generation of all plots was suppressed. At "
                             "least one type of plot must be unsuppressed.")

        # We now have our data and labels ready, so plot them!
        if sort is not None:
            plot_data, plot_labels, plot_colors = _sort_distributions(
                plot_data, plot_labels, plot_colors, sort)

        # Work out the width in a per-field local. Assigning the automatic
        # value back to `width` would make every later field reuse the first
        # field's width, however many boxplots it has.
        figure_width = width
        if figure_width is None:
            figure_width = len(plot_data) * box_width + 2
        if figure_width <= 0 or height <= 0:
            raise ValueError("The specified width and height of the plot "
                             "must be greater than zero.")

        plot_figure = boxplots(plot_data,
                               x_tick_labels=plot_labels,
                               title="%s Distances" % field,
                               x_label="Grouping",
                               y_label="Distance",
                               x_tick_labels_orientation='vertical',
                               y_min=y_min,
                               y_max=y_max,
                               whisker_length=whisker_length,
                               box_width=box_width,
                               box_colors=plot_colors,
                               figure_width=figure_width,
                               figure_height=height,
                               legend=legend)

        results.append(
            (field, plot_figure, plot_data, plot_labels, plot_colors))

    return results