コード例 #1
0
def main():
    """Write the compatible versions of two distance matrices.

    The two input filepaths come from the comma-separated ``opts.input_dms``
    and the two output filepaths from the comma-separated
    ``opts.output_dms``. Both inputs are parsed with parse_distmat, passed
    through make_compatible_distance_matrices, and each resulting matrix is
    written (via format_distance_matrix) to the output filepath in the same
    position.

    Calls ``option_parser.error`` unless exactly two input and two output
    filepaths are given.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dm_fps = opts.input_dms.split(',')
    output_dm_fps = opts.output_dms.split(',')
    if len(input_dm_fps) != 2 or len(output_dm_fps) != 2:
        option_parser.error("You must provide exactly two input and output "
            "distance matrix filepaths.")

    # Parse each input exactly once and close its handle as soon as the
    # (labels, data) tuple has been built.
    parsed_dms = []
    for input_dm_fp in input_dm_fps:
        with open(input_dm_fp, 'U') as input_f:
            parsed_dms.append(parse_distmat(input_f))

    (dm1_labels, dm1), (dm2_labels, dm2) = make_compatible_distance_matrices(
        parsed_dms[0], parsed_dms[1])
    # Sanity check on the helper's result: both matrices must list the
    # sample IDs in the same order before they are written out.
    assert (dm1_labels == dm2_labels), "The order of sample IDs is not the " +\
        "same for the two matrices."

    with open(output_dm_fps[0], 'w') as output1_f:
        output1_f.write(format_distance_matrix(dm1_labels, dm1))
    with open(output_dm_fps[1], 'w') as output2_f:
        output2_f.write(format_distance_matrix(dm2_labels, dm2))
コード例 #2
0
ファイル: beta_diversity.py プロジェクト: jairideout/qiime
def assemble_distance_matrix(dm_components):
    """ assemble distance matrix components into a complete dm string

    dm_components: iterable of components, each an iterable of lines. The
     first non-blank line of a component is taken as its column ids; every
     later non-blank line is a row id followed by that row's values.

    Returns the string built by format_distance_matrix from the merged rows,
    using the collected row ids for both rows and columns. If the same row
    id appears more than once, the last occurrence wins.
    """
    data = {}
    for component in dm_components:
        # column ids are per component, so reset them for each one
        col_ids = []
        for line in component:
            # split() without an argument splits on any run of whitespace
            fields = line.strip().split()
            if not fields:
                continue
            if not col_ids:
                # first non-blank line of the component: the column ids
                col_ids = fields
            else:
                # data row: row id followed by one value per column id
                data[fields[0]] = dict(zip(col_ids, fields[1:]))

    # materialize the row ids once so rows and columns share one ordering
    labels = list(data)
    dm = [[data[row_id][col_id] for col_id in labels] for row_id in labels]
    return format_distance_matrix(labels, dm)
コード例 #3
0
ファイル: shared_phylotypes.py プロジェクト: jairideout/qiime
def calc_shared_phylotypes(infile, reference_sample=None):
    """Count the phylotypes shared by every pair of samples.

    infile: otu table filehandle

    reference_sample: if set, this sample name is passed along with each
                      pair of samples so the count covers all three, e.g.
                      when the reference sample is the Donor in a
                      transplant study

    Returns the formatted matrix of counts followed by a newline.
    """
    otu_table = parse_biom_table(infile)
    sample_ids = otu_table.SampleIds
    use_reference = bool(reference_sample)

    sample_count = len(sample_ids)
    shared = zeros((sample_count, sample_count), dtype=int)
    # only the lower triangle (including the diagonal) is computed; each
    # value is mirrored into the upper triangle
    for row, row_id in enumerate(sample_ids):
        for col in range(row + 1):
            col_id = sample_ids[col]
            if use_reference:
                count = _calc_shared_phylotypes_multiple(
                    otu_table, [row_id, col_id, reference_sample])
            else:
                count = _calc_shared_phylotypes_pairwise(
                    otu_table, row_id, col_id)
            shared[row, col] = count
            shared[col, row] = count

    return format_distance_matrix(otu_table.SampleIds, shared) + "\n"
コード例 #4
0
ファイル: beta_diversity.py プロジェクト: rob-knight/qiime
def assemble_distance_matrix(dm_components):
    """ assemble distance matrix components into a complete dm string

    dm_components: iterable of components, each an iterable of lines. The
     first non-blank line of a component is taken as its column ids; every
     later non-blank line is a row id followed by that row's values.

    Returns the string built by format_distance_matrix from the merged rows,
    using the collected row ids for both rows and columns. If the same row
    id appears more than once, the last occurrence wins.
    """
    data = {}
    for component in dm_components:
        # column ids are per component, so reset them for each one
        col_ids = []
        for line in component:
            # split() without an argument splits on any run of whitespace
            fields = line.strip().split()
            if not fields:
                continue
            if not col_ids:
                # first non-blank line of the component: the column ids
                col_ids = fields
            else:
                # data row: row id followed by one value per column id
                data[fields[0]] = dict(zip(col_ids, fields[1:]))

    # materialize the row ids once so rows and columns share one ordering
    labels = list(data)
    dm = [[data[row_id][col_id] for col_id in labels] for row_id in labels]
    return format_distance_matrix(labels, dm)
コード例 #5
0
def calc_shared_phylotypes(infile, reference_sample=None):
    """Count the phylotypes shared by every pair of samples.

    infile: otu table filehandle

    reference_sample: if set, the index of this sample name is passed along
                      with each pair of sample indices so the count covers
                      all three, e.g. when the reference sample is the Donor
                      in a transplant study

    Returns the formatted matrix of counts followed by a newline.
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(infile)

    use_reference = bool(reference_sample)
    if use_reference:
        reference_index = sample_ids.index(reference_sample)

    # the second dimension of the table is used as the number of samples
    _, sample_count = otu_table.shape
    shared = zeros((sample_count, sample_count), dtype=int)
    # fill the lower triangle (with diagonal) and mirror each value
    for row in range(sample_count):
        for col in range(row + 1):
            if use_reference:
                count = _calc_shared_phylotypes_multiple(
                    otu_table, [row, col, reference_index])
            else:
                count = _calc_shared_phylotypes_pairwise(otu_table, row, col)
            shared[row, col] = count
            shared[col, row] = count

    return format_distance_matrix(sample_ids, shared) + "\n"
コード例 #6
0
def distance_matrix(input_path, column):
    """ builds the absolute-difference matrix for one mapping file column

    inputs:
     input_path (file handler)
     column (str)

    Writes a message to stderr and exits with status 1 if an entry lacks the
    column or holds a value that float() rejects. Returns the formatted
    distance matrix.
    """
    mapping, comments = parse_mapping_file_to_dict(input_path)
    sample_ids = []
    values = []
    for sample_id in mapping:
        sample_row = mapping[sample_id]
        if column not in sample_row:
            stderr.write("\n\nNo column: '%s' in the mapping file. Existing columns are: %s\n\n" % (column, sample_row.keys()))
            exit(1)

        raw_value = sample_row[column]
        try:
            values.append(float(raw_value))
        except ValueError:
            stderr.write("\n\nall the values in the column '%s' must be numeric but '%s' has '%s'\n\n"
                         % (column, sample_id, raw_value))
            exit(1)

        sample_ids.append(sample_id)

    # subtracting the (n, 1) column from the flat vector broadcasts to an
    # n x n table of pairwise differences
    value_vector = array(values)
    value_column = reshape(value_vector, (1, len(value_vector))).T
    differences = abs(value_vector - value_column)

    return format_distance_matrix(sample_ids, differences)
コード例 #7
0
ファイル: shared_phylotypes.py プロジェクト: DDomogala3/qiime
def calc_shared_phylotypes(infile, reference_sample=None):
    """Count the phylotypes shared by every pair of samples.

    infile: otu table filehandle

    reference_sample: if set, this sample name is passed along with each
                      pair of samples so the count covers all three, e.g.
                      when the reference sample is the Donor in a
                      transplant study

    Returns the formatted matrix of counts followed by a newline.
    """
    otu_table = parse_biom_table(infile)
    sample_ids = otu_table.SampleIds
    use_reference = bool(reference_sample)

    sample_count = len(sample_ids)
    shared = zeros((sample_count, sample_count), dtype=int)
    # only the lower triangle (including the diagonal) is computed; each
    # value is mirrored into the upper triangle
    for row, row_id in enumerate(sample_ids):
        for col in range(row + 1):
            col_id = sample_ids[col]
            if use_reference:
                count = _calc_shared_phylotypes_multiple(
                    otu_table, [row_id, col_id, reference_sample])
            else:
                count = _calc_shared_phylotypes_pairwise(
                    otu_table, row_id, col_id)
            shared[row, col] = count
            shared[col, row] = count

    return format_distance_matrix(otu_table.SampleIds, shared) + "\n"
コード例 #8
0
def distance_matrix(input_path, column):
    """ builds the absolute-difference matrix for one mapping file column

    inputs:
     input_path (file handler)
     column (str)

    Writes a message to stderr and exits with status 1 if an entry lacks the
    column or holds a value that float() rejects. Returns the formatted
    distance matrix.
    """
    mapping, comments = parse_mapping_file_to_dict(input_path)
    sample_ids = []
    values = []
    for sample_id in mapping:
        sample_row = mapping[sample_id]
        if column not in sample_row:
            stderr.write(
                "\n\nNo column: '%s' in the mapping file. Existing columns are: %s\n\n"
                % (column, sample_row.keys()))
            exit(1)

        raw_value = sample_row[column]
        try:
            values.append(float(raw_value))
        except ValueError:
            stderr.write("\n\nall the values in the column '%s' must be numeric but '%s' has '%s'\n\n"
                         % (column, sample_id, raw_value))
            exit(1)

        sample_ids.append(sample_id)

    # subtracting the (n, 1) column from the flat vector broadcasts to an
    # n x n table of pairwise differences
    value_vector = array(values)
    value_column = reshape(value_vector, (1, len(value_vector))).T
    differences = abs(value_vector - value_column)

    return format_distance_matrix(sample_ids, differences)
コード例 #9
0
ファイル: test_format.py プロジェクト: Gaby1212/qiime
 def test_format_distance_matrix(self):
     """format_distance_matrix yields a tab-delimited matrix with headers"""
     matrix = array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     sample_labels = [11, 22, 33]
     expected = "\t11\t22\t33\n11\t1\t2\t3\n22\t4\t5\t6\n33\t7\t8\t9"
     self.assertEqual(format_distance_matrix(sample_labels, matrix), expected)
     # two labels for a 3x3 matrix must be rejected
     too_few_labels = sample_labels[:2]
     self.assertRaises(ValueError, format_distance_matrix, too_few_labels,
                       matrix)
コード例 #10
0
def calc_shared_phylotypes(infile, reference_sample=None):
    """Count the phylotypes shared by every pair of samples.

    infile: otu table filehandle

    reference_sample: if set, the index of this sample name is passed along
                      with each pair of sample indices so the count covers
                      all three, e.g. when the reference sample is the Donor
                      in a transplant study

    Returns the formatted matrix of counts followed by a newline.
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(infile)

    use_reference = bool(reference_sample)
    if use_reference:
        reference_index = sample_ids.index(reference_sample)

    # the second dimension of the table is used as the number of samples
    _, sample_count = otu_table.shape
    shared = zeros((sample_count, sample_count), dtype=int)
    # fill the lower triangle (with diagonal) and mirror each value
    for row in range(sample_count):
        for col in range(row + 1):
            if use_reference:
                count = _calc_shared_phylotypes_multiple(
                    otu_table, [row, col, reference_index])
            else:
                count = _calc_shared_phylotypes_pairwise(otu_table, row, col)
            shared[row, col] = count
            shared[col, row] = count

    return format_distance_matrix(sample_ids, shared) + "\n"
コード例 #11
0
ファイル: test_format.py プロジェクト: davidvilanova/qiime
 def test_format_distance_matrix(self):
     """format_distance_matrix yields a tab-delimited matrix with headers"""
     matrix = array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     sample_labels = [11, 22, 33]
     expected = '\t11\t22\t33\n11\t1\t2\t3\n22\t4\t5\t6\n33\t7\t8\t9'
     self.assertEqual(format_distance_matrix(sample_labels, matrix), expected)
     # two labels for a 3x3 matrix must be rejected
     too_few_labels = sample_labels[:2]
     self.assertRaises(ValueError, format_distance_matrix, too_few_labels,
                       matrix)
コード例 #12
0
def main():
    """Write a distance matrix computed from mapping file metadata.

    ``opts.column`` selects the metadata. A single column name is converted
    to floats and passed to compute_distance_matrix_from_metadata; two
    comma-separated names are read as latitude,longitude and passed to
    calculate_dist_vincenty. The formatted matrix is written to
    ``opts.output_fp``.

    Raises ValueError if a requested column is missing from an entry, a
    value cannot be converted to float, or ``opts.column`` does not split
    into exactly two names on commas.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data, comments = parse_mapping_file_to_dict(opts.input_path)
    column_headers = []
    if ',' not in opts.column:
        column_data = []
        column_name = opts.column
        for i in data:
            if column_name not in data[i]:
                raise ValueError(
                    "No column: '%s' in the mapping file. Existing columns are: %s"
                    % (column_name, data[i].keys()))

            try:
                column_data.append(float(data[i][column_name]))
            except ValueError:
                raise ValueError(
                    "All the values in the column '%s' must be numeric but '%s' has '%s'"
                    % (column_name, i, data[i][column_name]))

            column_headers.append(i)
        dtx_mtx = compute_distance_matrix_from_metadata(column_data)
    else:
        latitudes = []
        longitudes = []
        try:
            latitude, longitude = opts.column.split(',')
        except ValueError:
            raise ValueError(
                "This script accepts a maximum of 2 colums separated by comma and you passed: %s"
                % (opts.column))

        for i in data:
            if latitude not in data[i] or longitude not in data[i]:
                raise ValueError(
                    "One of these columns or both do not exist: '%s' or '%s' in the mapping file. Existing columns are: %s"
                    % (latitude, longitude, data[i].keys()))

            try:
                latitudes.append(float(data[i][latitude]))
                longitudes.append(float(data[i][longitude]))
            except ValueError:
                # column_name is only bound in the single-column branch, so
                # report the two values read for this entry instead
                raise ValueError(
                    "All the values in the columnd '%s' & '%s' must be numeric but '%s' has '%s'"
                    % (latitude, longitude, i,
                       "%s,%s" % (data[i][latitude], data[i][longitude])))

            column_headers.append(i)

        dtx_mtx = calculate_dist_vincenty(latitudes, longitudes)

    dtx_txt = format_distance_matrix(column_headers, dtx_mtx)

    # the context manager closes the output even if the write fails
    with open(opts.output_fp, 'w') as f:
        f.write(dtx_txt)
コード例 #13
0
def main():
    """Write a distance matrix computed from mapping file metadata.

    ``opts.column`` selects the metadata. A single column name is converted
    to floats and passed to compute_distance_matrix_from_metadata; two
    comma-separated names are read as latitude,longitude and passed to
    calculate_dist_vincenty. The formatted matrix is written to
    ``opts.output_fp``.

    Raises ValueError if a requested column is missing from an entry, a
    value cannot be converted to float, or ``opts.column`` does not split
    into exactly two names on commas.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data, comments = parse_mapping_file_to_dict(opts.input_path)
    column_headers = []
    if ',' not in opts.column:
        column_data = []
        column_name = opts.column
        for i in data:
            if column_name not in data[i]:
                raise ValueError(
                    "No column: '%s' in the mapping file. Existing columns are: %s" %
                    (column_name, data[i].keys()))

            try:
                column_data.append(float(data[i][column_name]))
            except ValueError:
                raise ValueError(
                    "All the values in the column '%s' must be numeric but '%s' has '%s'" %
                    (column_name, i, data[i][column_name]))

            column_headers.append(i)
        dtx_mtx = compute_distance_matrix_from_metadata(column_data)
    else:
        latitudes = []
        longitudes = []
        try:
            latitude, longitude = opts.column.split(',')
        except ValueError:
            raise ValueError(
                "This script accepts a maximum of 2 colums separated by comma and you passed: %s" %
                (opts.column))

        for i in data:
            if latitude not in data[i] or longitude not in data[i]:
                raise ValueError(
                    "One of these columns or both do not exist: '%s' or '%s' in the mapping file. Existing columns are: %s" %
                    (latitude, longitude, data[i].keys()))

            try:
                latitudes.append(float(data[i][latitude]))
                longitudes.append(float(data[i][longitude]))
            except ValueError:
                # column_name is only bound in the single-column branch, so
                # report the two values read for this entry instead
                raise ValueError(
                    "All the values in the columnd '%s' & '%s' must be numeric but '%s' has '%s'" %
                    (latitude, longitude, i,
                     "%s,%s" % (data[i][latitude], data[i][longitude])))

            column_headers.append(i)

        dtx_mtx = calculate_dist_vincenty(latitudes, longitudes)

    dtx_txt = format_distance_matrix(column_headers, dtx_mtx)

    # the context manager closes the output even if the write fails
    with open(opts.output_fp, 'w') as f:
        f.write(dtx_txt)
コード例 #14
0
ファイル: test_format.py プロジェクト: TankMermaid/qiime-1
 def test_format_distance_matrix_almost_zero_diagonal(self):
     # Near-zero values on the diagonal are expected to come out as 0.0,
     # while the same tiny value off the diagonal must be left untouched.
     tiny = 0.0000000000001
     matrix = array([[0.00001, 1, tiny],
                     [1.0, tiny, 3],
                     [tiny, 3.0, 0.0]])
     expected = ('\tfoo\tbar\tbaz\n'
                 'foo\t1e-05\t1.0\t1e-13\n'
                 'bar\t1.0\t0.0\t3.0\n'
                 'baz\t1e-13\t3.0\t0.0')
     formatted = format_distance_matrix(['foo', 'bar', 'baz'], matrix)
     self.assertEqual(formatted, expected)
コード例 #15
0
def main():
    """Write a copy of a distance matrix with its labels shuffled.

    The matrix at ``opts.input_distance_matrix`` is parsed, its labels are
    shuffled in place, and the shuffled labels are written together with the
    unchanged distance data to ``opts.output_distance_matrix``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # close the input as soon as it has been parsed
    with open(opts.input_distance_matrix, 'U') as input_f:
        labels, dm_data = parse_distmat(input_f)

    # only the labels are permuted; dm_data keeps its original order
    shuffle(labels)

    with open(opts.output_distance_matrix, 'w') as output_f:
        output_f.write(format_distance_matrix(labels, dm_data))
コード例 #16
0
def compute_procrustes(result_tables, expected_pc_lookup, taxonomy_level=6, num_dimensions=3, random_trials=999):
    """ Compute Procrustes M2 and p-values for a set of results

        result_tables: 2d list of tables to be compared to expected tables,
         where the data in the inner list is:
          [dataset_id, reference_database_id, method_id,
           parameter_combination_id, table_fp]
        expected_pc_lookup: 2d dict of dataset_id, reference_db_id to the
         filepath of the expected principal coordinate matrix
        taxonomy_level: level to compute results
        num_dimensions: passed to procrustes_monte_carlo as max_dimensions
        random_trials: passed to procrustes_monte_carlo as trials

        This is a generator. For each entry of result_tables it yields
         (dataset_id, reference_id, method_id, params, actual_m_squared,
          mc_p_value)

        Raises KeyError if no expected table is registered for a
         (dataset_id, reference_id) pair, and ValueError if the actual BIOM
         table cannot be parsed.
    """
    ### The lookup/parse/collapse steps were copied almost directly from
    ### compute_prfs (the expected table is a pc matrix here) - some
    ### re-factoring for re-use is in order.

    for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables:
        ## find the expected pc matrix for this dataset/reference pair
        try:
            expected_pc_fp = expected_pc_lookup[dataset_id][reference_id]
        except KeyError:
            raise KeyError("Can't find expected table for (%s, %s)." % (dataset_id, reference_id))

        ## parse the actual table and collapse it at the specified taxonomic level
        try:
            with open(actual_table_fp, "U") as actual_table_f:
                actual_table = parse_biom_table(actual_table_f)
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s" % actual_table_fp)
        collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level)
        actual_table = actual_table.collapseObservationsByMetadata(collapse_by_taxonomy)

        # Why the conversions below are needed:
        # (1) dist_bray_curtis doesn't take a BIOM Table object
        # (2) pcoa takes a qiime-formatted distance matrix as a list of lines
        # (3) pcoa returns a qiime-formatted pc matrix
        # (4) procrustes_monte_carlo needs to pass through the pc "file"
        #     multiple times, so the pcs passed in have to be lists of lines
        dm = dist_bray_curtis(asarray([v for v in actual_table.iterSampleData()]))
        formatted_dm = format_distance_matrix(actual_table.SampleIds, dm)
        actual_pc = pcoa(formatted_dm.split("\n")).split("\n")
        with open(expected_pc_fp, "U") as expected_pc_f:
            expected_pc = list(expected_pc_f)

        ## run Procrustes analysis with monte carlo simulation
        actual_m_squared, trial_m_squareds, count_better, mc_p_value = procrustes_monte_carlo(
            expected_pc,
            actual_pc,
            trials=random_trials,
            max_dimensions=num_dimensions,
            sample_id_map=None,
            trial_output_dir=None,
        )

        yield (dataset_id, reference_id, method_id, params, actual_m_squared, mc_p_value)
コード例 #17
0
    def test_single_file_nj(self):
        """ single_file_nj runs without error and creates its output file"""

        labels = ["hi", "ho", "yo"]
        distances = numpy.array(
            [[0, 0.5, 0.3], [0.5, 0.0, 0.9], [0.3, 0.9, 0.0]])

        # write the input distance matrix to a temporary file
        dm_fp = get_tmp_filename(prefix="nj_", suffix=".txt")
        with open(dm_fp, "w") as dm_file:
            self._paths_to_clean_up.append(dm_fp)
            dm_file.write(format_distance_matrix(labels, distances))

        # the output path is registered for cleanup before it is created
        tree_fp = get_tmp_filename(
            prefix="nj_", suffix=".txt", result_constructor=str)
        self._paths_to_clean_up.append(tree_fp)
        single_file_nj(dm_fp, tree_fp)
        assert os.path.exists(tree_fp)
コード例 #18
0
def main():
    """Write mean, median and stdev matrices for a directory of matrices.

    Every file in ``opts.input_dir`` whose name does not start with '.' is
    parsed as a distance matrix. The results of matrix_stats are written to
    means.txt, medians.txt and stdevs.txt in ``opts.output_dir``, which is
    created if it does not exist.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    indir = opts.input_dir
    outdir = opts.output_dir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # input: parse every non-hidden file in the input directory
    headers_list = []
    distmats = []
    for fname in os.listdir(indir):
        if fname.startswith('.'):
            continue
        dm_file = open(os.path.join(indir,fname), 'U')
        headers, data = parse_distmat(dm_file)
        dm_file.close()
        distmats.append(data)
        headers_list.append(headers)

    # calcs
    headers, means, medians, stdevs = matrix_stats(headers_list, distmats)

    # output: one file per statistic, written in this order
    outputs = (('means.txt', means),
               ('medians.txt', medians),
               ('stdevs.txt', stdevs))
    for out_name, matrix in outputs:
        out_file = open(os.path.join(outdir,out_name), 'w')
        out_file.write(format_distance_matrix(headers,matrix))
        out_file.close()
コード例 #19
0
def main():
    """Write mean, median and stdev matrices for a directory of matrices.

    Every file in ``opts.input_dir`` whose name does not start with '.' is
    parsed as a distance matrix. The results of matrix_stats are written to
    means.txt, medians.txt and stdevs.txt in ``opts.output_dir``, which is
    created if it does not exist.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    indir = opts.input_dir
    outdir = opts.output_dir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # input: parse every non-hidden file in the input directory
    headers_list = []
    distmats = []
    for fname in os.listdir(indir):
        if fname.startswith('.'):
            continue
        dm_file = open(os.path.join(indir, fname), 'U')
        headers, data = parse_distmat(dm_file)
        dm_file.close()
        distmats.append(data)
        headers_list.append(headers)

    # calcs
    headers, means, medians, stdevs = matrix_stats(headers_list, distmats)

    # output: one file per statistic, written in this order
    outputs = (('means.txt', means),
               ('medians.txt', medians),
               ('stdevs.txt', stdevs))
    for out_name, matrix in outputs:
        out_file = open(os.path.join(outdir, out_name), 'w')
        out_file.write(format_distance_matrix(headers, matrix))
        out_file.close()
コード例 #20
0
 def test_single_file_upgma(self):
     """ single_file_upgma runs without error and creates its output file"""

     labels = ['hi','ho']
     distances = numpy.array([[0,.5],[.5,0.]])

     # write the input distance matrix to a temporary file
     dm_fp = get_tmp_filename(prefix='upgma_',suffix='.txt')
     with open(dm_fp,'w') as dm_file:
         self._paths_to_clean_up.append(dm_fp)
         dm_file.write(format_distance_matrix(labels, distances))

     # the output path is registered for cleanup before it is created
     tree_fp = get_tmp_filename(prefix='upgma_',suffix='.txt',
         result_constructor=str)
     self._paths_to_clean_up.append(tree_fp)
     single_file_upgma(dm_fp,tree_fp)
     assert os.path.exists(tree_fp)
コード例 #21
0
    def test_single_file_nj(self):
        """ single_file_nj runs without error and creates its output file"""

        labels = ['hi','ho','yo']
        distances = numpy.array([[0,.5,.3],[.5,0.,.9],[.3,.9,0.]])

        # write the input distance matrix to a temporary file
        dm_fp = get_tmp_filename(prefix='nj_',suffix='.txt')
        with open(dm_fp,'w') as dm_file:
            self._paths_to_clean_up.append(dm_fp)
            dm_file.write(format_distance_matrix(labels, distances))

        # the output path is registered for cleanup before it is created
        tree_fp = get_tmp_filename(prefix='nj_',suffix='.txt',
            result_constructor=str)
        self._paths_to_clean_up.append(tree_fp)
        single_file_nj(dm_fp,tree_fp)
        assert os.path.exists(tree_fp)
コード例 #22
0
    def test_single_file_upgma(self):
        """ single_file_upgma runs without error and creates its output file"""

        labels = ["hi", "ho"]
        distances = numpy.array([[0, 0.5], [0.5, 0.0]])

        # mkstemp hands back an open descriptor; only the path is needed
        dm_fd, dm_fp = mkstemp(prefix="upgma_", suffix=".txt")
        close(dm_fd)
        with open(dm_fp, "w") as dm_file:
            self._paths_to_clean_up.append(dm_fp)
            dm_file.write(format_distance_matrix(labels, distances))

        tree_fd, tree_fp = mkstemp(prefix="upgma_", suffix=".txt")
        close(tree_fd)
        self._paths_to_clean_up.append(tree_fp)
        single_file_upgma(dm_fp, tree_fp)
        assert os.path.exists(tree_fp)
コード例 #23
0
    def test_single_file_nj(self):
        """ single_file_nj runs without error and creates its output file"""

        labels = ['hi', 'ho', 'yo']
        distances = numpy.array([[0, .5, .3], [.5, 0., .9], [.3, .9, 0.]])

        # mkstemp hands back an open descriptor; only the path is needed
        dm_fd, dm_fp = mkstemp(prefix='nj_', suffix='.txt')
        close(dm_fd)
        with open(dm_fp, 'w') as dm_file:
            self._paths_to_clean_up.append(dm_fp)
            dm_file.write(format_distance_matrix(labels, distances))

        tree_fd, tree_fp = mkstemp(prefix='nj_', suffix='.txt')
        close(tree_fd)
        self._paths_to_clean_up.append(tree_fp)
        single_file_nj(dm_fp, tree_fp)
        assert os.path.exists(tree_fp)
コード例 #24
0
    def test_single_file_upgma(self):
        """ single_file_upgma runs without error and creates its output file"""

        labels = ['hi', 'ho']
        distances = numpy.array([[0, .5], [.5, 0.]])

        # mkstemp hands back an open descriptor; only the path is needed
        dm_fd, dm_fp = mkstemp(prefix='upgma_', suffix='.txt')
        close(dm_fd)
        with open(dm_fp, 'w') as dm_file:
            self._paths_to_clean_up.append(dm_fp)
            dm_file.write(format_distance_matrix(labels, distances))

        tree_fd, tree_fp = mkstemp(prefix='upgma_', suffix='.txt')
        close(tree_fd)
        self._paths_to_clean_up.append(tree_fp)
        single_file_upgma(dm_fp, tree_fp)
        assert os.path.exists(tree_fp)
コード例 #25
0
ファイル: filter.py プロジェクト: nbresnick/qiime
def filter_samples_from_distance_matrix(dm, samples_to_discard, negate=False):
    """ Remove specified samples from distance matrix

        dm: (sample_ids, dm_data) tuple, as returned from
         qiime.parse.parse_distmat; or a file handle that can be passed
         to qiime.parse.parse_distmat
        samples_to_discard: iterable of strings; the first
         whitespace-delimited token of each entry is used as a sample id.
         Blank entries are ignored.
        negate: if True, keep only the listed samples instead of removing
         them

        Returns the filtered matrix as formatted by format_distance_matrix.
    """
    if hasattr(dm, 'readline'):
        # A real file handle must go straight to the parser: trying to
        # tuple-unpack it would consume its first lines before failing.
        sample_ids, dm_data = parse_distmat(dm)
    else:
        try:
            sample_ids, dm_data = dm
        except ValueError:
            # input was provided as something other than a 2-tuple,
            # e.g. a list of lines
            sample_ids, dm_data = parse_distmat(dm)

    # skip blank entries, which have no first token to take
    listed_ids = set(e.split()[0] for e in samples_to_discard if e.strip())

    if negate:
        def keep_sample(s):
            return s in listed_ids
    else:
        def keep_sample(s):
            return s not in listed_ids

    # first pass: drop the rows of unwanted samples
    kept_rows = []
    new_sample_ids = []
    for row, sample_id in zip(dm_data, sample_ids):
        if keep_sample(sample_id):
            kept_rows.append(row)
            new_sample_ids.append(sample_id)

    # second pass: transpose so the columns can be dropped the same way,
    # then transpose back
    columns = array(kept_rows).transpose()
    kept_columns = [col for col, sample_id in zip(columns, sample_ids)
                    if keep_sample(sample_id)]
    new_dm_data = array(kept_columns).transpose()

    return format_distance_matrix(new_sample_ids, new_dm_data)
コード例 #26
0
ファイル: filter.py プロジェクト: Sandy4321/FizzyQIIME
def filter_samples_from_distance_matrix(dm, samples_to_discard, negate=False):
    """ Remove specified samples from distance matrix

        dm: (sample_ids, dm_data) tuple, as returned from
         qiime.parse.parse_distmat; or a file handle that can be passed
         to qiime.parse.parse_distmat
        samples_to_discard: iterable of strings; the first
         whitespace-delimited token of each entry is used as a sample id.
         Blank entries are ignored.
        negate: if True, keep only the listed samples instead of removing
         them

        Returns the filtered matrix as formatted by format_distance_matrix.
    """
    if hasattr(dm, 'readline'):
        # A real file handle must go straight to the parser: trying to
        # tuple-unpack it would consume its first lines before failing.
        sample_ids, dm_data = parse_distmat(dm)
    else:
        try:
            sample_ids, dm_data = dm
        except ValueError:
            # input was provided as something other than a 2-tuple,
            # e.g. a list of lines
            sample_ids, dm_data = parse_distmat(dm)

    # skip blank entries, which have no first token to take
    listed_ids = set(e.split()[0] for e in samples_to_discard if e.strip())

    if negate:
        def keep_sample(s):
            return s in listed_ids
    else:
        def keep_sample(s):
            return s not in listed_ids

    # first pass: drop the rows of unwanted samples
    kept_rows = []
    new_sample_ids = []
    for row, sample_id in zip(dm_data, sample_ids):
        if keep_sample(sample_id):
            kept_rows.append(row)
            new_sample_ids.append(sample_id)

    # second pass: transpose so the columns can be dropped the same way,
    # then transpose back
    columns = array(kept_rows).transpose()
    kept_columns = [col for col, sample_id in zip(columns, sample_ids)
                    if keep_sample(sample_id)]
    new_dm_data = array(kept_columns).transpose()

    return format_distance_matrix(new_sample_ids, new_dm_data)
コード例 #27
0
ファイル: beta_diversity.py プロジェクト: wilkox/qiime
def single_object_beta(otu_table, metrics, tr, rowids=None, full_tree=False):
    """mod of single_file_beta that takes an otu table object and a tree

    Nothing is written to disk here; the formatted result is returned.
    assumes input tree is already trimmed to contain only otus present
    in otu_table, doesn't call getSubTree()
    inputs:
                otu_table -- a otu_table in the biom format
                metrics -- metrics (str, comma delimited if more than 1 metric)
                tr -- a phylonode cogent tree object if needed by the chosen beta
                                        diversity metric
                rowids -- comma separated string of sample ids; if given, only
                                        the rows for those samples are computed
                full_tree -- metric functions are called with
                                        make_subtree=(not full_tree)
    returns:
                rowids is None -- the output of format_distance_matrix, split
                                        into a list of lines
                otherwise -- the output of format_matrix for the requested rows

    NOTE(review): both branches below return from inside the loop over
    metrics, so only the first metric in `metrics` is ever evaluated.

    Writes a message to stderr and calls exit(1) if the metric is unknown
    or is phylogenetic and no tree was supplied.
    """
    # Build the matrix handed to the metric functions. It is indexed below
    # by positions taken from otu_table.SampleIds, so presumably there is
    # one row per sample -- TODO confirm for the DenseTable branch.
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    # any falsy tr is normalized to None so the check below is uniform
    if tr:
        tree = tr
    else:
        tree = None

    metrics_list = metrics.split(',')

    for metric in metrics_list:
        # look the metric up as non-phylogenetic first, then fall back to
        # the phylogenetic lookup; AttributeError signals "not found"
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n" %
                        (metric, ))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n" %
                    (metric, ', '.join(list_known_metrics())))
                exit(1)
        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx,
                                   otu_table.ObservationIds,
                                   tree,
                                   otu_table.SampleIds,
                                   make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)

            # returns on the first metric (see NOTE in the docstring)
            return (format_distance_matrix(otu_table.SampleIds,
                                           dissims).split('\n'))
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = otu_table.SampleIds.index(rowid)

                # first test whether the dissim is a fn of only the pair;
                # for the metrics named here it is not, so just calc the
                # whole matrix and keep the requested row
                if metric_f.__name__ == 'dist_chisq' or \
                        metric_f.__name__ == 'dist_gower' or \
                        metric_f.__name__ == 'dist_hellinger' or\
                        metric_f.__name__ == 'binary_dist_chisq':
                    warnings.warn(
                        'dissimilarity ' + metric_f.__name__ +
                        ' is not parallelized, calculating the whole matrix...'
                    )
                    row_dissims.append(metric_f(otumtx)[rowidx])
                else:
                    # prefer a dedicated row metric when one exists;
                    # AttributeError means there is none for this metric
                    try:
                        row_metric = get_phylogenetic_row_metric(metric)
                    except AttributeError:
                        # do element by element: run the metric on a
                        # two-row matrix (rowidx, i) and keep entry [0, 1]
                        dissims = []
                        for i in range(len(otu_table.SampleIds)):
                            if is_phylogenetic:
                                dissim = metric_f(
                                    otumtx[[rowidx, i], :],
                                    otu_table.ObservationIds,
                                    tree, [
                                        otu_table.SampleIds[rowidx],
                                        otu_table.SampleIds[i]
                                    ],
                                    make_subtree=(not full_tree))[0, 1]
                            else:
                                dissim = metric_f(otumtx[[rowidx, i], :])[0, 1]
                            dissims.append(dissim)
                        row_dissims.append(dissims)
                    else:
                        # do whole row at once
                        dissims = row_metric(otumtx,
                                             otu_table.ObservationIds,
                                             tree,
                                             otu_table.SampleIds,
                                             rowid,
                                             make_subtree=(not full_tree))
                        row_dissims.append(dissims)

            # returns on the first metric (see NOTE in the docstring)
            return format_matrix(row_dissims, rowids_list, otu_table.SampleIds)
コード例 #28
0
ファイル: beta_diversity.py プロジェクト: wilkox/qiime
def single_file_beta(input_path,
                     metrics,
                     tree_path,
                     output_dir,
                     rowids=None,
                     full_tree=False):
    """Compute beta diversity for a single OTU table and write it to disk.

    One output file is written per metric, named
    <metric>_<input basename>.txt inside output_dir.

    Assumes the input tree is already trimmed to contain only OTUs present in
    the OTU table; getSubTree() is not called here.

    inputs:
     input_path (str): path of the table handed to parse_biom_table()
     metrics (str, comma delimited if more than 1 metric; or list)
     tree_path (str): path of the tree handed to parse_newick(); any false
      value means "no tree"
     output_dir (str): directory that receives the output files
     rowids (comma separated str): when given, only the rows d(rowid, *) for
      these sample ids are computed and the result is written with
      format_matrix(); when None the full matrix is written with
      format_distance_matrix()
     full_tree (bool): forwarded to phylogenetic metrics as
      make_subtree=(not full_tree)

    Writes a message to stderr and calls exit(1) when a metric is unknown or
    requires a tree that was not supplied. A rowid that is not among the
    table's sample ids makes SampleIds.index() raise before any distance is
    computed for that metric.
    """
    # These metrics are not a function of a single pair of samples, so they
    # are always handed the whole table.
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    # accept either a comma-delimited string or an already-split sequence
    metrics_list = metrics
    try:
        metrics_list = metrics_list.split(',')
    except AttributeError:
        pass

    # NOTE(review): closing the handles right after parsing assumes both
    # parsers read their input eagerly -- confirm.
    with open(input_path, 'U') as table_f:
        otu_table = parse_biom_table(table_f)

    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)
    else:
        tree = None

    sample_ids = otu_table.SampleIds
    observation_ids = otu_table.ObservationIds

    input_filename = os.path.split(input_path)[1]
    input_basename = os.path.splitext(input_filename)[0]
    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + input_basename + '.txt')
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n" %
                        (metric, ))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n" %
                    (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx,
                                   observation_ids,
                                   tree,
                                   sample_ids,
                                   make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)
            with open(outfilepath, 'w') as out_f:
                out_f.write(format_distance_matrix(sample_ids, dissims))
        else:
            # only calc d(rowid, *) for each requested rowid
            rowids_list = rowids.split(',')
            # resolve every id first so an unknown rowid fails before any
            # (possibly expensive) distance computation starts
            row_indices = [sample_ids.index(rowid) for rowid in rowids_list]

            if metric_f.__name__ in whole_matrix_metrics:
                warnings.warn(
                    'dissimilarity ' + metric_f.__name__ +
                    ' is not parallelized, calculating the whole matrix...'
                )
                # compute the matrix once and slice every requested row
                # out of it instead of recomputing it per rowid
                full_dissims = metric_f(otumtx)
                row_dissims = [full_dissims[rowidx]
                               for rowidx in row_indices]
            else:
                # the lookup does not depend on the row, so do it once
                try:
                    row_metric = get_phylogenetic_row_metric(metric)
                except AttributeError:
                    row_metric = None

                row_dissims = []  # same order as rowids_list
                for rowid, rowidx in zip(rowids_list, row_indices):
                    if row_metric is not None:
                        # do whole row at once
                        dissims = row_metric(otumtx,
                                             observation_ids,
                                             tree,
                                             sample_ids,
                                             rowid,
                                             make_subtree=(not full_tree))
                    else:
                        # do element by element: build a two-sample table
                        # and read the off-diagonal entry of its result
                        dissims = []
                        for i, other_id in enumerate(sample_ids):
                            pair_mtx = otumtx[[rowidx, i], :]
                            if is_phylogenetic:
                                dissim = metric_f(
                                    pair_mtx,
                                    observation_ids,
                                    tree,
                                    [sample_ids[rowidx], other_id],
                                    make_subtree=(not full_tree))[0, 1]
                            else:
                                dissim = metric_f(pair_mtx)[0, 1]
                            dissims.append(dissim)
                    row_dissims.append(dissims)

            with open(outfilepath, 'w') as out_f:
                out_f.write(
                    format_matrix(row_dissims, rowids_list, sample_ids))
コード例 #29
0
ファイル: beta_diversity.py プロジェクト: wilkox/qiime
 def formatResult(self, result):
     """Format a (data, sample_names) result as a distance matrix.

     result must unpack into exactly two items: the distance data followed
     by the sample names. Returns the output of format_distance_matrix().
     """
     distances, names = result
     formatted = format_distance_matrix(names, distances)
     return formatted
コード例 #30
0
ファイル: beta_diversity.py プロジェクト: EESI/FizzyQIIME
def single_object_beta(otu_table, metrics, tr, rowids=None,
                       full_tree=False):
    """Compute beta diversity from in-memory objects and return it as text.

    Modification of single_file_beta that receives an OTU table object and a
    tree object instead of file paths, and returns the formatted result
    instead of writing it to a file.

    Assumes the input tree is already trimmed to contain only OTUs present
    in otu_table; getSubTree() is not called here.

    inputs:
     otu_table -- an otu table in the biom format
     metrics -- metrics (str, comma delimited if more than 1 metric)
     tr -- a phylonode cogent tree object if needed by the chosen beta
      diversity metric; any false value is treated as "no tree"
     rowids -- comma separated string of sample ids; when given only the
      rows d(rowid, *) are computed
     full_tree -- forwarded to phylogenetic metrics as
      make_subtree=(not full_tree)

    returns:
     rowids is None -- the format_distance_matrix() output split on
      newlines (a list of lines)
     otherwise -- the unsplit output of format_matrix()

    NOTE: both branches return from inside the loop over metrics, so only
    the first metric named in `metrics` is ever computed. This is kept as is
    because callers may depend on the current return value.

    Writes a message to stderr and calls exit(1) when the metric is unknown
    or requires a tree that was not supplied. A rowid that is not among the
    table's sample ids makes SampleIds.index() raise before any distance is
    computed.
    """
    # These metrics are not a function of a single pair of samples, so they
    # are always handed the whole table.
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    tree = tr if tr else None

    sample_ids = otu_table.SampleIds
    observation_ids = otu_table.ObservationIds

    metrics_list = metrics.split(',')

    for metric in metrics_list:
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"
                                 % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n"
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, observation_ids, tree,
                                   sample_ids,
                                   make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)

            return format_distance_matrix(sample_ids, dissims).split('\n')

        # only calc d(rowid, *) for each requested rowid
        rowids_list = rowids.split(',')
        # resolve every id first so an unknown rowid fails before any
        # (possibly expensive) distance computation starts
        row_indices = [sample_ids.index(rowid) for rowid in rowids_list]

        if metric_f.__name__ in whole_matrix_metrics:
            warnings.warn('dissimilarity ' + metric_f.__name__ +
                          ' is not parallelized, calculating the whole matrix...')
            # compute the matrix once and slice every requested row out of
            # it instead of recomputing it per rowid
            full_dissims = metric_f(otumtx)
            row_dissims = [full_dissims[rowidx] for rowidx in row_indices]
        else:
            # the lookup does not depend on the row, so do it once
            try:
                row_metric = get_phylogenetic_row_metric(metric)
            except AttributeError:
                row_metric = None

            row_dissims = []  # same order as rowids_list
            for rowid, rowidx in zip(rowids_list, row_indices):
                if row_metric is not None:
                    # do whole row at once
                    dissims = row_metric(otumtx, observation_ids, tree,
                                         sample_ids, rowid,
                                         make_subtree=(not full_tree))
                else:
                    # do element by element: build a two-sample table and
                    # read the off-diagonal entry of its result
                    dissims = []
                    for i, other_id in enumerate(sample_ids):
                        pair_mtx = otumtx[[rowidx, i], :]
                        if is_phylogenetic:
                            dissim = metric_f(
                                pair_mtx, observation_ids, tree,
                                [sample_ids[rowidx], other_id],
                                make_subtree=(not full_tree))[0, 1]
                        else:
                            dissim = metric_f(pair_mtx)[0, 1]
                        dissims.append(dissim)
                row_dissims.append(dissims)

        return format_matrix(row_dissims, rowids_list, sample_ids)
コード例 #31
0
ファイル: beta_diversity.py プロジェクト: EESI/FizzyQIIME
def single_file_beta(input_path, metrics, tree_path, output_dir,
                     rowids=None, full_tree=False):
    """Compute beta diversity for a single OTU table and write it to disk.

    One output file is written per metric, named
    <metric>_<input basename>.txt inside output_dir.

    Assumes the input tree is already trimmed to contain only OTUs present in
    the OTU table; getSubTree() is not called here.

    inputs:
     input_path (str): path of the table handed to parse_biom_table()
     metrics (str, comma delimited if more than 1 metric; or list)
     tree_path (str): path of the tree handed to parse_newick(); any false
      value means "no tree"
     output_dir (str): directory that receives the output files
     rowids (comma separated str): when given, only the rows d(rowid, *) for
      these sample ids are computed and the result is written with
      format_matrix(); when None the full matrix is written with
      format_distance_matrix()
     full_tree (bool): forwarded to phylogenetic metrics as
      make_subtree=(not full_tree)

    Writes a message to stderr and calls exit(1) when a metric is unknown or
    requires a tree that was not supplied. A rowid that is not among the
    table's sample ids makes SampleIds.index() raise before any distance is
    computed for that metric.
    """
    # These metrics are not a function of a single pair of samples, so they
    # are always handed the whole table.
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    # accept either a comma-delimited string or an already-split sequence
    metrics_list = metrics
    try:
        metrics_list = metrics_list.split(',')
    except AttributeError:
        pass

    # NOTE(review): closing the handles right after parsing assumes both
    # parsers read their input eagerly -- confirm.
    with open(input_path, 'U') as table_f:
        otu_table = parse_biom_table(table_f)

    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)
    else:
        tree = None

    sample_ids = otu_table.SampleIds
    observation_ids = otu_table.ObservationIds

    input_filename = os.path.split(input_path)[1]
    input_basename = os.path.splitext(input_filename)[0]
    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + input_basename + '.txt')
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"
                                 % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n"
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, observation_ids, tree,
                                   sample_ids,
                                   make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)
            with open(outfilepath, 'w') as out_f:
                out_f.write(format_distance_matrix(sample_ids, dissims))
        else:
            # only calc d(rowid, *) for each requested rowid
            rowids_list = rowids.split(',')
            # resolve every id first so an unknown rowid fails before any
            # (possibly expensive) distance computation starts
            row_indices = [sample_ids.index(rowid) for rowid in rowids_list]

            if metric_f.__name__ in whole_matrix_metrics:
                warnings.warn('dissimilarity ' + metric_f.__name__ +
                              ' is not parallelized, calculating the whole matrix...')
                # compute the matrix once and slice every requested row
                # out of it instead of recomputing it per rowid
                full_dissims = metric_f(otumtx)
                row_dissims = [full_dissims[rowidx]
                               for rowidx in row_indices]
            else:
                # the lookup does not depend on the row, so do it once
                try:
                    row_metric = get_phylogenetic_row_metric(metric)
                except AttributeError:
                    row_metric = None

                row_dissims = []  # same order as rowids_list
                for rowid, rowidx in zip(rowids_list, row_indices):
                    if row_metric is not None:
                        # do whole row at once
                        dissims = row_metric(otumtx, observation_ids, tree,
                                             sample_ids, rowid,
                                             make_subtree=(not full_tree))
                    else:
                        # do element by element: build a two-sample table
                        # and read the off-diagonal entry of its result
                        dissims = []
                        for i, other_id in enumerate(sample_ids):
                            pair_mtx = otumtx[[rowidx, i], :]
                            if is_phylogenetic:
                                dissim = metric_f(
                                    pair_mtx, observation_ids, tree,
                                    [sample_ids[rowidx], other_id],
                                    make_subtree=(not full_tree))[0, 1]
                            else:
                                dissim = metric_f(pair_mtx)[0, 1]
                            dissims.append(dissim)
                    row_dissims.append(dissims)

            with open(outfilepath, 'w') as out_f:
                out_f.write(
                    format_matrix(row_dissims, rowids_list, sample_ids))
コード例 #32
0
ファイル: beta_diversity.py プロジェクト: EESI/FizzyQIIME
 def formatResult(self, result):
     """Format a (data, sample_names) result as a distance matrix.

     result must unpack into exactly two items: the distance data followed
     by the sample names. Returns the output of format_distance_matrix().
     """
     distances, names = result
     formatted = format_distance_matrix(names, distances)
     return formatted
コード例 #33
0
ファイル: test_format.py プロジェクト: colinbrislawn/qiime
 def test_format_distance_matrix_almost_zero_diagonal(self):
     """Only near-zero values on the diagonal are formatted as 0.0.

     The expected output keeps the 1e-05 diagonal entry and the 1e-13
     off-diagonal entries as they are, while the 1e-13 diagonal entry is
     written as 0.0.
     """
     labels = ["foo", "bar", "baz"]
     dm = array([[0.00001, 1, 0.0000000000001],
                 [1.0, 0.0000000000001, 3],
                 [0.0000000000001, 3.0, 0.0]])
     expected = ("\tfoo\tbar\tbaz\n"
                 "foo\t1e-05\t1.0\t1e-13\n"
                 "bar\t1.0\t0.0\t3.0\n"
                 "baz\t1e-13\t3.0\t0.0")
     observed = format_distance_matrix(labels, dm)
     self.assertEqual(observed, expected)
コード例 #34
0
ファイル: util.py プロジェクト: gregcaporaso/microbiogeo
def shuffle_dm(dm_f):
    """Return the distance matrix parsed from dm_f with its labels shuffled.

    Only the label list is passed to shuffle(); the distance data is handed
    to format_distance_matrix() exactly as parse_distmat() produced it.
    NOTE(review): shuffle is presumably random.shuffle (in-place) -- confirm
    against the file's imports.
    """
    sample_labels, distances = parse_distmat(dm_f)
    shuffle(sample_labels)
    shuffled_dm = format_distance_matrix(sample_labels, distances)
    return shuffled_dm
コード例 #35
0
def single_file_beta(input_path,
                     metrics,
                     tree_path,
                     output_dir,
                     rowids=None,
                     full_tree=False):
    """Compute beta diversity for a single OTU table and write it to disk.

    One output file is written per metric, named <metric>_<input filename>
    inside output_dir.

    When a tree is supplied and full_tree is false, the tree is first
    trimmed with getSubTree(otuids, ignore_missing=True); pass
    full_tree=True to use the tree exactly as parsed.

    inputs:
     input_path (str): path of the table handed to parse_otu_table()
     metrics (str, comma delimited if more than 1 metric)
     tree_path (str): path of the tree handed to parse_newick(); any false
      value means "no tree"
     output_dir (str): directory that receives the output files
     rowids (comma separated str): when given, only the rows d(rowid, *) for
      these sample ids are computed and the result is written with
      format_matrix(); when None the full matrix is written with
      format_distance_matrix()
     full_tree (bool): skip the getSubTree() trimming described above

    Writes a message to stderr and calls exit(1) when a metric is unknown or
    requires a tree that was not supplied. A rowid that is not among the
    table's sample ids makes samids.index() raise before any distance is
    computed for that metric.
    """
    # These metrics are not a function of a single pair of samples, so they
    # are always handed the whole table.
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    # lineages is parsed but not used by this function
    with open(input_path, 'U') as table_f:
        samids, otuids, otumtx, lineages = parse_otu_table(table_f)
    # otu mtx is otus by samples; the metrics are fed its transpose
    sample_mtx = otumtx.T

    tree = None
    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)
        if not full_tree:
            tree = tree.getSubTree(otuids, ignore_missing=True)

    input_filename = os.path.split(input_path)[1]
    metrics_list = metrics.split(',')
    for metric in metrics_list:
        outfilepath = os.path.join(output_dir, metric + '_' + input_filename)
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"
                                 % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n"
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(sample_mtx, otuids, tree, samids)
            else:
                dissims = metric_f(sample_mtx)

            with open(outfilepath, 'w') as out_f:
                out_f.write(format_distance_matrix(samids, dissims))
        else:
            # only calc d(rowid, *) for each requested rowid
            rowids_list = rowids.split(',')
            # resolve every id first so an unknown rowid fails before any
            # (possibly expensive) distance computation starts
            row_indices = [samids.index(rowid) for rowid in rowids_list]

            if metric_f.__name__ in whole_matrix_metrics:
                # compute the matrix once and slice every requested row
                # out of it instead of recomputing it per rowid
                full_dissims = metric_f(sample_mtx)
                row_dissims = [full_dissims[rowidx]
                               for rowidx in row_indices]
            else:
                # the lookup does not depend on the row, so do it once
                try:
                    row_metric = get_phylogenetic_row_metric(metric)
                except AttributeError:
                    row_metric = None

                row_dissims = []  # same order as rowids_list
                for rowid, rowidx in zip(rowids_list, row_indices):
                    if row_metric is not None:
                        # do whole row at once
                        dissims = row_metric(sample_mtx, otuids, tree,
                                             samids, rowid)
                    else:
                        # do element by element: build a two-sample table
                        # and read the off-diagonal entry of its result
                        dissims = []
                        for i, other_id in enumerate(samids):
                            pair_mtx = sample_mtx[[rowidx, i], :]
                            if is_phylogenetic:
                                dissim = metric_f(
                                    pair_mtx, otuids, tree,
                                    [samids[rowidx], other_id])[0, 1]
                            else:
                                dissim = metric_f(pair_mtx)[0, 1]
                            dissims.append(dissim)
                    row_dissims.append(dissims)

            with open(outfilepath, 'w') as out_f:
                out_f.write(format_matrix(row_dissims, rowids_list, samids))