Example #1
    def test_make_otu_table_with_sample_metadata(self):
        # Want to make sure that the order of the sample IDs in the OTU
        # map and the order of the IDs in the mapping file do not matter
        otu_map_lines = """0	ABC_0	DEF_1
1	ABC_1
x	GHI_2	GHI_3	GHI_77
z	DEF_3	XYZ_1""".split('\n')
        mapping_f = StringIO(MAPPING_FILE)
        sample_ids = ['ABC', 'DEF', 'GHI', 'XYZ']
        data = [[1, 1, 0, 0], [1, 0, 0, 0], [0, 0, 3, 0], [0, 1, 0, 1]]

        map_data, map_header, map_comments = parse_mapping_file(mapping_f)
        sample_metadata = mapping_file_to_dict(map_data, map_header)

        sample_md = [sample_metadata[sample_id] for sample_id in sample_ids]

        obs = make_otu_table(otu_map_lines, sample_metadata=sample_metadata)
        exp = Table(data, ['0', '1', 'x', 'z'], sample_ids,
                    sample_metadata=sample_md, input_is_dense=True)

        self.assertEqual(obs, exp)

        # Test with a mapping file that is missing a sample's metadata,
        # and make sure a KeyError is raised
        mapping_f = StringIO(MAPPING_FILE_MISSING_SAMPLE)
        map_data, map_header, map_comments = parse_mapping_file(mapping_f)
        sample_metadata = mapping_file_to_dict(map_data, map_header)

        with self.assertRaises(KeyError):
            obs = make_otu_table(otu_map_lines,
                                 sample_metadata=sample_metadata)
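
The test above exercises the round trip used throughout these examples: parse_mapping_file splits a QIIME mapping file into data rows, headers, and comments, and mapping_file_to_dict then re-keys each row's metadata by its SampleID. A minimal sketch of those two steps (assuming QIIME 1's qiime.parse module provides both helpers; MAPPING_FILE in the test is a module-level fixture not reproduced here):

from qiime.parse import parse_mapping_file, mapping_file_to_dict

# parse_mapping_file also accepts a list of lines (see Example #8)
lines = ['#SampleID\tTreatment', 'ABC\tControl', 'DEF\tFast']
data, headers, comments = parse_mapping_file(lines)
sample_metadata = mapping_file_to_dict(data, headers)
# sample_metadata -> {'ABC': {'Treatment': 'Control'},
#                     'DEF': {'Treatment': 'Fast'}}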
Example #2
    def test_make_otu_table_with_sample_metadata(self):
        # Want to make sure that the order of the sample IDs in the OTU
        # map and the order of the IDs in the mapping file do not matter
        otu_map_lines = """0	ABC_0	DEF_1
1	ABC_1
x	GHI_2	GHI_3	GHI_77
z	DEF_3	XYZ_1""".split('\n')
        mapping_f = StringIO(MAPPING_FILE)
        sample_ids = ['ABC', 'DEF', 'GHI', 'XYZ']
        data = [[1, 1, 0, 0], [1, 0, 0, 0], [0, 0, 3, 0], [0, 1, 0, 1]]

        map_data, map_header, map_comments = parse_mapping_file(mapping_f)
        sample_metadata = mapping_file_to_dict(map_data, map_header)

        sample_md = [sample_metadata[sample_id] for sample_id in sample_ids]

        obs = make_otu_table(otu_map_lines, sample_metadata=sample_metadata)
        exp = Table(data, ['0', '1', 'x', 'z'],
                    sample_ids,
                    sample_metadata=sample_md,
                    input_is_dense=True)

        self.assertEqual(obs, exp)

        # Test with a mapping file that is missing a sample's metadata,
        # and make sure a KeyError is raised
        mapping_f = StringIO(MAPPING_FILE_MISSING_SAMPLE)
        map_data, map_header, map_comments = parse_mapping_file(mapping_f)
        sample_metadata = mapping_file_to_dict(map_data, map_header)

        with self.assertRaises(KeyError):
            obs = make_otu_table(otu_map_lines,
                                 sample_metadata=sample_metadata)
Example #3
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    map_data, map_header, map_comments = parse_mapping_file(open(
        opts.map, 'U'))
    map_dict = mapping_file_to_dict(map_data, map_header)

    distdict = parse_distmat_to_dict(open(opts.distance_matrix, 'U'))

    if opts.colorby is None:
        colorby_cats = [None]
    else:
        colorby_idx = map_header.index(opts.colorby)
        colorby_cats = list(set([map_data[i][colorby_idx] for
                                 i in range(len(map_data))]))
    textfilename = os.path.splitext(opts.output_path)[0] + '.txt'
    text_fh = open(textfilename, 'w')
    text_fh.write(opts.axis_category + '\tdistance\tSampleID' + '\n')
    colorby_cats.sort()
    plt.figure()
    for cat_num, cat in enumerate(colorby_cats):
        # collect the primary and secondary samples within this category
        state1_samids, state2_samids = get_sam_ids(map_data, map_header,
                                                   opts.colorby, cat,
                                                   opts.primary_state,
                                                   opts.secondary_state)
        state1_samids =\
            list(set(state1_samids).intersection(set(distdict.keys())))
        state2_samids =\
            list(set(state2_samids).intersection(set(distdict.keys())))
        if state1_samids == [] or state2_samids == [] or \
                (len(state1_samids) == 1 and state1_samids == state2_samids):
            raise RuntimeError("one category of samples didn't have any valid"
                               " distances. try eliminating samples from -p"
                               " or -s, or changing your mapping file with"
                               " filter_samples_from_otu_table.py")
        # go through dmtx
        state1_avg_dists = get_avg_dists(state1_samids, state2_samids,
                                         distdict)

        # plot
        xvals = [float(map_dict[sam][opts.axis_category]) for
                 sam in state1_samids]
        try:
            color = plt.cm.jet(float(cat_num) / (len(colorby_cats) - 1))
        except ZeroDivisionError:  # only one cat
            color = 'b'
        plt.scatter(xvals,
                    state1_avg_dists,
                    edgecolors=color,
                    alpha=.5,
                    facecolors='none')
        plt.xlabel(opts.axis_category)
        plt.ylabel('average distance')

        lines = [str(xvals[i]) + '\t' + str(state1_avg_dists[i]) +
                 '\t' + state1_samids[i] + '\n' for i in range(len(xvals))]
        text_fh.writelines(lines)

    if opts.colorby is not None:
        plt.legend(colorby_cats)
    plt.savefig(opts.output_path)
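
One detail worth noting in the function above: plt.cm.jet maps a float in [0, 1] to an RGBA tuple, so the category index must be normalized as a float; under Python 2's integer division the unpatched expression cat_num / (len(colorby_cats) - 1) would be 0 for every category except the last, collapsing them onto one color. A small standalone check of the normalization:

import matplotlib.pyplot as plt

n_cats = 4
# four evenly spaced, distinct RGBA colors from the jet colormap
colors = [plt.cm.jet(float(i) / (n_cats - 1)) for i in range(n_cats)]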
Example #4
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)

        sample_metadata = mapping_file_to_dict(mapping_data, mapping_header)

    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude,
                                        sample_metadata=sample_metadata)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
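
The exclusion branch above dispatches on the file extension: os.path.splitext returns the extension with its leading dot, so '.fasta' and '.fna' inputs are parsed as FASTA, and anything else is assumed here to be a plain list of sequence IDs. A quick illustration of the dispatch key:

from os.path import splitext

splitext('otus_to_exclude.fna')[1]  # '.fna' -> get_seq_ids_from_fasta_file
splitext('otus_to_exclude.txt')[1]  # '.txt' -> get_seq_ids_from_seq_id_file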
Example #5
def format_mapping_file_to_js(mapping_file_data, mapping_file_headers, columns):
    """Write a javascript representation of the mapping file

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file
    columns: valid columns to use, usually a subset of mapping_file_headers

    Outputs:
    string: javascript representation of the mapping file
    """
    js_mapping_file_string = ''

    mapping_file_dict = mapping_file_to_dict(mapping_file_data,
                                             mapping_file_headers)

    map_values = []
    for k, v in mapping_file_dict.items():
        if 'SampleID' in columns:
            vals = ["'%s'" % k] + ["'%s'" % v[col]
                                   for col in mapping_file_headers[1:]]
        else:
            vals = ["'%s'" % v[col] for col in mapping_file_headers[1:]]
        map_values.append("'%s': [%s]" % (k, ','.join(vals)))

    if 'SampleID' not in columns:
        mapping_file_headers = mapping_file_headers[1:]

    # format the mapping file as javascript objects
    js_mapping_file_string += 'var g_mappingFileHeaders = [%s];\n' % ','.join(
        ["'%s'" % col for col in mapping_file_headers])
    js_mapping_file_string += 'var g_mappingFileData = { %s };\n' % ','.join(
        map_values)

    return js_mapping_file_string
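
Tracing the function with the toy mapping data from Example #8 shows the shape of the generated JavaScript (dict iteration order was arbitrary in Python 2, so the sample order may vary):

data = [['x', 'y', 'z'], ['i', 'j', 'k']]
headers = ['SampleID', 'a', 'b']
print(format_mapping_file_to_js(data, headers, headers))
# var g_mappingFileHeaders = ['SampleID','a','b'];
# var g_mappingFileData = { 'x': ['x','y','z'],'i': ['i','j','k'] };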
Example #6
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp

    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    ids_to_exclude = []
    if exclude_otus_fp:
        if splitext(exclude_otus_fp)[1] in ('.fasta', '.fna'):
            ids_to_exclude = \
                get_seq_ids_from_fasta_file(open(exclude_otus_fp, 'U'))
        else:
            ids_to_exclude = \
                get_seq_ids_from_seq_id_file(open(exclude_otus_fp, 'U'))

    sample_metadata = None
    if opts.mapping_fp is not None:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping_data, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)
        sample_metadata = mapping_file_to_dict(mapping_data,
                                               mapping_header)

    with open(opts.otu_map_fp, 'U') as otu_map_f:
        biom_otu_table = make_otu_table(otu_map_f,
                                        otu_to_taxonomy=otu_to_taxonomy,
                                        otu_ids_to_exclude=ids_to_exclude,
                                        sample_metadata=sample_metadata)

    write_biom_table(biom_otu_table, opts.output_biom_fp)
Example #7
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    map_data, map_header, map_comments = parse_mapping_file(
        open(opts.map, 'U'))
    map_dict = mapping_file_to_dict(map_data, map_header)

    distdict = parse_distmat_to_dict(open(opts.distance_matrix, 'U'))

    if opts.colorby is None:
        colorby_cats = [None]
    else:
        colorby_idx = map_header.index(opts.colorby)
        colorby_cats = list(set([map_data[i][colorby_idx] for
                                 i in range(len(map_data))]))
    textfilename = os.path.splitext(opts.output_path)[0] + '.txt'
    text_fh = open(textfilename, 'w')
    text_fh.write(opts.axis_category + '\tdistance\tSampleID' + '\n')
    colorby_cats.sort()
    plt.figure()
    for cat_num, cat in enumerate(colorby_cats):
        # collect the primary and secondary samples within this category
        state1_samids, state2_samids = get_sam_ids(map_data, map_header,
                                                   opts.colorby, cat,
                                                   opts.primary_state,
                                                   opts.secondary_state)
        state1_samids =\
            list(set(state1_samids).intersection(set(distdict.keys())))
        state2_samids =\
            list(set(state2_samids).intersection(set(distdict.keys())))
        if state1_samids == [] or state2_samids == [] or \
                (len(state1_samids) == 1 and state1_samids == state2_samids):
            raise RuntimeError("one category of samples didn't have any valid" +
                               " distances. try eliminating samples from -p or -s, or changing" +
                               " your mapping file with filter_samples_from_otu_table.py")
        # go through dmtx
        state1_avg_dists = get_avg_dists(
            state1_samids,
            state2_samids,
            distdict)

        # plot
        xvals = [float(map_dict[sam][opts.axis_category]) for
                 sam in state1_samids]
        try:
            color = plt.cm.jet(float(cat_num) / (len(colorby_cats) - 1))
        except ZeroDivisionError:  # only one cat
            color = 'b'
        plt.scatter(xvals, state1_avg_dists, edgecolors=color, alpha=.5,
                    facecolors='none')
        plt.xlabel(opts.axis_category)
        plt.ylabel('average distance')

        lines = [str(xvals[i]) + '\t' + str(state1_avg_dists[i]) +
                 '\t' + state1_samids[i] + '\n' for i in range(len(xvals))]
        text_fh.writelines(lines)

    if opts.colorby is not None:
        plt.legend(colorby_cats)
    plt.savefig(opts.output_path)
Example #8
    def test_mapping_file_to_dict(self):
        """mapping_file_to_dict functions as expected"""
        s1 = ['#sample\ta\tb', '#comment line to skip',
              'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
        # expected parse_mapping_file output (documented, not asserted here)
        exp = ([['x', 'y', 'z'], ['i', 'j', 'k']],
               ['sample', 'a', 'b'],
               ['comment line to skip', 'more skip'])
        mapres = parse_mapping_file(s1)  # map_data, header, comments
        mapdict = mapping_file_to_dict(*mapres[:2])
        expdict = {'x': {'a': 'y', 'b': 'z'}, 'i': {'a': 'j', 'b': 'k'}}
        self.assertEqual(mapdict, expdict)
Example #9
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample should be placed into
    bin_function = lambda id_, sample_metadata:\
        sample_metadata[mapping_category]
    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    mapping, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        new_mapping = []
        new_mapping.append(headers)
        for i in range(len(mapping)):
            new_mapping.append(mapping[i])
        # Create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    with biom_open(otu_table_fp, 'U') as biom_file:
        table = parse_biom_table(biom_file)
    table.add_metadata(sample_metadata)
    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapse(bin_function,
                            norm=False,
                            min_group_size=1,
                            axis='sample')

    # normalize the result if requested by the user
    if normalize:
        result.norm(axis='sample', inplace=True)

    # write a new BIOM file
    write_biom_table(result, output_fp)
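
The collapse call is what performs the binning: samples whose bin_function values match are merged into one composite sample named after the bin, and with norm=False the per-bin counts are summed. A minimal sketch with an in-memory table, reusing the Table constructor style from Example #1 (the counts and Treatment values here are made up):

from biom.table import Table

data = [[1, 2, 3], [4, 5, 6]]
table = Table(data, ['OTU1', 'OTU2'], ['S1', 'S2', 'S3'],
              sample_metadata=[{'Treatment': 'Control'},
                               {'Treatment': 'Control'},
                               {'Treatment': 'Fast'}],
              input_is_dense=True)
binned = table.collapse(lambda id_, md: md['Treatment'],
                        norm=False, min_group_size=1, axis='sample')
# binned has two samples: 'Control' (S1 + S2 summed) and 'Fast'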
Example #10
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample should be placed into
    bin_function = lambda id_, sample_metadata:\
        sample_metadata[mapping_category]
    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    mapping, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        new_mapping = []
        new_mapping.append(headers)
        for i in range(len(mapping)):
            new_mapping.append(mapping[i])
        # Create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    with biom_open(otu_table_fp, 'U') as biom_file:
        table = parse_biom_table(biom_file)
    table.add_metadata(sample_metadata)
    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapse(bin_function, norm=False, min_group_size=1,
                            axis='sample')

    # normalize the result if requested by the user
    if normalize:
        result.norm(axis='sample', inplace=True)

    # write a new BIOM file
    write_biom_table(result, output_fp)
Example #11
def generate_pcoa_cloud_from_point_in_omega(mapping_file_tuple, biom_object,
                                            metric, sequences, iterations, axes,
                                            tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    mapping_file_tuple: data and headers tuple for representing the mapping file
    biom_object: otu table biom object
    metric: string name of the beta diversity metric, e.g. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for
    tree_object: tree to perform the beta diversity calculation

    Output:
    WebGL string representing the PCoA plot
    """

    # get a list of the SampleIDs
    full_id_list = mapping_file_to_dict(mapping_file_tuple[0],
                                        mapping_file_tuple[1]).keys()

    pcoa_list = []
    for i in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_results = pcoa(beta_dm)

        pcoa_list.append(pcoa_results)

    # convert the list of pcoa results into ellipsoid coords
    ellipse_coords_by_sampleId, sampleId_to_coords = \
        get_pcoa_ellipsoid_coords(pcoa_list, axes, full_id_list)

    # check that the ellipses were created correctly; the helper returns an
    # error string on failure
    if isinstance(ellipse_coords_by_sampleId, str):
        raise ValueError('Could not create PCoA plot')

    webgl_string = make_pcoa_plot(ellipse_coords_by_sampleId,
                                  mapping_file_tuple,
                                  sampleId_to_coords['variation explained'])
    return webgl_string
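
The idea behind the iteration loop: each rarefaction repetition yields slightly different PCoA coordinates, so every sample accumulates a cloud of points that get_pcoa_ellipsoid_coords summarizes as an ellipsoid. A hypothetical illustration of that summary for a single sample (the helper's actual output format is not shown here):

import numpy as np

# one sample's first-two-axes coordinates across three iterations
cloud = np.array([[0.10, -0.20],
                  [0.12, -0.18],
                  [0.09, -0.22]])
center = cloud.mean(axis=0)  # ellipsoid center
spread = cloud.std(axis=0)   # per-axis radius estimate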
Example #12
def format_mapping_file_to_js(mapping_file_data, mapping_file_headers,
                              columns):
    """Write a javascript representation of the mapping file

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file
    columns: valid columns to use, usually a subset of mapping_file_headers

    Outputs:
    string: javascript representation of the mapping file
    """
    js_mapping_file_string = ''

    mapping_file_dict = mapping_file_to_dict(mapping_file_data,
                                             mapping_file_headers)

    map_values = []
    for k, v in mapping_file_dict.items():
        if 'SampleID' in columns:
            vals = ["'%s'" % k] + ["'%s'" % v[col]\
                for col in mapping_file_headers[1:]]
        else:
            vals = ["'%s'" % v[col] for col in mapping_file_headers[1:]]
        map_values.append("'%s': [%s]" % (k, ','.join(vals)))

    if 'SampleID' not in columns:
        mapping_file_headers = mapping_file_headers[1:]

    # format the mapping file as javascript objects
    js_mapping_file_string += 'var g_mappingFileHeaders = [%s];\n' % ','.join(
        ["'%s'" % col for col in mapping_file_headers])
    js_mapping_file_string += 'var g_mappingFileData = { %s };\n' % ','.join(
        map_values)

    return js_mapping_file_string
Example #13
            if '&&' in col:
                for _col in col.split('&&'):
                    if _col not in lookup_header:
                        offending_fields.append(col)
            elif col not in lookup_header:
                offending_fields.append(col)
    else:
        # if the user didn't specify the header names display everything
        color_by_column_names = header[:]

    # extract the custom axes provided and check each element is numeric
    if custom_axes:
        custom_axes = custom_axes.strip().strip("'").strip('"').split(',')

        # the MetadataMap object makes some checks easier
        map_object = MetadataMap(mapping_file_to_dict(mapping_data, header), [])
        for axis in custom_axes:
            # append the field to the error queue that it belongs to
            if axis not in lookup_header:
                offending_fields.append(axis)
                break
            # make sure this value is in the mapping file
            elif axis not in color_by_column_names:
                color_by_column_names.append(axis)
        # this else clause runs only if the for loop finished without break
        else:
            # make sure all these axes are numeric
            for axis in custom_axes:
                if not map_object.isNumericCategory(axis):
                    non_numeric_categories.append(axis)
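
The else clause attached to the for loop above is easy to misread: its suite runs only when the loop finishes without hitting break, which is exactly how the snippet defers the numeric check until every axis is known to exist. A tiny standalone demonstration:

for axis in ['pH', 'Depth']:
    if axis == 'MissingColumn':
        break
else:
    # reached because no break occurred
    print('all custom axes were found')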
Example #14
def preprocess_mapping_file(data,
                            headers,
                            columns,
                            unique=False,
                            single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the delimited
    columns.
    unique: keep columns where all values are unique
    single: keep columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """

    # The sample ID must always be there, else it's meaningless data
    if 'SampleID' != columns[0]:
        columns = ['SampleID'] + columns

    # process concatenated columns if needed
    merge = []
    for column in columns:
        if '&&' in column:
            merge.append(column)
    # each element needs several columns to be merged
    for new_column in merge:
        indices = [
            headers.index(header_name)
            for header_name in new_column.split('&&')
        ]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or singled valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [
                column_name for column_name in headers[1::]
                if metadata.hasUniqueCategoryValues(column_name)
            ]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [
                column_name for column_name in headers[1::]
                if metadata.hasSingleCategoryValue(column_name)
            ]
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data,
                                                       headers,
                                                       columns_to_remove,
                                                       negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones: replicate the metadata, re-tagging the sample IDs with a suffix
    if clones:
        out_data = []
        for index in range(0, clones):
            out_data.extend([[element[0] + '_%d' % index] + element[1::]
                             for element in data])
        data = out_data

    return data, headers
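
The '&&' merge loop above can be traced by hand; with the toy mapping data from Example #8, requesting the combined column 'a&&b' appends one merged header and one concatenated value per row:

headers = ['SampleID', 'a', 'b']
data = [['x', 'y', 'z'], ['i', 'j', 'k']]
indices = [headers.index(name) for name in 'a&&b'.split('&&')]
for line in data:
    line.append(''.join([line[index] for index in indices]))
headers.append('a&&b')
# headers -> ['SampleID', 'a', 'b', 'a&&b']
# data    -> [['x', 'y', 'z', 'yz'], ['i', 'j', 'k', 'jk']]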
Example #15
def preprocess_mapping_file(data, headers, columns, unique=False, single=False, clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the delimited
    columns.
    unique: keep columns where all values are unique
    single: keep columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """

    # The sample ID must always be there, else it's meaningless data
    if "SampleID" != columns[0]:
        columns = ["SampleID"] + columns

    # process concatenated columns if needed
    merge = []
    for column in columns:
        if "&&" in column:
            merge.append(column)
    # each element needs several columns to be merged
    for new_column in merge:
        indices = [headers.index(header_name) for header_name in new_column.split("&&")]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append("".join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or singled valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [
                column_name for column_name in headers[1::] if metadata.hasUniqueCategoryValues(column_name)
            ]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [
                column_name for column_name in headers[1::] if metadata.hasSingleCategoryValue(column_name)
            ]
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers, columns_to_remove, negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones: replicate the metadata, re-tagging the sample IDs with a suffix
    if clones:
        out_data = []
        for index in range(0, clones):
            out_data.extend([[element[0] + "_%d" % index] + element[1::] for element in data])
        data = out_data

    return data, headers