Example #1
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    output_f = open(opts.output_distance_matrix, 'w')
    if opts.otu_table_fp:
        otu_table = parse_biom_table(open(opts.otu_table_fp, 'U'))
        samples_to_keep = otu_table.SampleIds
        #samples_to_keep = \
        # sample_ids_from_otu_table(open(opts.otu_table_fp,'U'))
    elif opts.sample_id_fp:
        samples_to_keep = \
         get_seqs_to_keep_lookup_from_seq_id_file(open(opts.sample_id_fp,'U'))
    elif opts.mapping_fp and opts.valid_states:
        samples_to_keep = sample_ids_from_metadata_description(
            open(opts.mapping_fp, 'U'), opts.valid_states)
    else:
        option_parser.error(
            'must pass either --sample_id_fp, -t, or -m and -s')
    # note that negate gets a little weird here. The function we're calling
    # removes the specified samples from the distance matrix, but the other
    # QIIME filter scripts keep the specified samples. So, the interface of
    # this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    d = filter_samples_from_distance_matrix(
        parse_distmat(open(opts.input_distance_matrix, 'U')),
        samples_to_keep,
        negate=not opts.negate)
    output_f.write(d)
    output_f.close()
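# A minimal sketch of the keep/remove inversion described in the comment
# above (toy function and data, not the QIIME implementation):
def _toy_filter(sample_ids, ids_of_interest, negate=False):
    # negate=True flips removal so that ids_of_interest are KEPT, mirroring
    # the semantics of filter_samples_from_distance_matrix
    if negate:
        return [s for s in sample_ids if s in ids_of_interest]
    return [s for s in sample_ids if s not in ids_of_interest]

assert _toy_filter(['a', 'b', 'c'], {'a'}, negate=True) == ['a']
assert _toy_filter(['a', 'b', 'c'], {'a'}, negate=False) == ['b', 'c']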
Example #2
def split_otu_table_on_sample_metadata(otu_table, mapping_f, mapping_field):
    """ split otu table into sub otu tables where each represent samples
    corresponding to only a certain value in mapping_field
    """
    with errstate(empty='raise'):
        mapping_f = list(mapping_f)
        mapping_values = get_mapping_values(mapping_f, mapping_field)
        tables = 0

        for v in mapping_values:
            v_fp_str = v.replace(' ', '_')
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

            try:
                # filtering cannot be inplace otherwise we lose data
                filtered_otu_table = otu_table.filter(
                    lambda values, id_, metadata: id_ in sample_ids_to_keep,
                    axis='sample', inplace=False)
                tables += 1
            except TableException:
                # all samples are filtered out, so no otu table to write
                continue
            yield v_fp_str, filtered_otu_table

        if not tables:
            raise OTUTableSplitError(
                "Could not split OTU tables! There are no matches between the "
                "sample identifiers in the OTU table and the mapping file.")
Example #3
    def silly_function(ui):
        for c_value in ui.series(coloring_values):
            sample_ids = sample_ids_from_metadata_description(open(mapping_fp, 'U'),
                '%s:%s' % (coloring_header_name, c_value))

            _headers, _data = filter_mapping_file(data, headers, sample_ids, True)
            per_color_subject_values = list(set([row[subject_index] for row in _data]))

            fd = open(join(output_path, 'color_by_'+c_value+'.txt'), 'w')
            for s in ui.series(per_color_subject_values):
                fd.write('%s\n' % s)
            fd.close()

            if not suppress_trajectory_files:
                for s in ui.series(per_color_subject_values):
                    filename = join(output_path, s+'.txt')

                    if opts.verbose:
                        print 'Working on printing', filename

                    COMMAND_CALL = FILTER_CMD % (coords_fp, mapping_fp,
                        '%s:%s' % (subject_header_name, s), filename,
                        sorting_category)
                    o, e, r = qiime_system_call(COMMAND_CALL)
                    if opts.verbose and e:
                        print 'Error happened on filtering step: \n%s' % e
                        continue

                    COMMAND_CALL = CONVERSION_CMD % (filename, filename)
                    o, e, r = qiime_system_call(COMMAND_CALL)
                    if opts.verbose and e:
                        print 'Error happened on conversion step: \n%s' % e
                        continue # useless here but just in case
Example #4
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """
    
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f,mapping_field)
    
    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
    
    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case "+\
                "and white-space sensitive). \n\tProvided field: "+\
                "%s. \n\tValid fields: %s" % (mapping_field,' '.join(mapping_headers)))
    
    for v in mapping_values:
        v_fp_str = v.replace(' ','_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f,valid_states_str="%s:%s" % (mapping_field,v))
        
        # parse the mapping file each time through the loop, as filtering
        # operates on the parsed values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
                                         mapping_data, 
                                         mapping_headers,
                                         sample_ids_to_keep,
                                         include_repeat_cols=include_repeat_cols, 
                                         column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
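# A minimal usage sketch for the generator above (filepaths hypothetical);
# each yielded item is a filepath-safe suffix plus a formatted mapping string:
def write_split_mapping_files(mapping_fp, mapping_field):
    for suffix, mapping_str in split_mapping_file_on_field(
            open(mapping_fp, 'U'), mapping_field):
        out_f = open('mapping_%s.txt' % suffix, 'w')
        out_f.write(mapping_str)
        out_f.close()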
Example #5
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.input_fp
    out_mapping_fp = opts.output_fp
    valid_states = opts.valid_states

    if opts.sample_id_fp:
        valid_sample_ids = \
         get_seqs_to_keep_lookup_from_seq_id_file(open(opts.sample_id_fp,'U'))
    elif mapping_fp and valid_states:
        valid_sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        option_parser.error('must pass either --sample_id_fp or -m and -s')

    data, headers, _ = parse_mapping_file(open(mapping_fp, 'U'))

    good_mapping_file = []
    for line in data:
        if line[0] in valid_sample_ids:
            good_mapping_file.append(line)

    lines = format_mapping_file(headers, good_mapping_file)

    fd = open(out_mapping_fp, 'w')
    fd.write(lines)
    fd.close()
Example #6
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    coords_fp = opts.input_coords
    mapping_fp = opts.mapping_fp
    output_fp = opts.output_fp
    valid_states = opts.valid_states
    negate = opts.negate
    mapping_header_name = opts.mapping_header_name

    coords_ids, coords, eigen_values, pct_exp = parse_coords(open(coords_fp, "U"))

    data, headers, _ = parse_mapping_file(open(mapping_fp, "U"))

    if mapping_fp and valid_states:
        valid_sample_ids = sample_ids_from_metadata_description(open(mapping_fp, "U"), valid_states)

    valid_coords_ids, valid_coords = filter_sample_ids_from_coords(coords_ids, coords, valid_sample_ids, negate)

    if mapping_header_name:
        sorted_sample_ids = sort_sample_ids(data, headers, mapping_header_name)
        sorted_coord_ids, sorted_coords = sort_coords(valid_coords_ids, valid_coords, sorted_sample_ids)
        valid_coords_ids, valid_coords = sorted_coord_ids, sorted_coords

    lines = format_coords(valid_coords_ids, valid_coords, eigen_values, pct_exp)
    fd = open(output_fp, "w")
    fd.writelines(lines)
    fd.close()
Example #7
def make_profiles_by_category(mapping_fp, taxa_level, category):
    """ Creates a list of profiles for each unique value in the category
    Inputs:
        mapping_fp: filepath to the mapping file
        category: mapping file category to split data over
                  defaults to HOST_SUBJECT_ID
    Returns a dictionary keyed by the values on that category and a list of 
        profiles as values
    """
    # Parse the mapping file
    map_f = open(mapping_fp, 'U')
    mapping_data, comments = parse_mapping_file_to_dict(map_f)
    map_f.close()
    # Get a list of unique keys for the specified category
    if category == 'SampleID':
        result = {}
        for sid in mapping_data:
            result[sid] = [make_profile_by_sid(mapping_data, sid, taxa_level)]
    else:
        values = set([mapping_data[sid][category] for sid in mapping_data])
        result = {}
        # Loop over each value in that category
        for value in values:
            # Re-open the mapping file
            map_f = open(mapping_fp, 'U')
            # Get sample ids that match the value
            sids = sample_ids_from_metadata_description(map_f,
                                                        category+":"+value)
            map_f.close()
            # Create the list with all the profiles of the sample IDs in this
            # category value
            result[value] = [make_profile_by_sid(mapping_data, sid, taxa_level)
                             for sid in sids]
    return result
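# Illustrative only: given a mapping file with a 'Treatment' column, the
# returned dictionary maps each category value to the profiles of its
# samples, e.g. {'Control': [profile_a, profile_b], 'Fast': [profile_c]}.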
Example #8
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """

    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case " +
                           "and white-space sensitive). \n\tProvided field: " +
                           "%s. \n\tValid fields: %s" % (mapping_field, ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        # parse the mapping file each time through the loop, as filtering
        # operates on the parsed values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
Example #9
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    output_f = open(opts.output_distance_matrix, 'w')
    if opts.otu_table_fp:
        otu_table = load_table(opts.otu_table_fp)
        samples_to_keep = otu_table.ids()
        # samples_to_keep = \
        # sample_ids_from_otu_table(open(opts.otu_table_fp,'U'))
    elif opts.sample_id_fp:
        samples_to_keep = \
            get_seqs_to_keep_lookup_from_seq_id_file(
                open(opts.sample_id_fp, 'U'))
    elif opts.mapping_fp and opts.valid_states:
        try:
            samples_to_keep = sample_ids_from_metadata_description(
                open(opts.mapping_fp, 'U'), opts.valid_states)
        except ValueError as e:
            option_parser.error(e.message)
    else:
        option_parser.error('must pass either --sample_id_fp, -t, or -m and '
                            '-s')
    # note that negate gets a little weird here. The function we're calling
    # removes the specified samples from the distance matrix, but the other
    # QIIME filter scripts keep these samples specified.  So, the interface of
    # this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    d = filter_samples_from_distance_matrix(
        parse_distmat(
            open(opts.input_distance_matrix, 'U')),
        samples_to_keep,
        negate=not opts.negate)
    output_f.write(d)
    output_f.close()
Example #10
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        sample_id_f_ids = set([l.strip().split()[0]
                              for l in open(sample_id_fp, 'U') if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.ids())
        open(output_mapping_fp, 'w').write(
            format_mapping_file(mapping_headers, mapping_data))
Example #11
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states " "is passed.")

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination "
            "of those)."
        )
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and " "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate" " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(open(mapping_fp, "U"), valid_states)
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            o = open(sample_id_fp, "U")
            sample_id_f_ids = set([l.strip().split()[0] for l in o if not l.startswith("#")])
            o.close()
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table, sample_ids_to_keep, min_count, max_count, negate_ids_to_keep=negate_sample_id_fp
    )

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. This indicates "
            "that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(open(mapping_fp, "U"))
        mapping_headers, mapping_data = filter_mapping_file(mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, "w").write(format_mapping_file(mapping_headers, mapping_data))
Example #12
def get_seqs_to_keep_lookup_from_mapping_file(fasta_f, mapping_f, valid_states):
    sample_ids = {}.fromkeys(sample_ids_from_metadata_description(mapping_f, valid_states))
    seqs_to_keep = []
    for seq_id, seq in parse_fasta(fasta_f):
        if seq_id.split("_")[0] in sample_ids:
            seqs_to_keep.append(seq_id)
        else:
            continue
    return {}.fromkeys(seqs_to_keep)
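# A minimal sketch of applying the lookup to subset a FASTA file, reusing
# parse_fasta as above (the function name, filepaths, and metadata
# description are hypothetical):
def filter_fasta_by_metadata(fasta_fp, mapping_fp, valid_states, out_fp):
    seqs_to_keep = get_seqs_to_keep_lookup_from_mapping_file(
        open(fasta_fp, 'U'), open(mapping_fp, 'U'), valid_states)
    out_f = open(out_fp, 'w')
    for seq_id, seq in parse_fasta(open(fasta_fp, 'U')):
        if seq_id in seqs_to_keep:
            out_f.write('>%s\n%s\n' % (seq_id, seq))
    out_f.close()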
Example #13
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))
    output_f = open(opts.output_fp, 'w')

    if (mapping_fp and valid_states):
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds

    if (sample_id_fp is not None):
        sample_id_f_ids = set([
            l.strip().split()[0] for l in open(sample_id_fp, 'U')
            if not l.startswith('#')
        ])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count, max_count)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.SampleIds)
        open(output_mapping_fp,
             'w').write(format_mapping_file(mapping_headers, mapping_data))
Example #14
def get_seqs_to_keep_lookup_from_mapping_file(fasta_f,mapping_f,valid_states):
    sample_ids = {}.fromkeys(\
     sample_ids_from_metadata_description(mapping_f,valid_states))
    seqs_to_keep = []
    for seq_id, seq in MinimalFastaParser(fasta_f):
        if seq_id.split('_')[0] in sample_ids:
            seqs_to_keep.append(seq_id)
        else:
            continue
    return {}.fromkeys(seqs_to_keep)
Example #15
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    output_f = open(opts.output_distance_matrix,'w')
    if opts.otu_table_fp:
        otu_table = parse_biom_table(open(opts.otu_table_fp,'U'))
        samples_to_keep = otu_table.SampleIds
        #samples_to_keep = \
        # sample_ids_from_otu_table(open(opts.otu_table_fp,'U'))
    elif opts.sample_id_fp:
        samples_to_keep = \
         get_seqs_to_keep_lookup_from_seq_id_file(open(opts.sample_id_fp,'U'))
    elif opts.mapping_fp and opts.valid_states:
        try:
            samples_to_keep = sample_ids_from_metadata_description(
                open(opts.mapping_fp,'U'),opts.valid_states)
        except ValueError as e:
            option_parser.error(e.message)
Example #16
def split_otu_table_on_sample_metadata(otu_table_f,mapping_f,mapping_field):
    """ split otu table into sub otu tables where each represent samples corresponding to only a certain value in mapping_field 
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f,mapping_field)
    otu_table = parse_biom_table(otu_table_f)
    
    for v in mapping_values:
        v_fp_str = v.replace(' ','_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f,valid_states_str="%s:%s" % (mapping_field,v))
        
        try:
            filtered_otu_table = otu_table.filterSamples(
                              lambda values,id_,metadata: id_ in sample_ids_to_keep)
        except TableException:
            # all samples are filtered out, so no otu table to write
            continue
        yield v_fp_str, format_biom_table(filtered_otu_table)
Example #17
def split_otu_table_on_sample_metadata(otu_table_f, mapping_f, mapping_field):
    """ split otu table into sub otu tables where each represent samples corresponding to only a certain value in mapping_field
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)
    otu_table = parse_biom_table(otu_table_f)

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        try:
            filtered_otu_table = otu_table.filterSamples(
                lambda values, id_, metadata: id_ in sample_ids_to_keep)
        except TableException:
            # all samples are filtered out, so no otu table to write
            continue
        yield v_fp_str, format_biom_table(filtered_otu_table)
Example #18
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    output_f = open(opts.output_distance_matrix, 'w')
    if opts.otu_table_fp:
        otu_table = parse_biom_table(open(opts.otu_table_fp, 'U'))
        samples_to_keep = otu_table.SampleIds
        #samples_to_keep = \
        # sample_ids_from_otu_table(open(opts.otu_table_fp,'U'))
    elif opts.sample_id_fp:
        samples_to_keep = \
         get_seqs_to_keep_lookup_from_seq_id_file(open(opts.sample_id_fp,'U'))
    elif opts.mapping_fp and opts.valid_states:
        try:
            samples_to_keep = sample_ids_from_metadata_description(
                open(opts.mapping_fp, 'U'), opts.valid_states)
        except ValueError as e:
            option_parser.error(e.message)
Example #19
def split_otu_table_on_sample_metadata(otu_table, mapping_f, mapping_field):
    """ split otu table into sub otu tables where each represent samples corresponding to only a certain value in mapping_field
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        try:
            # filtering cannot be inplace otherwise we lose data
            filtered_otu_table = otu_table.filter(
                lambda values, id_, metadata: id_ in sample_ids_to_keep,
                axis='sample', inplace=False)
        except TableException:
            # all samples are filtered out, so no otu table to write
            continue
        yield v_fp_str, filtered_otu_table
Example #20
def format_vectors_to_js(mapping_file_data,
                         mapping_file_headers,
                         coords_data,
                         coords_headers,
                         connected_by_header,
                         sorted_by_header=None):
    """Write a string representing the vectors in a PCoA plot as javascript

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file
    coords_data: coordinates of the PCoA plot in a numpy 2-D array or a list of
    numpy 2-D arrays for jackknifed input
    coords_headers: headers of the coords in the PCoA plot or a list of lists
    with the headers for jackknifed input
    connected_by_header: header of the mapping file that represents how the
    lines will be connected
    sorted_by_header: numeric-only header name to sort the samples in the
    vectors

    Output:
    js_vectors_string: string that represents the vectors in the shape of a
    javascript object

    Notes:
    If using jackknifed input, the coordinates and headers used are the ones
    belonging to the master coords, i.e. the first element.
    """

    js_vectors_string = []
    js_vectors_string.append('\nvar g_vectorPositions = new Array();\n')

    if connected_by_header is not None:
        # check if we are processing jackknifed input, if so just get the master
        if isinstance(coords_data, list):
            coords_data = coords_data[0]
            coords_headers = coords_headers[0]

        columns_to_keep = ['SampleID', connected_by_header]

        # do not add None if sorted_by_header is None or empty
        if sorted_by_header:
            columns_to_keep.append(sorted_by_header)

        # reduce the amount of data by keeping the required fields only
        mapping_file_data, mapping_file_headers =\
            keep_columns_from_mapping_file(mapping_file_data,
            mapping_file_headers, columns_to_keep)

        # format the mapping file for use with the filtering function
        mf_string = format_mapping_file(mapping_file_headers,
                                        mapping_file_data)

        index = mapping_file_headers.index(connected_by_header)
        connected_by = list(set([line[index] for line in mapping_file_data]))

        for category in connected_by:
            # convert to StringIO for each iteration; otherwise the object
            # won't be usable after the first iteration and you'll get an error
            sample_ids = sample_ids_from_metadata_description(
                StringIO(mf_string), '%s:%s' % (connected_by_header, category))

            # if there is a sorting header, sort the coords using these values
            if sorted_by_header:
                sorting_index = mapping_file_headers.index(sorted_by_header)
                to_sort = [line for line in mapping_file_data if line[0] in\
                    sample_ids]

                # get the sorted sample ids from the sorted-reduced mapping file
                sample_ids = zip(
                    *sorted(to_sort, key=lambda x: float(x[sorting_index])))[0]

            # each category value is a new vector
            js_vectors_string.append(
                "g_vectorPositions['%s'] = new Array();\n" % (category))

            for s in sample_ids:
                index = coords_headers.index(s)

                # print the first three elements of each coord for each sample
                js_vectors_string.append(
                    "g_vectorPositions['%s']['%s'] = %s;\n" %
                    (category, s, coords_data[index, :3].tolist()))

    return ''.join(js_vectors_string)
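# A hedged usage sketch with toy data; assumes the module-level imports used
# by format_vectors_to_js (StringIO and the QIIME mapping-file helpers) are
# in place, and all values here are illustrative:
def example_vectors_to_js():
    import numpy as np
    headers = ['SampleID', 'Subject', 'Day']
    data = [['s1', 'subj1', '1'], ['s2', 'subj1', '2']]
    coords = np.array([[0.1, 0.2, 0.3, 0.4],
                       [0.5, 0.6, 0.7, 0.8]])
    return format_vectors_to_js(data, headers, coords, ['s1', 's2'],
                                'Subject', sorted_by_header='Day')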
Example #21
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    mothur_counts_fp = opts.mothur_counts_fp
    mapping_fp = opts.mapping_fp
    valid_states = opts.valid_states
    blank_id_fp = opts.blank_id_fp
    contaminant_db_fp = opts.contaminant_db_fp
    contaminant_similarity = opts.contaminant_similarity
    max_correlation = opts.max_correlation
    correlate_header = opts.correlate_header
    input_fasta_fp = opts.input_fasta_fp
    otu_map_fp = opts.otu_map_fp
    output_dir = opts.output_dir
    min_relabund_threshold = opts.min_relabund_threshold
    prescreen_threshold = opts.prescreen_threshold
    removal_stat_blank = opts.removal_stat_blank
    removal_stat_sample = opts.removal_stat_sample
    removal_differential = opts.removal_differential
    reinstatement_stat_sample = opts.reinstatement_stat_sample
    reinstatement_stat_blank = opts.reinstatement_stat_blank
    reinstatement_differential = opts.reinstatement_differential
    reinstatement_sample_number = opts.reinstatement_sample_number
    reinstatement_method = opts.reinstatement_method
    write_output_seq_lists = opts.write_output_seq_lists
    write_filtered_output = opts.write_filtered_output
    drop_lib_threshold = opts.drop_lib_threshold
    write_per_seq_stats = opts.write_per_seq_stats
    write_per_library_stats = opts.write_per_library_stats
    write_per_seq_disposition = opts.write_per_seq_disposition

    # Make unique seq OTU table (biom file)

    # Compute unique seq stats
    #   output biom file with unique seq stats

    # Optionally: make candidate contaminant DB
    #   remove sequences present at higher abundance in samples
    #   cluster blanks
    #   remove low-abundance contaminant OTUs

    # Filter by similarity against candidate contaminant DB
    #   annotate unique seq OTU table with top hit (OTU#, rep seq, ID%)
    #   make list of seqs @ threshold

    # Calculate reinstatement rule for filtered sequences

    # Generate lists of seqs failing:
    #   - unique seq rule
    #   - hit to contaminant
    #   - reinstatement after hit

    # Make sure passed at least one of an OTU biom or mothur counts table file
    input_file_counter = 0

    if mothur_counts_fp:
        input_file_counter += 1
        unique_seq_biom = mothur_counts_to_biom(mothur_counts_fp)
        mothur_output = True
        print "mothur input"

    if otu_table_fp:
        input_file_counter += 1
        unique_seq_biom = load_table(otu_table_fp)
        mothur_output = False
        print "BIOM input"

    if input_file_counter != 1:
        option_parser.error("must provide ONLY ONE of an OTU table biom file or"
                            "mothur counts table")

    # Check to make sure that if blank-based contamination filtering requested,
    # all necessary options are specified:

    removal_options_counter = 0
    if removal_stat_blank:
        removal_options_counter += 1
    if removal_stat_sample:
        removal_options_counter += 1
    if removal_differential:
        removal_options_counter += 1

    if 0 < removal_options_counter < 3:
        option_parser.error("Must provide all of "
                            "removal_stat_blank, "
                            "removal_stat_sample, and "
                            "removal_differential, or none.")
    elif removal_options_counter == 0:
        blank_stats_removal = False
    elif removal_options_counter == 3:
        blank_stats_removal = True


    # If reference-based filtering requested, make sure all necessary options
    # have been specified:

    if contaminant_db_fp and not input_fasta_fp:
        option_parser.error("If specifying ref-based contaminant ID, must "
                            "also specify path to input sequence fasta")


    # If correlation-based filtering requested, make sure correlate data 
    # are specified

    if max_correlation and not correlate_header:
        option_parser.error("If specifying maximum Spearman correlation, must "
                           "also provide map column header for correlate data")


    # If sequence reinstatement is requested, make sure all necessary options
    # are specified

    reinstatement_options_counter = 0
    if reinstatement_stat_blank:
        reinstatement_options_counter += 1
    if reinstatement_stat_sample:
        reinstatement_options_counter += 1
    if reinstatement_differential:
        reinstatement_options_counter += 1

    if 0 < reinstatement_options_counter < 3:
        option_parser.error("Must provide all of "
                            "reinstatement_stat_blank, "
                            "reinstatement_stat_sample, and "
                            "reinstatement_differential, or none.")

    if ((reinstatement_options_counter == 3 and reinstatement_sample_number)
        and not reinstatement_method):
        option_parser.error("If providing sample number AND abundance criteria "
                            "for sequence reinstatement, must also provide "
                            "a method for combining results.")

    if reinstatement_options_counter == 3 or reinstatement_sample_number:
        reinstatement = True
    else:
        reinstatement = False

    # get blank sample IDs from mapping file or sample ID list

    if mapping_fp and valid_states:
        blank_sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        blanks = True
    elif blank_id_fp is not None:
        blank_id_f = open(blank_id_fp, 'Ur')
        blank_sample_ids = set([line.strip().split()[0]
                                for line in blank_id_f
                                if not line.startswith('#')])
        blank_id_f.close()
        blanks = True
    else:
        blanks = False


    # Initialize output objects

    output_dict = {}
    contaminant_types = []

    contamination_stats_dict = None
    contamination_stats_header = None
    corr_data_dict = None

    # Do blank-based stats calculations; if no blanks were provided, check
    # that no blank-dependent methods are requested:

    if blanks:
        if prescreen_threshold:
            low_contam_libraries = prescreen_libraries(unique_seq_biom,
                                                       blank_sample_ids,
                                                       removal_stat_sample, 
                                                       removal_stat_blank, 
                                                       removal_differential, 
                                                       prescreen_threshold)

            contamination_stats_header, contamination_stats_dict = \
                get_contamination_stats(unique_seq_biom,
                                        blank_sample_ids,
                                        exp_sample_ids=low_contam_libraries)
        else:
            contamination_stats_header, contamination_stats_dict = \
                get_contamination_stats(unique_seq_biom, blank_sample_ids)

    elif (blank_stats_removal or reinstatement or prescreen_threshold):
        option_parser.error("Blank-based filtering requested but no blank"
                            "samples indicated in mapping file or ID file.")
    else:
        contamination_stats_header, contamination_stats_dict = \
            get_contamination_stats(unique_seq_biom)


    seq_ids = unique_seq_biom.ids(axis='observation')


    # Do blank-based contaminant identification

    if min_relabund_threshold:
        output_dict['below_relabund_threshold'] = pick_min_relabund_threshold(
                                                  contamination_stats_dict,
                                                  contamination_stats_header,
                                                  min_relabund_threshold)


    if blank_stats_removal:
        output_dict['abund_contaminants'] = compare_blank_abundances(contamination_stats_dict, 
                                contamination_stats_header,
                                removal_stat_sample,
                                removal_stat_blank,
                                removal_differential,
                                negate=True)

        contaminant_types.append('abund_contaminants')


    # Do reference-based contaminant identification

    if contaminant_db_fp:
        output_dict['ref_contaminants'] = pick_ref_contaminants(seq_ids, contaminant_db_fp, input_fasta_fp, contaminant_similarity, output_dir)

        contaminant_types.append('ref_contaminants')


    # Do spearman correlation based contaminant identification

    if max_correlation:
        metadata_dict = parse_mapping_file_to_dict(open(mapping_fp, 'U'))[0]

        corr_data_dict = {x: float(metadata_dict[x][correlate_header]) for x in metadata_dict}

        output_dict['corr_contaminants'], corr_contaminant_dict = pick_corr_contaminants(unique_seq_biom,
                                                   corr_data_dict,
                                                   max_correlation)

        contaminant_types.append('corr_contaminants')
    else:
        corr_contaminant_dict = None


    # Putative contaminants are those that have been identified by any method

    output_dict['putative_contaminants'] = set.union(*map(set, [output_dict[x] for x in contaminant_types]))


    # If considering low abundance sequences, remove those from consideration as potential contaminants 

    if 'below_relabund_threshold' in output_dict:
        output_dict['putative_contaminants'] = output_dict['putative_contaminants'] - set(output_dict['below_relabund_threshold'])


    # Pick abundance-criterion seqs to reinstate

    if (reinstatement_stat_blank and reinstatement_stat_sample and reinstatement_differential):
        output_dict['abund_reinstated_seqs'] = reinstate_abund_seqs(output_dict['putative_contaminants'], 
                     contamination_stats_dict, 
                     contamination_stats_header,
                     reinstatement_stat_sample,
                     reinstatement_stat_blank,
                     reinstatement_differential)

        output_dict['reinstated_seqs'] = output_dict['abund_reinstated_seqs']


    # Pick incidence-criterion seqs to reinstate
    if reinstatement_sample_number:
        output_dict['incidence_reinstated_seqs'] = reinstate_incidence_seqs(
                     output_dict['putative_contaminants'],
                     unique_seq_biom,
                     blank_sample_ids,
                     reinstatement_sample_number)

        output_dict['reinstated_seqs'] = output_dict['incidence_reinstated_seqs']


    # combine incidence and abundance reinstatements
    if reinstatement_sample_number and reinstatement_stat_blank:
        if reinstatement_method == "union":
            output_dict['reinstated_seqs'] = output_dict['abund_reinstated_seqs'] | output_dict['incidence_reinstated_seqs']
        elif reinstatement_method == "intersection":
            output_dict['reinstated_seqs'] = output_dict['abund_reinstated_seqs'] & output_dict['incidence_reinstated_seqs']


    # make sets for sequences _never_ identified as contaminants:

    output_dict['ever_good_seqs'] = set(seq_ids) - output_dict['putative_contaminants']

    # If considering low abundance sequences, remove those from consideration as potential contaminants 

    if 'below_relabund_threshold' in output_dict:
        output_dict['ever_good_seqs'] = output_dict['ever_good_seqs'] - set(output_dict['below_relabund_threshold'])

    # Make set of good seqs for final filtering

    final_good_seqs = output_dict['ever_good_seqs']

    # ...and those either never ID'd as contaminants or reinstated:
    if reinstatement:
        output_dict['all_good_seqs'] = set(output_dict['ever_good_seqs'] | output_dict['reinstated_seqs'])
        final_good_seqs = output_dict['all_good_seqs']
        # ...and those who remain contaminants after reinstatement:
        output_dict['never_good_seqs'] = set(output_dict['putative_contaminants'] - output_dict['reinstated_seqs'])


    # print filtered OTU maps if given a QIIME OTU map input

    if otu_map_fp:
        print_filtered_output('otu_map', otu_map_fp, output_dir, output_dict)


    # print filtered Mothur counts tables if given a Mothur counts table input

    if mothur_output:
        print_filtered_output('mothur_counts', mothur_counts_fp, output_dir, output_dict)


    # print filtered seq header files if requested

    if write_output_seq_lists:
        print_filtered_output('seq_headers', seq_ids, output_dir, output_dict)


    # filter final biom file to just good seqs

    filtered_biom = unique_seq_biom.filter(lambda val, id_, metadata: id_ in final_good_seqs,
                     axis='observation', invert=False, inplace=False)

    # drop heavily contaminated libraries if requested

    if drop_lib_threshold:
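        # compute which libraries to drop: normalize counts per library, keep
        # only the final good seqs, then invert-filter to collect libraries
        # whose remaining fraction of reads falls below drop_lib_threshold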
        dropped_libs = unique_seq_biom.norm(inplace=False).filter(lambda val, id_, metadata: id_ in final_good_seqs,
                 axis='observation', invert=False, inplace=False).filter(lambda val, id_, metadata: sum(val) >= drop_lib_threshold,
                 axis='sample', invert=True, inplace=False).ids(axis='sample')
        filtered_biom.filter(lambda val, id_, metadata: id_ in dropped_libs,
                 axis='sample', invert=True, inplace=True)
    else:
        dropped_libs = []


    # print filtered biom/mothur_output if library filtering is requested

    if write_filtered_output:

        if mothur_output:
            output_counts_string = biom_to_mothur_counts(filtered_biom)
            with open(os.path.join(output_dir,'decontaminated_table.counts'), "w") as output_counts_file:
                output_counts_file.write(output_counts_string)
        else:
            output_biom_string = filtered_biom.to_json(
                'Filtered by decontaminate.py')
            with open(os.path.join(output_dir,'decontaminated_otu_table.biom'), "w") as output_biom_file:
                output_biom_file.write(output_biom_string)



    # print per-library stats if requested

    if write_per_library_stats:
        per_library_stats, per_library_stats_header = calc_per_library_decontam_stats(unique_seq_biom, output_dict)
        library_stats_string = print_per_library_stats(per_library_stats, per_library_stats_header, unique_seq_biom.ids(axis='sample'), dropped_libs=dropped_libs)
        
        with open(os.path.join(output_dir,'decontamination_per_library_stats.txt'), "w") as output_stats_file:
            output_stats_file.write(library_stats_string)


    # print otu by disposition file if requested

    if write_per_seq_disposition:
        per_seq_disposition = print_otu_disposition(seq_ids, output_dict)

        with open(os.path.join(output_dir,'decontamination_per_otu_disposition.txt'), "w") as output_stats_file:
            output_stats_file.write(per_seq_disposition)


    # print log file / per-seq info
    if write_per_seq_stats:
        print_results_file(seq_ids,
                       output_dict,
                       os.path.join(output_dir,'contamination_summary.txt'),
                       contamination_stats_header,
                       contamination_stats_dict,
                       corr_contaminant_dict)
Example #22
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if (mapping_fp is None and valid_states is not None):
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            o = open(sample_id_fp, 'U')
            sample_id_f_ids = set(
                [l.strip().split()[0] for l in o if not l.startswith('#')])
            o.close()
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table,
        sample_ids_to_keep,
        min_count,
        max_count,
        negate_ids_to_keep=negate_sample_id_fp)

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = \
            filter_mapping_file(
                mapping_data,
                mapping_headers,
                filtered_otu_table.ids())
        open(output_mapping_fp,
             'w').write(format_mapping_file(mapping_headers, mapping_data))
Example #23
def format_vectors_to_js(mapping_file_data, mapping_file_headers, coords_data,
                        coords_headers, connected_by_header,
                        sorted_by_header=None):
    """Write a string representing the vectors in a PCoA plot as javascript

    Inputs:
    mapping_file_data: contents of the mapping file
    mapping_file_headers: headers of the mapping file
    coords_data: coordinates of the PCoA plot in a numpy 2-D array or a list of
    numpy 2-D arrays for jackknifed input
    coords_headers: headers of the coords in the PCoA plot or a list of lists
    with the headers for jackknifed input
    connected_by_header: header of the mapping file that represents how the
    lines will be connected
    sorted_by_header: numeric-only header name to sort the samples in the
    vectors

    Output:
    js_vectors_string: string that represents the vectors in the shape of a
    javascript object

    Notes:
    If using jackknifed input, the coordinates and headers used are the ones
    belonging to the master coords, i.e. the first element.
    """

    js_vectors_string = []
    js_vectors_string.append('\nvar g_vectorPositions = new Array();\n')

    if connected_by_header is not None:
        # check if we are processing jackknifed input, if so just get the master
        if isinstance(coords_data, list):
            coords_data = coords_data[0]
            coords_headers = coords_headers[0]

        columns_to_keep = ['SampleID', connected_by_header]

        # do not add None if sorted_by_header is None or empty
        if sorted_by_header:
            columns_to_keep.append(sorted_by_header)

        # reduce the amount of data by keeping the required fields only
        mapping_file_data, mapping_file_headers =\
            keep_columns_from_mapping_file(mapping_file_data,
            mapping_file_headers, columns_to_keep)

        # format the mapping file for use with the filtering function
        mf_string = format_mapping_file(mapping_file_headers, mapping_file_data)

        index = mapping_file_headers.index(connected_by_header)
        connected_by = list(set([line[index] for line in mapping_file_data]))

        for category in connected_by:
            # convert to StringIO for each iteration; otherwise the object
            # won't be usable after the first iteration and you'll get an error
            sample_ids = sample_ids_from_metadata_description(
                StringIO(mf_string),'%s:%s' % (connected_by_header,category))

            # if there is a sorting header, sort the coords using these values
            if sorted_by_header:
                sorting_index = mapping_file_headers.index(sorted_by_header)
                to_sort = [line for line in mapping_file_data if line[0] in\
                    sample_ids]

                # get the sorted sample ids from the sorted-reduced mapping file
                sample_ids = zip(*sorted(to_sort,
                    key=lambda x: float(x[sorting_index])))[0]

            # each category value is a new vector
            js_vectors_string.append("g_vectorPositions['%s'] = new Array();\n"
                % (category))

            for s in sample_ids:
                index = coords_headers.index(s)

                # print the first three elements of each coord for each sample
                js_vectors_string.append("g_vectorPositions['%s']['%s'] = %s;\n"
                    % (category, s, coords_data[index, :3].tolist()))

    return ''.join(js_vectors_string)
Example #24
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    state_values = opts.state_values.split(',')
    metadata_categories = opts.metadata_categories
    state_category = opts.state_category
    individual_id_category = opts.individual_id_category
    output_dir = opts.output_dir
    biom_table_fp = opts.biom_table_fp
    observation_ids = opts.observation_ids
    if observation_ids is not None:
        observation_ids = observation_ids.split(',')
    valid_states = opts.valid_states
    ymin = opts.ymin
    ymax = opts.ymax
    line_color = opts.line_color

    # validate the input - currently only supports either biom data
    # or mapping file data. if useful in the future it shouldn't be too
    # hard to allow the user to provide both.
    if metadata_categories and biom_table_fp:
        option_parser.error(
            "Can only pass --metadata_categories or --biom_table_fp, not both."
        )
    elif not (metadata_categories or biom_table_fp):
        option_parser.error(
            "Must pass either --metadata_categories or --biom_table_fp.")

    # parse the mapping file to a dict
    mapping_data = parse_mapping_file_to_dict(open(mapping_fp, 'U'))[0]

    # currently only support for pre/post (i.e., two-state) tests
    if len(state_values) != 2:
        option_parser.error(
            "Exactly two state_values must be passed separated by a comma.")

    # filter mapping_data, if requested
    if valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        for sid in mapping_data.keys():
            if sid not in sample_ids_to_keep:
                del mapping_data[sid]

    if biom_table_fp:
        biom_table = parse_biom_table(open(biom_table_fp, 'U'))
        analysis_categories = observation_ids or biom_table.ObservationIds
        personal_ids_to_state_values = \
         extract_per_individual_state_metadata_from_sample_metadata_and_biom(
                                     mapping_data,
                                     biom_table,
                                     state_category,
                                     state_values,
                                     individual_id_category,
                                     observation_ids=analysis_categories)
    else:
        analysis_categories = metadata_categories.split(',')
        personal_ids_to_state_values = \
         extract_per_individual_state_metadata_from_sample_metadata(
                                     mapping_data,
                                     state_category,
                                     state_values,
                                     individual_id_category,
                                     analysis_categories)

    paired_difference_analyses(personal_ids_to_state_values,
                               analysis_categories,
                               state_values,
                               output_dir,
                               line_color=line_color,
                               ymin=ymin,
                               ymax=ymax)
Example #25
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    category = opts.category
    mapping_fp = opts.mapping_fp

    colors_used = []

    if (category and mapping_fp is None) or (category is None and mapping_fp):
        option_parser.error('If coloring by a metadata category, both the '
                            'category and the mapping file must be supplied.')
    elif mapping_fp and category:
        mapping_data, mapping_headers, _ = parse_mapping_file(open(mapping_fp,
                                                                   'U'))
        if category not in mapping_headers:
            option_parser.error("The category supplied must exist in the "
                                "metadata mapping file, '%s' does not exist." % category)
        index = mapping_headers.index(category)
        categories = list(set([line[index] for line in mapping_data]))
    list_of_plots = []

    if opts.binning is None:
        ranges = []
    else:
        # simple ranges format validation
        if opts.binning.count('[') != opts.binning.count(']') or\
                opts.binning.count('[') != opts.binning.count(','):
            raise ValueError("The binning input has an error: '%s'; " % +
                             "\nthe format should be [increment1,top_limit1][increment2,top_limit2]")
        # splitting into ranges
        rgn_txt = opts.binning.split('][')
        # removing left [ and right ]
        rgn_txt[0] = rgn_txt[0][1:]
        rgn_txt[-1] = rgn_txt[-1][:-1]
        # converting to floats
        ranges = []
        max_value = 0

        for i, r in enumerate(rgn_txt):
            try:
                values = map(float, r.split(','))
            except ValueError:
                raise ValueError(
                    "Not a valid format for binning %s" %
                    opts.binning)
            if len(values) != 2:
                raise ValueError(
                    "All ranges must have only 2 values: [%s]" %
                    r)
            elif i + 1 != len(rgn_txt):
                if values[0] > values[1]:
                    raise ValueError(
                        "The bin value can't be greater than the max value: [%s]" %
                        r)
                elif values[0] < 0 or values[1] < 0:
                    raise ValueError(
                        "This value can not be negative: [%s]" %
                        r)
                elif max_value > values[1]:
                    raise ValueError(
                        "This value can not be smaller than the previous one: [%s]" %
                        r)
                else:
                    max_value = values[1]

            ranges.append(values)
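        # For illustration (hypothetical input): opts.binning = '[2,10][5,30]'
        # parses to ranges == [[2.0, 10.0], [5.0, 30.0]]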

    x_samples, x_distmtx = parse_distmat(open(opts.input_path_x, 'U'))
    y_samples, y_distmtx = parse_distmat(open(opts.input_path_y, 'U'))

    if opts.ignore_missing_samples:
        ignoring_from_x = list(set(x_samples) - set(y_samples))
        ignoring_from_y = list(set(y_samples) - set(x_samples))

        if opts.verbose:
            print '\nFrom %s we are ignoring: %s\n' % (opts.input_path_x, ignoring_from_x)
            print '\nFrom %s we are ignoring: %s\n' % (opts.input_path_y, ignoring_from_y)
            print '\nOnly using: %s\n' % (list(set(x_samples) & set(y_samples)))

        x_file = StringIO(
            filter_samples_from_distance_matrix((x_samples, x_distmtx), ignoring_from_x))
        x_samples, x_distmtx = parse_distmat(x_file)

        y_file = StringIO(
            filter_samples_from_distance_matrix((y_samples, y_distmtx), ignoring_from_y))
        y_samples, y_distmtx = parse_distmat(y_file)
    else:
        if x_distmtx.shape != y_distmtx.shape:
            raise ValueError('The distance matrices have different sizes. '
                             'You can avoid this error by passing '
                             '--ignore_missing_samples.')

    figure()
    if category is None:
        x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
            (x_samples, x_distmtx), (y_samples, y_distmtx), opts.model, ranges)

        plot(
            x_val,
            y_val,
            color=opts.dot_color,
            marker=opts.dot_marker,
            linestyle="None",
            alpha=opts.dot_alpha)
        plot(
            x_fit,
            y_fit,
            linewidth=2.0,
            color=opts.line_color,
            alpha=opts.line_alpha)
    else:
        # not all the categories that are going to be enumerated are found in
        # the distance matrices, i.e. the mapping file can be a superset that
        # contains more samples than the distance matrices
        used_categories = deepcopy(categories)

        for index, single_category in enumerate(categories):
            good_sample_ids = sample_ids_from_metadata_description(
                open(mapping_fp), '%s:%s' % (category, single_category))

            try:
                _y_samples, _y_distmtx = parse_distmat(StringIO(
                    filter_samples_from_distance_matrix((y_samples, y_distmtx),
                                                        good_sample_ids, negate=True)))
                _x_samples, _x_distmtx = parse_distmat(StringIO(
                    filter_samples_from_distance_matrix((x_samples, x_distmtx),
                                                        good_sample_ids, negate=True)))
            except ValueError:
                # no samples found for this category
                used_categories.remove(single_category)
                continue

            x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
                (_x_samples, _x_distmtx), (_y_samples, _y_distmtx),
                opts.model, ranges)

            # retrieve one of the "QIIME" colors and add it to the
            # list of used colors for the creation of the legend in the plot
            color_only = get_qiime_hex_string_color(index)
            colors_used.append(color_only)

            plot(x_val, y_val, color=color_only, marker=opts.dot_marker,
                 linestyle="None", alpha=opts.dot_alpha)
            plot(x_fit, y_fit, linewidth=2.0, color=color_only,
                 alpha=opts.line_alpha, label=single_category)

    # set plot limits if requested
    x_lb, x_ub = xlim()
    y_lb, y_ub = ylim()
    if opts.x_min is not None:
        x_lb = opts.x_min
    if opts.x_max is not None:
        x_ub = opts.x_max
    if opts.y_min is not None:
        y_lb = opts.y_min
    if opts.y_max is not None:
        y_ub = opts.y_max
    xlim(x_lb, x_ub)
    ylim(y_lb, y_ub)

    x_label = opts.x_label
    y_label = opts.y_label
    fig_title = '%s (%s)' % (opts.fig_title, opts.model)

    xlabel(x_label)
    ylabel(y_label)
    if opts.print_model:
        title(fig_title + ' ' + func_text)
    else:
        title(fig_title)

    savefig(opts.output_path)

    # print the legends after the figure is exported to avoid conflicts
    if category:
        # if there's a desired format, use that, else default it to png
        _, extension = splitext(opts.output_path)

        # remove the dot, else, make_legend will add it to the filename
        extension = extension.replace('.', '')

        if extension == '':
            extension = 'png'
        make_legend(used_categories, colors_used, 0, 0, 'black', 'white',
                    opts.output_path, extension, 80)
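A side note on the binning parser above: it is self-contained enough to lift
out and test on its own. A minimal sketch, assuming the same
"[increment1,top_limit1][increment2,top_limit2]" format (the name
parse_binning_string is hypothetical):

def parse_binning_string(binning):
    # same sanity check as above: bracket and comma counts must agree
    if binning.count('[') != binning.count(']') or \
            binning.count('[') != binning.count(','):
        raise ValueError("The binning input has an error: '%s'" % binning)
    # split on '][' and strip the outermost brackets
    chunks = binning.split('][')
    chunks[0] = chunks[0][1:]
    chunks[-1] = chunks[-1][:-1]
    # each chunk becomes an [increment, top_limit] pair of floats
    return [[float(v) for v in chunk.split(',')] for chunk in chunks]

# parse_binning_string('[10,100][50,1000]') -> [[10.0, 100.0], [50.0, 1000.0]]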
Example #26
0
def get_seqs_to_keep_lookup_from_mapping_file(mapping_f, valid_states):
    sample_ids = set(
        sample_ids_from_metadata_description(mapping_f, valid_states))
    return sample_ids
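A hypothetical usage sketch for the helper above; 'map.txt' and the
'Treatment:Control' pattern are illustrative, following the 'Category:value'
convention these scripts pass as valid_states:

mapping_f = open('map.txt', 'U')
control_ids = get_seqs_to_keep_lookup_from_mapping_file(mapping_f,
                                                        'Treatment:Control')
mapping_f.close()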
Example #27
0
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir

    if opts.num_fraction_for_core_steps < 2:
        option_parser.error(
            "Must perform at least two steps. Increase --num_fraction_for_core_steps.")
    fractions_for_core = np.linspace(opts.min_fraction_for_core,
                                     opts.max_fraction_for_core,
                                     opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp

    create_dir(output_dir)

    if valid_states and mapping_fp:
        sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'),
            valid_states)
        if len(sample_ids) < 1:
            option_parser.error(
                "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %
                valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the
        # samples to work with
        sample_ids = None

    input_table = parse_biom_table(open(input_fp, 'U'))

    otu_counts = []
    summary_figure_fp = join(output_dir, 'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)

        # prep output files
        output_fp = join(
            output_dir,
            'core_otus_%s.txt' %
            fraction_for_core_str)
        output_table_fp = join(
            output_dir,
            'core_table_%s.biom' %
            fraction_for_core_str)
        output_f = open(output_fp, 'w')

        try:
            core_table = filter_table_to_core(input_table,
                                              sample_ids,
                                              fraction_for_core)
        except TableException:
            output_f.write(
                "# No OTUs present in %s %% of samples." %
                fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue

        # write some header information to file
        if sample_ids is None:
            output_f.write(
                "# Core OTUs across %s %% of samples.\n" %
                fraction_for_core_str)
        else:
            output_f.write(
                "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %
                (fraction_for_core_str, valid_states, ' '.join(sample_ids)))

        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iter(axis='observation'):
            output_f.write('%s\t%s\n' % (id_, md[otu_md]))
            otu_count += 1
        output_f.close()

        # write the core biom table
        write_biom_table(core_table, output_table_fp)

        # append the otu count to the list of counts
        otu_counts.append(otu_count)

    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core), max(fractions_for_core))
    ylim(0, max(otu_counts) + 1)
    xlabel(
        "Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
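For reference, a toy sketch of the sweep this loop performs; is_core below is
a hypothetical stand-in for the criterion filter_table_to_core applies
(judging from the x-axis label), and all numbers are illustrative:

import numpy as np

def is_core(observation_counts, fraction):
    # an OTU counts as "core" when it is observed (count > 0) in at least
    # `fraction` of the samples
    observed = sum(1 for c in observation_counts if c > 0)
    return observed >= fraction * len(observation_counts)

for fraction in np.linspace(0.5, 1.0, 6):
    print "%1.0f%%: %s" % (fraction * 100., is_core([3, 0, 1, 2], fraction))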
Example #28
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    category = opts.category
    mapping_fp = opts.mapping_fp

    colors_used = []

    if (category and mapping_fp is None) or (category is None and mapping_fp):
        option_parser.error('If coloring by a metadata category, both the '
                            'category and the mapping file must be supplied.')
    elif mapping_fp and category:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        if category not in mapping_headers:
            option_parser.error("The category supplied must exist in the "
                                "metadata mapping file, '%s' does not exist." %
                                category)
        index = mapping_headers.index(category)
        categories = list(set([line[index] for line in mapping_data]))
    list_of_plots = []

    if opts.binning is None:
        ranges = []
    else:
        # simple ranges format validation
        if opts.binning.count('[') != opts.binning.count(']') or\
                opts.binning.count('[') != opts.binning.count(','):
            raise ValueError("The binning input has an error: '%s'; "
                             "\nthe format should be [increment1,top_limit1]"
                             "[increment2,top_limit2]" % opts.binning)
        # splitting into ranges
        rgn_txt = opts.binning.split('][')
        # removing left [ and right ]
        rgn_txt[0] = rgn_txt[0][1:]
        rgn_txt[-1] = rgn_txt[-1][:-1]
        # converting into float
        ranges = []
        prev_max = 0

        for i, r in enumerate(rgn_txt):
            try:
                values = map(float, r.split(','))
            except ValueError:
                raise ValueError("Not a valid format for binning %s" %
                                 opts.binning)
            if len(values) != 2:
                raise ValueError("All ranges must have only 2 values: [%s]" % r)
            elif i + 1 != len(rgn_txt):
                if values[0] > values[1]:
                    raise ValueError("The bin value can't be greater than the "
                                     "max value: [%s]" % r)
                elif values[0] < 0 or values[1] < 0:
                    raise ValueError("These values cannot be negative: [%s]" % r)
                elif prev_max > values[1]:
                    raise ValueError("This value cannot be smaller than the "
                                     "previous one: [%s]" % r)
                else:
                    prev_max = values[1]

            ranges.append(values)

    x_samples, x_distmtx = parse_distmat(open(opts.input_path_x, 'U'))
    y_samples, y_distmtx = parse_distmat(open(opts.input_path_y, 'U'))

    if opts.ignore_missing_samples:
        ignoring_from_x = list(set(x_samples) - set(y_samples))
        ignoring_from_y = list(set(y_samples) - set(x_samples))

        if opts.verbose:
            print '\nFrom %s we are ignoring: %s\n' % (opts.input_path_x,
                                                       ignoring_from_x)
            print '\nFrom %s we are ignoring: %s\n' % (opts.input_path_y,
                                                       ignoring_from_y)
            print '\nOnly using: %s\n' % (
                list(set(x_samples) & set(y_samples)))

        x_file = StringIO(
            filter_samples_from_distance_matrix((x_samples, x_distmtx),
                                                ignoring_from_x))
        x_samples, x_distmtx = parse_distmat(x_file)

        y_file = StringIO(
            filter_samples_from_distance_matrix((y_samples, y_distmtx),
                                                ignoring_from_y))
        y_samples, y_distmtx = parse_distmat(y_file)
    else:
        if x_distmtx.shape != y_distmtx.shape:
            raise ValueError('The distance matrices have different sizes. '
                             'You can avoid this error by passing '
                             '--ignore_missing_samples.')

    figure()
    if category is None:
        x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
            (x_samples, x_distmtx), (y_samples, y_distmtx), opts.model, ranges)

        plot(x_val,
             y_val,
             color=opts.dot_color,
             marker=opts.dot_marker,
             linestyle="None",
             alpha=opts.dot_alpha)
        plot(x_fit,
             y_fit,
             linewidth=2.0,
             color=opts.line_color,
             alpha=opts.line_alpha)
    else:
        for index, single_category in enumerate(categories):
            good_sample_ids = sample_ids_from_metadata_description(
                open(mapping_fp), '%s:%s' % (category, single_category))

            _y_samples, _y_distmtx = parse_distmat(
                StringIO(
                    filter_samples_from_distance_matrix((y_samples, y_distmtx),
                                                        good_sample_ids,
                                                        negate=True)))
            _x_samples, _x_distmtx = parse_distmat(
                StringIO(
                    filter_samples_from_distance_matrix((x_samples, x_distmtx),
                                                        good_sample_ids,
                                                        negate=True)))

            x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
                (_x_samples, _x_distmtx), (_y_samples, _y_distmtx), opts.model,
                ranges)

            # retrieve one of the "QIIME" colors and add it to the
            # list of used colors for the creation of the legend in the plot
            color_only = get_qiime_hex_string_color(index)
            colors_used.append(color_only)

            plot(x_val,
                 y_val,
                 color=color_only,
                 marker=opts.dot_marker,
                 linestyle="None",
                 alpha=opts.dot_alpha)
            plot(x_fit,
                 y_fit,
                 linewidth=2.0,
                 color=color_only,
                 alpha=opts.line_alpha,
                 label=single_category)

    if opts.x_min is not None and opts.x_max is not None:
        xlim([opts.x_min, opts.x_max])
    if opts.y_min is not None and opts.y_max is not None:
        ylim([opts.y_min, opts.y_max])

    x_label = opts.x_label
    y_label = opts.y_label
    fig_title = '%s (%s)' % (opts.fig_title, opts.model)

    xlabel(x_label)
    ylabel(y_label)
    if opts.print_model:
        title(fig_title + ' ' + func_text)
    else:
        title(fig_title)

    savefig(opts.output_path)

    # print the legends after the figure is exported to avoid conflicts
    if category:
        # if there's a desired format, use that, else default it to png
        _, extension = splitext(opts.output_path)

        # remove the dot, else, make_legend will add it to the filename
        extension = extension.replace('.', '')

        if extension == '':
            extension = 'png'
        make_legend(categories, colors_used, 0, 0, 'black', 'white',
                    opts.output_path, extension, 80)
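In the per-category loop above, filter_samples_from_distance_matrix is called
with negate=True so that only the samples in good_sample_ids survive the
filter. A toy stand-in for that id selection (keep_only is a hypothetical
name):

def keep_only(ids, ids_to_keep):
    # keep the ids listed in ids_to_keep, preserving their original order
    return [i for i in ids if i in ids_to_keep]

# keep_only(['A', 'B', 'C'], set(['B', 'C'])) -> ['B', 'C']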
Example #29
0
def get_seqs_to_keep_lookup_from_mapping_file(mapping_f, valid_states):
    sample_ids = set(sample_ids_from_metadata_description(mapping_f,
                                                          valid_states))
    return sample_ids
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    state_values = opts.state_values.split(',')
    metadata_categories = opts.metadata_categories
    state_category = opts.state_category
    individual_id_category = opts.individual_id_category
    output_dir = opts.output_dir
    biom_table_fp = opts.biom_table_fp
    observation_ids = opts.observation_ids
    if observation_ids is not None:
        observation_ids = observation_ids.split(',')
    valid_states = opts.valid_states
    ymin = opts.ymin
    ymax = opts.ymax
    line_color = opts.line_color

    # validate the input - currently only supports either biom data
    # or mapping file data. if useful in the future it shouldn't be too
    # hard to allow the user to provide both.
    if metadata_categories and biom_table_fp:
        option_parser.error(
            "Can only pass --metadata_categories or --biom_table_fp, not both.")
    elif not (metadata_categories or biom_table_fp):
        option_parser.error(
            "Must pass either --metadata_categories or --biom_table_fp.")

    # parse the mapping file to a dict
    mapping_data = parse_mapping_file_to_dict(open(mapping_fp, 'U'))[0]

    # currently only support for pre/post (ie, two-state) tests
    if len(state_values) != 2:
        option_parser.error(
            "Exactly two state_values must be passed separated by a comma.")

    # filter mapping_data, if requested
    if valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        for sid in mapping_data.keys():
            if sid not in sample_ids_to_keep:
                del mapping_data[sid]

    if biom_table_fp:
        biom_table = parse_biom_table(open(biom_table_fp, 'U'))
        analysis_categories = observation_ids or biom_table.ObservationIds
        personal_ids_to_state_values = \
            extract_per_individual_state_metadata_from_sample_metadata_and_biom(
                mapping_data,
                biom_table,
                state_category,
                state_values,
                individual_id_category,
                observation_ids=analysis_categories)
    else:
        analysis_categories = metadata_categories.split(',')
        personal_ids_to_state_values = \
            extract_per_individual_state_metadata_from_sample_metadata(
                mapping_data,
                state_category,
                state_values,
                individual_id_category,
                analysis_categories)

    paired_difference_analyses(personal_ids_to_state_values,
                               analysis_categories,
                               state_values,
                               output_dir,
                               line_color=line_color,
                               ymin=ymin,
                               ymax=ymax)
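The mutual-exclusion check at the top of this main() is a small reusable
pattern; a minimal sketch under that assumption (exactly_one is a
hypothetical name):

def exactly_one(*options):
    # true when exactly one of the options is truthy
    return sum(1 for o in options if o) == 1

# exactly_one(metadata_categories, biom_table_fp) would collapse the
# both-or-neither checks into a single condition.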