Example #1
0
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """

    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case " +
                           "and white-space sensitive). \n\tProvided field: " +
                           "%s. \n\tValid fields: %s" % (mapping_field, ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        # parse the mapping file on each pass through the loop, as filtering
        # operates on the parsed values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
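A minimal usage sketch for the generator above, assuming it and the QIIME 1 helpers it calls are in scope; the file names and the 'Treatment' field are illustrative, not taken from the snippet.

# Hypothetical driver: write one filtered mapping file per field value.
for field_value, mapping_file_str in split_mapping_file_on_field(
        open('mapping.txt', 'U'), 'Treatment'):
    out_f = open('mapping_%s.txt' % field_value, 'w')  # e.g. mapping_Control.txt
    out_f.write(mapping_file_str)
    out_f.close()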
Example #2
0
def silly_function(ui):
    for c_value in ui.series(coloring_values):
        sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), '%s:%s' % (coloring_header_name, c_value))

        _headers, _data = filter_mapping_file(data, headers, sample_ids, True)
        per_color_subject_values = list(set([row[subject_index]
                                             for row in _data]))

        fd = open(join(output_path, 'color_by_' + c_value + '.txt'), 'w')
        for s in ui.series(per_color_subject_values):
            fd.write('%s\n' % s)
        fd.close()

        if not suppress_trajectory_files:
            for s in ui.series(per_color_subject_values):
                filename = join(output_path, s + '.txt')

                if opts.verbose:
                    print 'Working on printing', filename

                COMMAND_CALL = FILTER_CMD % (coords_fp, mapping_fp,
                                             '%s:%s' % (subject_header_name, s),
                                             filename, sorting_category)
                o, e, r = qiime_system_call(COMMAND_CALL)
                if opts.verbose and e:
                    print 'Error happened on filtering step: \n%s' % e
                    continue

                COMMAND_CALL = CONVERSION_CMD % (filename, filename)
                o, e, r = qiime_system_call(COMMAND_CALL)
                if opts.verbose and e:
                    print 'Error happened on conversion step: \n%s' % e
                    continue  # a no-op at the end of the loop, kept just in case
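A note on the pattern above: qiime_system_call returns a (stdout, stderr, return_value) tuple, and the snippet only reports stderr when verbose is set. A stricter variant, sketched here under the same assumption, would branch on the exit status instead:

# Hedged sketch: treat a non-zero exit status as the failure signal.
o, e, r = qiime_system_call(COMMAND_CALL)
if r != 0:
    print 'Command failed (exit status %d):\n%s' % (r, e)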
Example #3
0
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field """
    
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)

    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            raise KeyError("Field is not in mapping file (search is case " +
                           "and white-space sensitive). \n\tProvided field: " +
                           "%s. \n\tValid fields: %s" % (column_rename_ids,
                                                         ' '.join(mapping_headers)))

    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))

        # parse the mapping file on each pass through the loop, as filtering
        # operates on the parsed values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
Example #4
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    map_fp = opts.mapping
    biom_fp = opts.biom_file
    min_seqs_sample = opts.min_seqs_sample
    subject_category = opts.subject_name

    cleaned_fp = opts.clean_fp
    verbose = opts.verbose

    map_data, headers, comments = parse_mapping_file(open(map_fp, 'U'))
    biom_table = parse_biom_table(open(biom_fp, 'U'))

    # getting valid samples from biom file
    real_map_headers, real_map_data = filter_mapping_file(
        map_data, headers, biom_table.SampleIds, include_repeat_cols=False)

    if subject_category not in real_map_headers:
        raise ValueError('Column %s is not in the mapping file. Try one of: %s'
                         % (subject_category, real_map_headers))

    sorted_counts_per_sample = get_sorted_counts_per_sample(biom_table)

    mapping_file_tuple = (real_map_data, real_map_headers)

    # calculate the available subjects at each rarefaction level
    results = make_selectors(sorted_counts_per_sample, min_seqs_sample,
                             mapping_file_tuple, subject_category,
                             verbose=verbose)

    # save the output
    fout = open(cleaned_fp, 'w')
    fout.write('#Sequences\tSubjects\tSamples\tMetadata\n')
    fout.write('\n'.join(results))
    fout.close()
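For reference, the file written above is tab-separated, with one row per accepted rarefaction depth under the '#Sequences\tSubjects\tSamples\tMetadata' header; the values below are made up to illustrate the layout produced by make_selectors.

#Sequences	Subjects	Samples	Metadata
1000	20	3	Treatment,DOB,Description
2000	15	3	None
5000	9	2	None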
Example #5
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or min_count != 0 or
            not isinf(max_count) or sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination "
            "of those).")
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(open(mapping_fp, "U"), valid_states)
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            o = open(sample_id_fp, "U")
            sample_id_f_ids = set([l.strip().split()[0] for l in o if not l.startswith("#")])
            o.close()
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table, sample_ids_to_keep, min_count, max_count, negate_ids_to_keep=negate_sample_id_fp
    )

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(open(mapping_fp, "U"))
        mapping_headers, mapping_data = filter_mapping_file(mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, "w").write(format_mapping_file(mapping_headers, mapping_data))
Example #6
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    if sample_id_fp is not None:
        sample_id_f_ids = set([l.strip().split()[0]
                               for l in open(sample_id_fp, 'U')
                               if not l.startswith('#')])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, 'w').write(
            format_mapping_file(mapping_headers, mapping_data))
Example #7
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))
    output_f = open(opts.output_fp, 'w')

    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
    else:
        sample_ids_to_keep = otu_table.SampleIds

    if sample_id_fp is not None:
        sample_id_f_ids = set([
            l.strip().split()[0] for l in open(sample_id_fp, 'U')
            if not l.startswith('#')
        ])
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count, max_count)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.SampleIds)
        open(output_mapping_fp,
             'w').write(format_mapping_file(mapping_headers, mapping_data))
Example #8
0
            message += (' Offending sample identifier(s): %s.' %
                        ', '.join(sids_difference))
            print sids_difference

        option_parser.error(message)

    if (number_intersected_sids != required_number_of_sids and
            ignore_missing_samples):
        # keep only the samples that are mapped in the mapping file
        coords_headers, coords_data = keep_samples_from_pcoa_data(
            coords_headers, coords_data, sids_intersection)

    # ignore samples that exist in the coords but not in the mapping file.
    # Note: we're using sids_intersection, so if --ignore_missing_samples is
    # enabled we account for unmapped coords; otherwise the program will exit
    # before this point
    header, mapping_data = filter_mapping_file(mapping_data, header,
        sids_intersection, include_repeat_cols=True)

    # catch the errors that could occur when filling the mapping file values
    if missing_custom_axes_values:
        try:
            # the fact that this uses parse_metadata_state_descriptions means
            # that the option '-x Category:7;PH:12' works as well as the
            # script-interface-documented '-x Category:7 -x PH:12' form
            for val in missing_custom_axes_values:
                if ':' not in val:
                    option_parser.error("Not valid missing value for custom "
                        "axes: %s" % val)
            mapping_data = fill_mapping_field_from_mapping_file(
                mapping_data, header, ';'.join(missing_custom_axes_values))

        except AssertionError, e:
Example #9
0
def make_selectors(counts_per_sample,
                   minimum,
                   mapping_file_tuple,
                   subject_header_name,
                   verbose=False):
    """make the four column string needed to print in the selectors file

    Inputs:
    counts_per_sample: a sorted list of tuples with the sample identifier and
    the number of sequences.
    minimum: minimum number of sequences considered to be a valid state.
    mapping_file_tuple: a tuple with the data of a mapping file and the headers.
    subject_header_name: string identifying the name of the column in the 
    mapping file that represents a unique subject.

    Output:
    result: four columns string corresponding to number of sequences, subjects,
    number of samples and metadata fields.
    """

    # unwrap the mapping file
    mapping_data = mapping_file_tuple[0]
    mapping_headers = mapping_file_tuple[1]

    seqs_per_sample = [t[0] for t in counts_per_sample]

    head_val = None
    subj_val = None
    samp_sub = None
    main_map_cat = None  # guard against returning an unbound name below
    results = []

    depth = -1
    samples_per_subject = {}

    # store the index for convenience
    subject_index = mapping_headers.index(subject_header_name)
    list_of_subjects = [line[subject_index] for line in mapping_data]

    # initialize the samples_per_subject dictionary with as many keys as
    # subjects and values equal to the minimum number of samples among them
    for unique_subject in list(set(list_of_subjects)):
        samples_per_subject[unique_subject] = list_of_subjects.count(
            unique_subject)
    least_number_of_samples = min(samples_per_subject.values())
    for key, value in samples_per_subject.iteritems():
        samples_per_subject[key] = least_number_of_samples

    for sequences_per_sample_tuple in counts_per_sample:

        # there's no need to iterate if the minimum rarefaction depth is not
        # met, or if the depth is the same as the previous depth; that would
        # mean a repeated row with the same values in the output
        if (sequences_per_sample_tuple[0] < minimum or
                sequences_per_sample_tuple[0] == depth):
            continue

        if verbose:
            print 'Samples per subject: {0} @ depth: {1}'\
                .format(samples_per_subject, depth)

        # Some samples are not in the mapping file; just print those out
        sample_id = sequences_per_sample_tuple[1]
        try:
            current_subject = [
                line[subject_index] for line in mapping_data
                if line[0] == sample_id
            ][0]
        except IndexError:
            print 'Sample Id: {0} is not in the mapping file'.format(sample_id)
            continue

        # extract convenience data for ease of use
        depth = sequences_per_sample_tuple[0]
        remaining_ids = [
            _tuple[1] for _tuple in counts_per_sample if _tuple[0] >= depth
        ]

        filtered_headers, filtered_data = filter_mapping_file(
            mapping_data, mapping_headers, remaining_ids,
            include_repeat_cols=False)

        # Breaking when there are no subjects/individuals left
        if subject_header_name not in filtered_headers:
            break

        # numbers to be written in the selectors file
        number_of_subjects = len(samples_per_subject.keys())
        number_of_samples = min(samples_per_subject.values())

        if number_of_subjects * number_of_samples < 3:
            continue

        # format the output
        if not subj_val and not head_val and not samp_sub:
            results.append('%d\t%d\t%d\t%s' % (int(depth), number_of_subjects,
                                               number_of_samples,
                                               ','.join(filtered_headers[1:-1])))
            subj_val = number_of_subjects
            head_val = filtered_headers
            samp_sub = number_of_samples
            main_map_cat = filtered_headers
        else:
            if head_val != filtered_headers:
                results.append('%d\t%d\t%d\t%s' % (int(depth),
                                                   number_of_subjects,
                                                   number_of_samples,
                                                   ','.join(filtered_headers[1:-1])))
                head_val = filtered_headers
            elif samp_sub != number_of_samples:
                results.append('%d\t%d\t%d\tNone' % (int(depth),
                                                     number_of_subjects,
                                                     number_of_samples))
                samp_sub = number_of_samples
            elif subj_val != number_of_subjects:
                results.append('%d\t%d\t%d\tNone' % (int(depth),
                                                     number_of_subjects,
                                                     number_of_samples))
                subj_val = number_of_subjects

        # remove the currently processed sample and, if needed, the subject
        try:
            samples_per_subject[current_subject] -= 1
            if samples_per_subject[current_subject] == 0:
                del samples_per_subject[current_subject]
        except KeyError:
            pass

    return results, main_map_cat
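A toy invocation sketch for make_selectors, assuming the function above and its QIIME 1 dependencies (e.g. filter_mapping_file) are importable; the counts and mapping rows are fabricated for illustration.

# counts_per_sample holds sorted (sequence count, sample id) tuples
counts = [(100, 'S1'), (200, 'S2'), (300, 'S3'), (400, 'S4')]
mapping_tuple = ([['S1', 'subj_a', 'x'], ['S2', 'subj_a', 'y'],
                  ['S3', 'subj_b', 'x'], ['S4', 'subj_b', 'y']],
                 ['SampleID', 'Subject', 'Treatment'])
rows, categories = make_selectors(counts, 150, mapping_tuple, 'Subject')
for row in rows:
    print row  # tab-separated: depth, subjects, samples, metadata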
Example #10
0
def get_field_state_comparisons(dist_matrix_header,
                                dist_matrix,
                                mapping_header,
                                mapping,
                                field,
                                comparison_field_states,
                                suppress_symmetry_and_hollowness_check=False):
    """Returns a 2D dictionary relating distances between field states.

    The 2D dictionary is constructed such that each top-level key is a field
    state other than the field states in comparison_field_states. The
    second-level key is a field state from comparison_field_states, and the
    value at the (key, key) index is a list of distances between those two
    field states. Thus, given a field, this function will create comparisons
    between the specified comparison_field_states and all other field states.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        - dist_matrix_header: The distance matrix header, obtained from
                              parse.parse_distmat()
        - dist_matrix: The distance matrix, obtained from
                       parse.parse_distmat().
        - mapping_header: The mapping file header, obtained from
                          parse.parse_mapping_file()
        - mapping: The mapping file's contents, obtained from
                   parse.parse_mapping_file()
        - field: A field in the mapping file to do the comparisons on.
        - comparison_field_states: A list of strings specifying the field
          states to compare to all other field states. Cannot be an empty list.
        - suppress_symmetry_and_hollowness_check: By default, the input
          distance matrix will be checked for symmetry and hollowness. It is
          recommended to leave this check in place for safety, as the check
          is fairly fast. However, if you *know* you have a symmetric and
          hollow distance matrix, you can disable this check for small
          performance gains on extremely large distance matrices.
    """
    _validate_input(dist_matrix_header, dist_matrix, mapping_header, mapping,
                    field)

    # avoid empty groups of distances
    mapping_header, mapping = filter_mapping_file(mapping, mapping_header,
                                                  dist_matrix_header)

    # Make sure each comparison group field state is in the specified field.
    if not comparison_field_states:
        raise ValueError("You must provide at least one field state to "
                         "compare to all of the other field states.")
    mapping_data = [mapping_header]
    mapping_data.extend(mapping)
    groups = group_by_field(mapping_data, field)
    for field_state in comparison_field_states:
        if field_state not in groups:
            raise ValueError("The comparison group field state '%s' is not in "
                             "the provided mapping file's field '%s'." %
                             (field_state, field))

    # Grab a list of all other field states (besides the ones in
    # comparison_field_states). These will be the field states that the states
    # in comparison_field_states will be compared against.
    field_states = [
        group for group in groups.keys()
        if group not in comparison_field_states
    ]

    # Get between distance groupings for the field of interest.
    between_groupings = get_grouped_distances(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        within=False,
        suppress_symmetry_and_hollowness_check=suppress_symmetry_and_hollowness_check)

    # Build up our 2D dictionary giving the distances between a field state
    # and a comparison group field state by filtering the between_groupings
    # list down to only the comparisons that we want.
    result = {}
    for field_state in field_states:
        result[field_state] = {}
        for comp_field_state in comparison_field_states:
            result[field_state][comp_field_state] = []
            for group in between_groupings:
                if ((group[0] == field_state or group[1] == field_state)
                        and (group[0] == comp_field_state
                             or group[1] == comp_field_state)):
                    # We've found a group of distances between our comparison
                    # field state and the current field state, so keep the
                    # data.
                    result[field_state][comp_field_state] = group[2]
    return result
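A small worked call for the function above, assuming its qiime dependencies are importable; the 3x3 matrix is symmetric and hollow, as the docstring requires, and all values are fabricated.

from numpy import array  # parse.parse_distmat would normally produce these

dm_header = ['S1', 'S2', 'S3']
dm = array([[0.0, 0.5, 0.7],
            [0.5, 0.0, 0.4],
            [0.7, 0.4, 0.0]])
map_header = ['SampleID', 'Treatment']
map_data = [['S1', 'Control'], ['S2', 'Fast'], ['S3', 'Fast']]
comparisons = get_field_state_comparisons(dm_header, dm, map_header, map_data,
                                          'Treatment', ['Control'])
# comparisons['Fast']['Control'] holds the S1-S2 and S1-S3 distances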
Example #11
0
def get_field_state_comparisons(dist_matrix_header, dist_matrix,
                                mapping_header, mapping, field,
                                comparison_field_states,
                                suppress_symmetry_and_hollowness_check=False):
    """Returns a 2D dictionary relating distances between field states.

    The 2D dictionary is constructed such that each top-level key is a field
    state other than the field states in comparison_field_states. The
    second-level key is a field state from comparison_field_states, and the
    value at the (key, key) index is a list of distances between those two
    field states. Thus, given a field, this function will create comparisons
    between the specified comparison_field_states and all other field states.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        - dist_matrix_header: The distance matrix header, obtained from
                              parse.parse_distmat()
        - dist_matrix: The distance matrix, obtained from
                       parse.parse_distmat().
        - mapping_header: The mapping file header, obtained from
                          parse.parse_mapping_file()
        - mapping: The mapping file's contents, obtained from
                   parse.parse_mapping_file()
        - field: A field in the mapping file to do the comparisons on.
        - comparison_field_states: A list of strings specifying the field
          states to compare to all other field states. Cannot be an empty list.
        - suppress_symmetry_and_hollowness_check: By default, the input
          distance matrix will be checked for symmetry and hollowness. It is
          recommended to leave this check in place for safety, as the check
          is fairly fast. However, if you *know* you have a symmetric and
          hollow distance matrix, you can disable this check for small
          performance gains on extremely large distance matrices.
    """
    _validate_input(dist_matrix_header, dist_matrix, mapping_header, mapping,
                    field)

    # avoid empty groups of distances
    mapping_header, mapping = filter_mapping_file(mapping, mapping_header,
                                                  dist_matrix_header)

    # Make sure each comparison group field state is in the specified field.
    if not comparison_field_states:
        raise ValueError("You must provide at least one field state to "
                         "compare to all of the other field states.")
    mapping_data = [mapping_header]
    mapping_data.extend(mapping)
    groups = group_by_field(mapping_data, field)
    for field_state in comparison_field_states:
        if field_state not in groups:
            raise ValueError("The comparison group field state '%s' is not in "
                             "the provided mapping file's field '%s'."
                             % (field_state, field))

    # Grab a list of all other field states (besides the ones in
    # comparison_field_states). These will be the field states that the states
    # in comparison_field_states will be compared against.
    field_states = [group for group in groups.keys()
                    if group not in comparison_field_states]

    # Get between distance groupings for the field of interest.
    between_groupings = get_grouped_distances(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        within=False,
        suppress_symmetry_and_hollowness_check=suppress_symmetry_and_hollowness_check)

    # Build up our 2D dictionary giving the distances between a field state
    # and a comparison group field state by filtering the between_groupings
    # list down to only the comparisons that we want.
    result = {}
    for field_state in field_states:
        result[field_state] = {}
        for comp_field_state in comparison_field_states:
            result[field_state][comp_field_state] = []
            for group in between_groupings:
                if ((group[0] == field_state or group[1] == field_state)
                    and (group[0] == comp_field_state or
                         group[1] == comp_field_state)):
                    # We've found a group of distances between our comparison
                    # field state and the current field state, so keep the
                    # data.
                    result[field_state][comp_field_state] = group[2]
    return result
Example #12
0
def make_selectors(counts_per_sample, minimum, mapping_file_tuple,
                   subject_header_name, verbose=False):
    """make the four column string needed to print in the selectors file

    Inputs:
    counts_per_sample: a sorted list of tuples with the sample identifier and
    the number of sequences.
    minimum: minimum number of sequences considered to be a valid state.
    mapping_file_tuple: a tuple with the data of a mapping file and the headers.
    subject_header_name: string identifying the name of the column in the 
    mapping file that represents a unique subject.

    Output:
    result: four columns string corresponding to number of sequences, subjects,
    number of samples and metadata fields.
    """

    # unwrap the mapping file
    mapping_data = mapping_file_tuple[0]
    mapping_headers = mapping_file_tuple[1]

    seqs_per_sample = [t[0] for t in counts_per_sample]

    head_val = None
    subj_val = None
    samp_sub = None
    main_map_cat = None  # guard against returning an unbound name below
    results = []

    depth = -1
    samples_per_subject = {}

    # store the index for convenience
    subject_index = mapping_headers.index(subject_header_name)
    list_of_subjects = [line[subject_index] for line in mapping_data]

    # initialize the samples_per_subject dictionary with as many keys as
    # subjects and values equal to the minimum number of samples among them
    for unique_subject in list(set(list_of_subjects)):
        samples_per_subject[unique_subject] = list_of_subjects.count(
            unique_subject)
    least_number_of_samples = min(samples_per_subject.values())
    for key, value in samples_per_subject.iteritems():
        samples_per_subject[key] = least_number_of_samples

    for sequences_per_sample_tuple in counts_per_sample:

        # there's no need to iterate if the minimum rarefaction depth is not
        # met, or if the depth is the same as the previous depth; that would
        # mean a repeated row with the same values in the output
        if (sequences_per_sample_tuple[0] < minimum or
                sequences_per_sample_tuple[0] == depth):
            continue

        if verbose:
            print 'Samples per subject: {0} @ depth: {1}'\
                .format(samples_per_subject, depth)

        # Some samples are not in the mapping file; just print those out
        sample_id = sequences_per_sample_tuple[1]
        try:
            current_subject = [line[subject_index] for line in mapping_data
                               if line[0] == sample_id][0]
        except IndexError:
            print 'Sample Id: {0} is not in the mapping file'.format(sample_id)
            continue

        # extract convenience data for ease of use
        depth = sequences_per_sample_tuple[0]
        remaining_ids = [_tuple[1] for _tuple in counts_per_sample
                         if _tuple[0] >= depth]

        filtered_headers, filtered_data = filter_mapping_file(
            mapping_data, mapping_headers, remaining_ids,
            include_repeat_cols=False)

        # Breaking when there are no subjects/individuals left
        if subject_header_name not in filtered_headers:
            break

        # numbers to be written in the selectors file
        number_of_subjects = len(samples_per_subject.keys())
        number_of_samples = min(samples_per_subject.values())

        if number_of_subjects * number_of_samples < 3:
            continue

        # format the output
        if not subj_val and not head_val and not samp_sub:
            results.append('%d\t%d\t%d\t%s' % (int(depth), number_of_subjects,
                                               number_of_samples,
                                               ','.join(filtered_headers[1:-1])))
            subj_val = number_of_subjects
            head_val = filtered_headers
            samp_sub = number_of_samples
            main_map_cat = filtered_headers
        else:
            if head_val != filtered_headers:
                results.append('%d\t%d\t%d\t%s' % (int(depth),
                                                   number_of_subjects,
                                                   number_of_samples,
                                                   ','.join(filtered_headers[1:-1])))
                head_val = filtered_headers
            elif samp_sub != number_of_samples:
                results.append('%d\t%d\t%d\tNone' % (int(depth),
                                                     number_of_subjects,
                                                     number_of_samples))
                samp_sub = number_of_samples
            elif subj_val != number_of_subjects:
                results.append('%d\t%d\t%d\tNone' % (int(depth),
                                                     number_of_subjects,
                                                     number_of_samples))
                subj_val = number_of_subjects

        # remove the currently processed sample and, if needed, the subject
        try:
            samples_per_subject[current_subject] -= 1
            if samples_per_subject[current_subject] == 0:
                del samples_per_subject[current_subject]
        except KeyError:
            pass

    return results, main_map_cat
Example #13
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or min_count != 0
            or not isinf(max_count) or sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(opts.input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        sample_ids_to_keep = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            o = open(sample_id_fp, 'U')
            sample_id_f_ids = set(
                [l.strip().split()[0] for l in o if not l.startswith('#')])
            o.close()
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table,
        sample_ids_to_keep,
        min_count,
        max_count,
        negate_ids_to_keep=negate_sample_id_fp)

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        mapping_data, mapping_headers, _ = parse_mapping_file(
            open(mapping_fp, 'U'))
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        open(output_mapping_fp, 'w').write(
            format_mapping_file(mapping_headers, mapping_data))
Example #14
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    otu_table_fp = opts.otu_table_fp
    output_dir = opts.output_dir
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    verbose = opts.verbose
    print_only = opts.print_only
    seqs_per_sample = int(opts.seqs_per_sample)
    parallel = opts.parallel
    min_seqs_sample = opts.min_seqs_sample
    subject_category = opts.subject_name

    try:
        makedirs(output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose"
                " a different directory, or force overwrite with -f.")


    ## ******************** make_evident_selectors ********************
    ## The code for make_evident_selectors.py lives here; it has to run before
    ## the params validation because we need to know the main categories
    ## before creating the params file
    map_data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    biom_table = parse_biom_table(open(otu_table_fp, 'U'))

    # getting valid samples from biom file
    real_map_headers, real_map_data = filter_mapping_file(
        map_data, headers, biom_table.SampleIds, include_repeat_cols=False)

    if subject_category not in real_map_headers:
        option_parser.error('Column %s is not in the mapping file. Try one of:'
                            ' %s' % (subject_category, real_map_headers))

    sorted_counts_per_sample = get_sorted_counts_per_sample(biom_table)

    mapping_file_tuple = (real_map_data, real_map_headers)

    # calculate the available subjects at each rarefaction level
    results, main_map_cat = make_selectors(
        sorted_counts_per_sample, min_seqs_sample, mapping_file_tuple,
        subject_category, verbose=verbose)

    fout = open(join(output_dir, 'selectors.txt'), 'w')
    fout.write('#Sequences\tSubjects\tSamples\tMetadata\n')
    fout.write('\n'.join(results))
    fout.close()

    fout = open(join(output_dir, 'mapping_file.txt'), 'w')
    fout.write(format_mapping_file(real_map_headers, real_map_data))
    fout.close()
    ## ******************** make_evident_selectors ********************

    fout = open(join(output_dir, 'study_preferences.txt'), 'w')
    fout.write('%d\n' % seqs_per_sample)
    fout.write('%s\n' % subject_category)
    fout.close()

    ## ******************** filter_samples_from_otu_table ********************
    ## Filter the original biom file to keep only samples with at least
    ## seqs_per_sample sequences, to avoid ugly plots
    alpha_biom_file = join(output_dir, 'filtered_otu_table_for_alpha.biom')
    fout = open(alpha_biom_file, 'w')
    sample_ids_to_keep = biom_table.SampleIds
    filtered_otu_table = filter_samples_from_otu_table(biom_table,
                                                       sample_ids_to_keep,
                                                       min_count=seqs_per_sample,
                                                       max_count=inf)
    fout.write(format_biom_table(filtered_otu_table))
    fout.close()
    ## ******************** filter_samples_from_otu_table ********************

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            option_parser.error("Can't open parameters file (%s). Does it exist? " \
            "Do you have read access?" % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        params = parse_qiime_parameters(
            ['beta_diversity:metrics unweighted_unifrac',
             'make_rarefaction_plots:prefs_path %s' % join(output_dir,
                                                           'prefs.txt'),
             'make_rarefaction_plots:colorby %s' % ','.join(main_map_cat),
             'make_rarefaction_plots:output_type memory',
             'multiple_rarefactions:min %d' % int(seqs_per_sample / 4),
             'multiple_rarefactions:max %d' % (seqs_per_sample + 1),
             'multiple_rarefactions:step %d' % int(seqs_per_sample / 4),
             'multiple_rarefactions:num-reps 4'])
        # an empty list returns an empty defaultdict for now
    
    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser)


    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    
    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates
    
    copyfile(otu_table_fp, join(output_dir,'raw.biom'))
    
    run_beta_diversity_through_plots(otu_table_fp=otu_table_fp,
                                     mapping_fp=mapping_fp,
                                     output_dir=output_dir,
                                     command_handler=command_handler,
                                     params=params,
                                     qiime_config=qiime_config,
                                     color_by_interesting_fields_only=False,
                                     sampling_depth=seqs_per_sample,
                                     histogram_categories=None,
                                     tree_fp=tree_fp,
                                     parallel=parallel,
                                     suppress_3d_plots=True,
                                     suppress_2d_plots=True,
                                     status_update_callback=status_update_callback)
    
    output_dir = join(output_dir, 'alpha')
    run_alpha_rarefaction(otu_table_fp=alpha_biom_file,
                          mapping_fp=mapping_fp,
                          output_dir=output_dir,
                          command_handler=command_handler,
                          params=params,
                          qiime_config=qiime_config,
                          tree_fp=tree_fp,
                          num_steps=4,
                          parallel=parallel,
                          min_rare_depth=10,
                          max_rare_depth=20,
                          status_update_callback=status_update_callback,
                          plot_stderr_and_stddev=True)