Example #1
0
 def test_parse_mapping_table(self):
     """Successfully parses a correct mapping table file"""
     obs_headers, obs_dict = parse_mapping_table(self.map_table_correct)
     exp_headers = ['header1', 'header2', 'header3', 'header4']
     exp_dict = {
         'some/path/to/mapping_1.txt': {
             'header1': 'h1_v1',
             'header2': 'h2_v1',
             'header3': 'h3_v1',
             'header4': 'h4_v1',
         },
         'some/path/to/mapping_2.txt': {
             'header1': 'h1_v2',
             'header2': 'h2_v1',
             'header3': 'h3_v2',
             'header4': 'h4_v1',
         },
         'some/path/to/mapping_3.txt': {
             'header1': 'h1_v3',
             'header2': 'h2_v2',
             'header3': 'h3_v1',
             'header4': 'h4_v2',
         }
     }
     self.assertEqual(obs_headers, exp_headers)
     self.assertEqual(obs_dict, exp_dict)
Example #2
0
 def test_parse_mapping_table(self):
     """Successfully parses a correct mapping table file"""
     obs_headers, obs_dict = parse_mapping_table(self.map_table_correct)
     exp_headers = ['header1', 'header2', 'header3', 'header4']
     exp_dict = {
         'some/path/to/mapping_1.txt': {
             'header1': 'h1_v1',
             'header2': 'h2_v1',
             'header3': 'h3_v1',
             'header4': 'h4_v1',
         },
         'some/path/to/mapping_2.txt': {
             'header1': 'h1_v2',
             'header2': 'h2_v1',
             'header3': 'h3_v2',
             'header4': 'h4_v1',
         },
         'some/path/to/mapping_3.txt': {
             'header1': 'h1_v3',
             'header2': 'h2_v2',
             'header3': 'h3_v1',
             'header4': 'h4_v2',
         }
     }
     self.assertEqual(obs_headers, exp_headers)
     self.assertEqual(obs_dict, exp_dict)
Example #3
0
def microbiome_model_test(base_dir, lines, models, taxa_level, category, sort,
    subsampling_depth, num_subsamples, output_dir):
    """ Tests the microbiome models listed in 'models'

    Inputs:
        base_dir: base common directory of all mapping files
        lines: mapping table file lines
        models: list of models to test
        category: category to use in the gradient or subpopulation models
        sort: list of category values sorted, or 'ascendant' or 'descendant'
            to use in the gradient model
        subsampling_depth: number of sequences to keep in each subsample
        num_subsamples: number of subsamples
        output_dir: output dirpath to store the results
    """
    # Parse the mapping table file and get the normalized headers and 
    # category translation
    headers, mapping_table_dict = parse_mapping_table(lines)
    # Check that all the mapping files listed in the mapping table exist
    check_exist_filepaths(base_dir, mapping_table_dict.keys())
    # Test the different models
    if 'core' in models:
        # Perform core model testing
        core_model_test(base_dir, mapping_table_dict, taxa_level, output_dir)
    if 'gradient' in models or 'subpopulation' in models:
        # For the gradient and subpopulation models we need to get the 
        # profiles by category value
        profiles = {}
        # Keep track of the mapping files not used in the test
        unused_maps = []
        for mapping_file in mapping_table_dict:
            mapping_fp = join(base_dir, mapping_file)
            mapping_category = mapping_table_dict[mapping_file][category]
            if mapping_category == "No":
                # The mapping file do not have data for this category
                unused_maps.append(mapping_file)
            elif mapping_category == "Yes":
                # 'Yes' its only supported for the category "HEALTHY"
                if category == "HEALTHY":
                    # All the study has been done in healthy people
                    # get the studies by SampleID
                    ret = make_profiles_by_category(mapping_fp, taxa_level, "SampleID")
                    # Get a list of profiles
                    profile_list = [ret[k][0] for k in ret]
                    # Add the list of profiles of this mapping file to the 
                    # previous profiles
                    if 'healthy' in profiles:
                        profiles['healthy'].extend(profile_list)
                    else:
                        profiles['healthy'] = profile_list
                else:
                    raise ValueError, "The value 'Yes' in the mapping table" + \
                        " it's only supported for the category 'HEALTHY'"
            else:
                # Generate the profiles by category of this mapping file
                map_profiles = make_profiles_by_category(mapping_fp, taxa_level, 
                                mapping_category)
                # Add the profiles of this mapping file to the previous profiles
                profiles = unify_dictionaries(profiles, map_profiles)
        # Get the different values of the category in case that we need to
        # sort them (for the gradient model)
        values = profiles.keys()
        if 'gradient' in models:
            # If we have to test the gradient model, we have to use the values
            # in that category sorted
            if sort in ['ascendant', 'descendant']:
                # We remove any None value in order to properly order
                profiles.pop("None")
                values = sort_dictionary_keys(profiles,
                                    descendant=(sort=='descendant'))
            else:
                # We use the user defined sort of the values
                sort = sort.split(',')
                if len(values) != len(sort):
                    raise ValueError, "The number of values in the sorted " + \
                        "list and the number of values found in the mapping" + \
                        " file are not the same."
                values = sort
        # Create a folder to store the subsampled similarity matrices
        sim_mat_folder = join(output_dir, 'subsampled_matrices')
        if not exists(sim_mat_folder):
            mkdir(sim_mat_folder)
        # Initialize matrix list and profiles list
        matrix_list = []
        profiles_list = []
        for i in range(num_subsamples):
            # Subsample the profiles
            subsampled_profiles = subsample_profiles(profiles, 
                                                subsampling_depth, values)
            # Build similarity matrix from bootstrapped profiles
            sim_mat, group_profiles = build_similarity_matrix(
                                                    subsampled_profiles, values)
            matrix_list.append(sim_mat)
            profiles_list.append(group_profiles)
            # Store the similarity matrix in a file
            sim_mat_fp = join(sim_mat_folder, 'similarity_matrix_%d.txt' % i)
            write_similarity_matrix(sim_mat, values, sim_mat_fp)
        # Build consensus matrix
        consensus_mat = build_consensus_matrix(matrix_list)
        # Store the consensus matrix in a file
        cons_mat_fp = join(output_dir, 'consensus_matrix.txt')
        write_similarity_matrix(consensus_mat, values, cons_mat_fp)
        # Build consensus profiles
        consensus_profiles = build_consensus_profiles(profiles_list)
        # Store the profile of each group
        for key in consensus_profiles:
            prof_fp = join(output_dir, str(key + \
                                                '_consensus_profile.txt'))
            write_profile(consensus_profiles[key], prof_fp, consensus=True)
        # Store in a file the mapping files not used for the similarity matrix
        unused_maps_fp = join(output_dir, 'unused_mapping_files.txt')
        write_unused_mapping_files(unused_maps, unused_maps_fp)
        if 'subpopulation' in models:
            # Perform subpopulation model test
            subpopulation_model_test(consensus_mat, category, output_dir)
        if 'gradient' in models:
            # Perform gradient model test
            gradient_model_test(consensus_profiles, consensus_mat, category,
                                                            values, output_dir)
Example #4
0
def microbiome_model_test(base_dir, lines, models, taxa_level, category, sort,
                          subsampling_depth, num_subsamples, output_dir):
    """ Tests the microbiome models listed in 'models'

    Inputs:
        base_dir: base common directory of all mapping files
        lines: mapping table file lines
        models: list of models to test
        category: category to use in the gradient or subpopulation models
        sort: list of category values sorted, or 'ascendant' or 'descendant'
            to use in the gradient model
        subsampling_depth: number of sequences to keep in each subsample
        num_subsamples: number of subsamples
        output_dir: output dirpath to store the results
    """
    # Parse the mapping table file and get the normalized headers and
    # category translation
    headers, mapping_table_dict = parse_mapping_table(lines)
    # Check that all the mapping files listed in the mapping table exist
    check_exist_filepaths(base_dir, mapping_table_dict.keys())
    # Test the different models
    if 'core' in models:
        # Perform core model testing
        core_model_test(base_dir, mapping_table_dict, taxa_level, output_dir)
    if 'gradient' in models or 'subpopulation' in models:
        # For the gradient and subpopulation models we need to get the
        # profiles by category value
        profiles = {}
        # Keep track of the mapping files not used in the test
        unused_maps = []
        for mapping_file in mapping_table_dict:
            mapping_fp = join(base_dir, mapping_file)
            mapping_category = mapping_table_dict[mapping_file][category]
            if mapping_category == "No":
                # The mapping file do not have data for this category
                unused_maps.append(mapping_file)
            elif mapping_category == "Yes":
                # 'Yes' its only supported for the category "HEALTHY"
                if category == "HEALTHY":
                    # All the study has been done in healthy people
                    # get the studies by SampleID
                    ret = make_profiles_by_category(mapping_fp, taxa_level,
                                                    "SampleID")
                    # Get a list of profiles
                    profile_list = [ret[k][0] for k in ret]
                    # Add the list of profiles of this mapping file to the
                    # previous profiles
                    if 'healthy' in profiles:
                        profiles['healthy'].extend(profile_list)
                    else:
                        profiles['healthy'] = profile_list
                else:
                    raise ValueError, "The value 'Yes' in the mapping table" + \
                        " it's only supported for the category 'HEALTHY'"
            else:
                # Generate the profiles by category of this mapping file
                map_profiles = make_profiles_by_category(
                    mapping_fp, taxa_level, mapping_category)
                # Add the profiles of this mapping file to the previous profiles
                profiles = unify_dictionaries(profiles, map_profiles)
        # Get the different values of the category in case that we need to
        # sort them (for the gradient model)
        values = profiles.keys()
        if 'gradient' in models:
            # If we have to test the gradient model, we have to use the values
            # in that category sorted
            if sort in ['ascendant', 'descendant']:
                # We remove any None value in order to properly order
                profiles.pop("None")
                values = sort_dictionary_keys(
                    profiles, descendant=(sort == 'descendant'))
            else:
                # We use the user defined sort of the values
                sort = sort.split(',')
                if len(values) != len(sort):
                    raise ValueError, "The number of values in the sorted " + \
                        "list and the number of values found in the mapping" + \
                        " file are not the same."
                values = sort
        # Create a folder to store the subsampled similarity matrices
        sim_mat_folder = join(output_dir, 'subsampled_matrices')
        if not exists(sim_mat_folder):
            mkdir(sim_mat_folder)
        # Initialize matrix list and profiles list
        matrix_list = []
        profiles_list = []
        for i in range(num_subsamples):
            # Subsample the profiles
            subsampled_profiles = subsample_profiles(profiles,
                                                     subsampling_depth, values)
            # Build similarity matrix from bootstrapped profiles
            sim_mat, group_profiles = build_similarity_matrix(
                subsampled_profiles, values)
            matrix_list.append(sim_mat)
            profiles_list.append(group_profiles)
            # Store the similarity matrix in a file
            sim_mat_fp = join(sim_mat_folder, 'similarity_matrix_%d.txt' % i)
            write_similarity_matrix(sim_mat, values, sim_mat_fp)
        # Build consensus matrix
        consensus_mat = build_consensus_matrix(matrix_list)
        # Store the consensus matrix in a file
        cons_mat_fp = join(output_dir, 'consensus_matrix.txt')
        write_similarity_matrix(consensus_mat, values, cons_mat_fp)
        # Build consensus profiles
        consensus_profiles = build_consensus_profiles(profiles_list)
        # Store the profile of each group
        for key in consensus_profiles:
            prof_fp = join(output_dir, str(key + \
                                                '_consensus_profile.txt'))
            write_profile(consensus_profiles[key], prof_fp, consensus=True)
        # Store in a file the mapping files not used for the similarity matrix
        unused_maps_fp = join(output_dir, 'unused_mapping_files.txt')
        write_unused_mapping_files(unused_maps, unused_maps_fp)
        if 'subpopulation' in models:
            # Perform subpopulation model test
            subpopulation_model_test(consensus_mat, category, output_dir)
        if 'gradient' in models:
            # Perform gradient model test
            gradient_model_test(consensus_profiles, consensus_mat, category,
                                values, output_dir)