def test_parse_coords_exceptions(self):
    """Check exceptions are raised accordingly with missing information"""
    # every one of these inputs should be rejected by parse_coords:
    # three are coords files each missing a required section, the last
    # one is a completely different kind of file
    malformed_inputs = [
        COORDS_NO_EIGENVALS,  # missing eigenvalues line
        COORDS_NO_PCNTS,      # missing percentages explained line
        COORDS_NO_VECTORS,    # missing vector number line
        taxa_summary1,        # a whole different file (taxa summary)
    ]
    for malformed in malformed_inputs:
        with self.assertRaises(QiimeParseError):
            parse_coords(malformed.splitlines())
def test_parse_coords(self):
    """parse_coords should handle coords file"""
    coords = """pc vector number\t1\t2\t3
A\t0.11\t0.09\t0.23
B\t0.03\t0.07\t-0.26
C\t0.12\t0.06\t-0.32
eigvals\t4.94\t1.79\t1.50
% variation explained\t14.3\t5.2\t4.3
""".splitlines()
    obs = parse_coords(coords)
    exp = (['A', 'B', 'C'],
           array([[.11, .09, .23], [.03, .07, -.26], [.12, .06, -.32]]),
           array([4.94, 1.79, 1.50]),
           array([14.3, 5.2, 4.3]))
    # test the header and the values apart from each other
    self.assertEqual(obs[0], exp[0])
    assert_almost_equal(obs[1], exp[1])
    # BUG FIX: the expected eigenvalues and percentages explained were
    # built in `exp` but never asserted; check them as well
    assert_almost_equal(obs[2], exp[2])
    assert_almost_equal(obs[3], exp[3])
def test_parse_coords(self):
    """parse_coords should handle coords file"""
    coords = """pc vector number\t1\t2\t3
A\t0.11\t0.09\t0.23
B\t0.03\t0.07\t-0.26
C\t0.12\t0.06\t-0.32
eigvals\t4.94\t1.79\t1.50
% variation explained\t14.3\t5.2\t4.3
""".splitlines()
    obs = parse_coords(coords)
    exp = (
        ["A", "B", "C"],
        array([[0.11, 0.09, 0.23], [0.03, 0.07, -0.26], [0.12, 0.06, -0.32]]),
        array([4.94, 1.79, 1.50]),
        array([14.3, 5.2, 4.3]),
    )
    # test the header and the values apart from each other
    self.assertEqual(obs[0], exp[0])
    assert_almost_equal(obs[1], exp[1])
    # BUG FIX: the expected eigenvalues and percentages explained were
    # built in `exp` but never asserted; check them as well
    assert_almost_equal(obs[2], exp[2])
    assert_almost_equal(obs[3], exp[3])
def main():
    """Validate CLI options and input files for the PCoA plotting script.

    Parses the command line options, validates the axis/segment counts,
    parses the mapping file, then loads either a single coordinates file
    or a directory of coordinates files (jackknifed / comparison plots),
    and finally validates an optional taxa (contingency) table.  Every
    validation failure is reported through option_parser.error, which
    terminates the program.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes<3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if number_of_segments<4 or number_of_segments>14:
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    # True means the files in a comparison plot are connected in the order
    # they appear in; set to False whenever a master coordinates file is known
    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes!=None and len(custom_axes.split(','))>1 and\
        isdir(input_coords):
        option_parser.error(('Jackknifed plots are limited to one custom axis, '
            'currently trying to use: %s. Make sure you use only one.'
            % custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if isdir(input_coords) == False and compare_plots:
        option_parser.error('Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp,'U'))

        # use this set variable to make presence/absensce checks faster
        lookup_header = set(header)
    # NOTE(review): bare except deliberately converts any parse failure into
    # a user-facing CLI error instead of a traceback
    except:
        option_parser.error(('The metadata mapping file \'%s\' does not seem '
            'to be formatted correctly, verify the formatting is QIIME '
            'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct=[],[],[],[]

        # iterate only over the non-hidden files and not folders and if anything
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py suffixed in procrustes_results.txt
        coord_fps = [join(input_coords, f) for f in listdir(input_coords) if
            not f.startswith('.') and not
            isdir(join(abspath(input_coords),f)) and not
            f.endswith('procrustes_results.txt')]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and compare_plots == False:
            if master_pcoa in coord_fps: # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        elif master_pcoa == None and len([f for f in coord_fps if f.endswith(
            '_transformed_reference.txt')]):
            master_pcoa = [f for f in coord_fps if f.endswith(
                '_transformed_reference.txt')][0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # parse every coordinates file, collecting unparseable paths so that
        # all offenders can be reported in a single error message
        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues,_coords_pct=\
                    parse_coords(open(fp,'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(('The following file(s): \'%s\' could not be '
                'parsed properly. Make sure the input folder only contains '
                'coordinates files.') % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globablly shared ids
        non_shared_ids = set(sum([list(set(sum(coords_headers, []))^set(e))
            for e in coords_headers],[]))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(('The following sample identifier(s): \'%s\''
                'are not shared between all the files. The files used to '
                'make a jackknifed PCoA plot or coordinate comparison plot ('
                'procustes plot) must share all the same sample identifiers'
                'between each other.')%', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of samples ids that are shared between coords and mapping files
        sids_intersection=list(set(zip(*mapping_data)[0])&set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference=list(set(_coords_headers)-set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct =\
                parse_coords(open(input_coords,'U'))
        # this exception was noticed when there were letters in the coords file
        # other exeptions should be catched here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(('The PCoA file \'%s\' does not seem to be a '
                'coordinates formatted file, verify by manually inspecting '
                'the contents.') % input_coords)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(set(zip(*mapping_data)[0])&set(coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(coords_headers)-set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(open(
                taxa_fp, 'U'), count_map_f=float, remove_empty_rows=True)
        except ValueError, e:
            option_parser.error('There was a problem parsing the --taxa_fp: %s'%
                e.message)

        # make sure there are matching sample ids with the otu table
        if not len(list(set(sids_intersection)&set(otu_sample_ids))):
            option_parser.error('The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error('Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')
def main():
    """Validate CLI options and input files for the PCoA plotting script.

    Parses the command line options, validates the axis/segment counts,
    parses the mapping file, then loads either a single coordinates file
    or a directory of coordinates files (jackknifed / comparison plots),
    and finally validates an optional taxa (contingency) table.  Every
    validation failure is reported through option_parser.error, which
    terminates the program.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    # True means the files in a comparison plot are connected in the order
    # they appear in; set to False whenever a master coordinates file is known
    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes!=None and len(custom_axes.split(','))>1 and\
        isdir(input_coords):
        option_parser.error(
            ('Jackknifed plots are limited to one custom axis, '
             'currently trying to use: %s. Make sure you use only one.'
             % custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if isdir(input_coords) == False and compare_plots:
        option_parser.error(
            'Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))

        # use this set variable to make presence/absensce checks faster
        lookup_header = set(header)
    # NOTE(review): bare except deliberately converts any parse failure into
    # a user-facing CLI error instead of a traceback
    except:
        option_parser.error(
            ('The metadata mapping file \'%s\' does not seem '
             'to be formatted correctly, verify the formatting is QIIME '
             'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct=[],[],[],[]

        # iterate only over the non-hidden files and not folders and if anything
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py suffixed in procrustes_results.txt
        coord_fps = [
            join(input_coords, f) for f in listdir(input_coords)
            if not f.startswith('.')
            and not isdir(join(abspath(input_coords), f))
            and not f.endswith('procrustes_results.txt')
        ]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and compare_plots == False:
            if master_pcoa in coord_fps: # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        elif master_pcoa == None and len(
                [f for f in coord_fps
                 if f.endswith('_transformed_reference.txt')]):
            master_pcoa = [
                f for f in coord_fps
                if f.endswith('_transformed_reference.txt')
            ][0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # parse every coordinates file, collecting unparseable paths so that
        # all offenders can be reported in a single error message
        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues,_coords_pct=\
                    parse_coords(open(fp,'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(
                ('The following file(s): \'%s\' could not be '
                 'parsed properly. Make sure the input folder only contains '
                 'coordinates files.') % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globablly shared ids
        non_shared_ids = set(
            sum([
                list(set(sum(coords_headers, [])) ^ set(e))
                for e in coords_headers
            ], []))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(
                ('The following sample identifier(s): \'%s\''
                 'are not shared between all the files. The files used to '
                 'make a jackknifed PCoA plot or coordinate comparison plot ('
                 'procustes plot) must share all the same sample identifiers'
                 'between each other.') % ', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(
            set(zip(*mapping_data)[0]) & set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(
            set(_coords_headers) - set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])
    else:
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct =\
                parse_coords(open(input_coords,'U'))
        # this exception was noticed when there were letters in the coords file
        # other exeptions should be catched here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(
                ('The PCoA file \'%s\' does not seem to be a '
                 'coordinates formatted file, verify by manually inspecting '
                 'the contents.') % input_coords)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(
            set(zip(*mapping_data)[0]) & set(coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(
            set(coords_headers) - set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(
                open(taxa_fp, 'U'), count_map_f=float, remove_empty_rows=True)
        except ValueError, e:
            option_parser.error(
                'There was a problem parsing the --taxa_fp: %s' % e.message)

        # make sure there are matching sample ids with the otu table
        if not len(list(set(sids_intersection) & set(otu_sample_ids))):
            option_parser.error(
                'The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error(
                'Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')