Ejemplo n.º 1
0
    def test_parse_coords_exceptions(self):
        """Check QiimeParseError is raised when required information is missing

        Every input below is expected to be rejected by parse_coords; the
        return value is irrelevant, so it is intentionally not stored.
        """

        # missing eigenvalues line
        with self.assertRaises(QiimeParseError):
            parse_coords(COORDS_NO_EIGENVALS.splitlines())
        # missing percentages explained line
        with self.assertRaises(QiimeParseError):
            parse_coords(COORDS_NO_PCNTS.splitlines())
        # missing vector number line
        with self.assertRaises(QiimeParseError):
            parse_coords(COORDS_NO_VECTORS.splitlines())

        # a whole different file (taxa summary)
        with self.assertRaises(QiimeParseError):
            parse_coords(taxa_summary1.splitlines())
Ejemplo n.º 2
0
    def test_parse_coords_exceptions(self):
        """Check QiimeParseError is raised when required information is missing

        Every input below is expected to be rejected by parse_coords; the
        return value is irrelevant, so it is intentionally not stored.
        """

        # missing eigenvalues line
        with self.assertRaises(QiimeParseError):
            parse_coords(COORDS_NO_EIGENVALS.splitlines())
        # missing percentages explained line
        with self.assertRaises(QiimeParseError):
            parse_coords(COORDS_NO_PCNTS.splitlines())
        # missing vector number line
        with self.assertRaises(QiimeParseError):
            parse_coords(COORDS_NO_VECTORS.splitlines())

        # a whole different file (taxa summary)
        with self.assertRaises(QiimeParseError):
            parse_coords(taxa_summary1.splitlines())
Ejemplo n.º 3
0
    def test_parse_coords(self):
        """parse_coords should handle coords file"""
        coords = """pc vector number\t1\t2\t3
A\t0.11\t0.09\t0.23
B\t0.03\t0.07\t-0.26
C\t0.12\t0.06\t-0.32


eigvals\t4.94\t1.79\t1.50
% variation explained\t14.3\t5.2\t4.3


""".splitlines()
        obs = parse_coords(coords)
        exp = (['A', 'B', 'C'],
               array([[.11, .09, .23], [.03, .07, -.26], [.12, .06, -.32]]),
               array([4.94, 1.79, 1.50]), array([14.3, 5.2, 4.3]))
        # test the header and the values apart from each other
        self.assertEqual(obs[0], exp[0])
        assert_almost_equal(obs[1], exp[1])
Ejemplo n.º 4
0
    def test_parse_coords(self):
        """parse_coords should handle coords file"""
        coords = """pc vector number\t1\t2\t3
A\t0.11\t0.09\t0.23
B\t0.03\t0.07\t-0.26
C\t0.12\t0.06\t-0.32


eigvals\t4.94\t1.79\t1.50
% variation explained\t14.3\t5.2\t4.3


""".splitlines()
        obs = parse_coords(coords)
        exp = (
            ["A", "B", "C"],
            array([[0.11, 0.09, 0.23], [0.03, 0.07, -0.26], [0.12, 0.06, -0.32]]),
            array([4.94, 1.79, 1.50]),
            array([14.3, 5.2, 4.3]),
        )
        # test the header and the values apart from each other
        self.assertEqual(obs[0], exp[0])
        assert_almost_equal(obs[1], exp[1])
Ejemplo n.º 5
0
def main():
    """Validate the command line options and parse the main input files.

    The options come from parse_command_line_parameters(**script_info).  The
    checks run in this order: number of axes and segments, custom axes and
    --compare_plots flag combinations, the metadata mapping file, the
    coordinates file (or every usable file when input_coords is a directory)
    and, when given, the taxa summary table.  Every problem found is reported
    through option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that at least 3 axes were requested
    if number_of_axes < 3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # a directory means jackknifing or coordinate comparison; the answer is
    # needed by several of the checks below so look it up only once
    input_is_dir = isdir(input_coords)

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if (custom_axes is not None and len(custom_axes.split(',')) > 1 and
            input_is_dir):
        option_parser.error(('Jackknifed plots are limited to one custom axis, '
            'currently trying to use: %s. Make sure you use only one.' %
            custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if not input_is_dir and compare_plots:
        option_parser.error('Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        with open(map_fp, 'U') as map_f:
            mapping_data, header, comments = parse_mapping_file(map_f)

        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
    # any regular failure means the file is unusable; KeyboardInterrupt and
    # SystemExit are deliberately not swallowed here
    except Exception:
        option_parser.error(('The metadata mapping file \'%s\' does not seem '
            'to be formatted correctly, verify the formatting is QIIME '
            'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if input_is_dir:
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct = (
            [], [], [], [])

        # iterate only over the non-hidden files and not folders and if anything
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py suffixed in procrustes_results.txt
        coord_fps = [join(input_coords, f) for f in listdir(input_coords)
                     if not f.startswith('.')
                     and not isdir(join(abspath(input_coords), f))
                     and not f.endswith('procrustes_results.txt')]

        # this could happen and we rather avoid this problem
        if not coord_fps:
            option_parser.error('Could not use any of the files in the input '
                'directory.')

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        transformed_refs = [f for f in coord_fps
                            if f.endswith('_transformed_reference.txt')]

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        # (`== False` is kept as is: `not compare_plots` would also match None)
        if master_pcoa and compare_plots == False:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            # NOTE(review): when the master is not one of the files in the
            # directory the list is left untouched -- confirm this is intended
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # no master was given but the folder contains a reference file
        elif master_pcoa is None and transformed_refs:
            master_pcoa = transformed_refs[0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                # the context manager closes the file even if parsing fails
                with open(fp, 'U') as coords_f:
                    (_coords_headers, _coords_data, _coords_eigenvalues,
                     _coords_pct) = parse_coords(coords_f)
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(('The following file(s): \'%s\' could not be '
                'parsed properly. Make sure the input folder only contains '
                'coordinates files.') % ', '.join(offending_coords_fp))

        # every sample id seen in any of the files, computed only once
        all_sample_ids = set(sum(coords_headers, []))

        # check all files contain the same sample identifiers: collect the
        # sample ids that are in one of the sets of sample ids but not in the
        # globally shared ids
        non_shared_ids = set()
        for e in coords_headers:
            non_shared_ids.update(all_sample_ids ^ set(e))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(('The following sample identifier(s): \'%s\' '
                'are not shared between all the files. The files used to '
                'make a jackknifed PCoA plot or coordinate comparison plot ('
                'procrustes plot) must share all the same sample identifiers '
                'between each other.') % ', '.join(non_shared_ids))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(all_sample_ids)

        # the first column of every row of the mapping file is the sample id
        mapping_sample_ids = set(row[0] for row in mapping_data)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(mapping_sample_ids & all_sample_ids)

        # sample ids that are not mapped but are in the coords
        sids_difference = list(all_sample_ids - mapping_sample_ids)

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            with open(input_coords, 'U') as coords_f:
                coords_headers, coords_data, coords_eigenvalues, coords_pct = (
                    parse_coords(coords_f))
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(('The PCoA file \'%s\' does not seem to be a '
                'coordinates formatted file, verify by manually inspecting '
                'the contents.') % input_coords)

        # the first column of every row of the mapping file is the sample id
        mapping_sample_ids = set(row[0] for row in mapping_data)
        coords_sample_ids = set(coords_headers)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(mapping_sample_ids & coords_sample_ids)
        # sample ids that are not mapped but are in the coords
        sids_difference = list(coords_sample_ids - mapping_sample_ids)
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            with open(taxa_fp, 'U') as taxa_f:
                otu_sample_ids, lineages, otu_table, _ = parse_otu_table(
                    taxa_f, count_map_f=float, remove_empty_rows=True)
        except ValueError as e:
            # str(e) instead of the deprecated e.message attribute
            option_parser.error('There was a problem parsing the --taxa_fp: %s'%
                str(e))

        # make sure there are matching sample ids with the otu table
        if not set(sids_intersection) & set(otu_sample_ids):
            option_parser.error('The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error('Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')
Ejemplo n.º 6
0
def main():
    """Validate the command line options and parse the main input files.

    The options come from parse_command_line_parameters(**script_info).  The
    checks run in this order: number of axes and segments, custom axes and
    --compare_plots flag combinations, the metadata mapping file, the
    coordinates file (or every usable file when input_coords is a directory)
    and, when given, the taxa summary table.  Every problem found is reported
    through option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that at least 3 axes were requested
    if number_of_axes < 3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # a directory means jackknifing or coordinate comparison; the answer is
    # needed by several of the checks below so look it up only once
    input_is_dir = isdir(input_coords)

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if (custom_axes is not None and len(custom_axes.split(',')) > 1 and
            input_is_dir):
        option_parser.error(
            ('Jackknifed plots are limited to one custom axis, '
             'currently trying to use: %s. Make sure you use only one.' %
             custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if not input_is_dir and compare_plots:
        option_parser.error(
            'Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        with open(map_fp, 'U') as map_f:
            mapping_data, header, comments = parse_mapping_file(map_f)

        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
    # any regular failure means the file is unusable; KeyboardInterrupt and
    # SystemExit are deliberately not swallowed here
    except Exception:
        option_parser.error(
            ('The metadata mapping file \'%s\' does not seem '
             'to be formatted correctly, verify the formatting is QIIME '
             'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if input_is_dir:
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct = (
            [], [], [], [])

        # iterate only over the non-hidden files and not folders and if anything
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py suffixed in procrustes_results.txt
        coord_fps = [
            join(input_coords, f) for f in listdir(input_coords)
            if not f.startswith('.')
            and not isdir(join(abspath(input_coords), f))
            and not f.endswith('procrustes_results.txt')
        ]

        # this could happen and we rather avoid this problem
        if not coord_fps:
            option_parser.error('Could not use any of the files in the input '
                                'directory.')

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        transformed_refs = [
            f for f in coord_fps
            if f.endswith('_transformed_reference.txt')
        ]

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        # (`== False` is kept as is: `not compare_plots` would also match None)
        if master_pcoa and compare_plots == False:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            # NOTE(review): when the master is not one of the files in the
            # directory the list is left untouched -- confirm this is intended
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # no master was given but the folder contains a reference file
        elif master_pcoa is None and transformed_refs:
            master_pcoa = transformed_refs[0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                # the context manager closes the file even if parsing fails
                with open(fp, 'U') as coords_f:
                    (_coords_headers, _coords_data, _coords_eigenvalues,
                     _coords_pct) = parse_coords(coords_f)
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(
                ('The following file(s): \'%s\' could not be '
                 'parsed properly. Make sure the input folder only contains '
                 'coordinates files.') % ', '.join(offending_coords_fp))

        # every sample id seen in any of the files, computed only once
        all_sample_ids = set(sum(coords_headers, []))

        # check all files contain the same sample identifiers: collect the
        # sample ids that are in one of the sets of sample ids but not in the
        # globally shared ids
        non_shared_ids = set()
        for e in coords_headers:
            non_shared_ids.update(all_sample_ids ^ set(e))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(
                ('The following sample identifier(s): \'%s\' '
                 'are not shared between all the files. The files used to '
                 'make a jackknifed PCoA plot or coordinate comparison plot ('
                 'procrustes plot) must share all the same sample identifiers '
                 'between each other.') % ', '.join(non_shared_ids))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(all_sample_ids)

        # the first column of every row of the mapping file is the sample id
        mapping_sample_ids = set(row[0] for row in mapping_data)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(mapping_sample_ids & all_sample_ids)

        # sample ids that are not mapped but are in the coords
        sids_difference = list(all_sample_ids - mapping_sample_ids)

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            with open(input_coords, 'U') as coords_f:
                coords_headers, coords_data, coords_eigenvalues, coords_pct = (
                    parse_coords(coords_f))
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(
                ('The PCoA file \'%s\' does not seem to be a '
                 'coordinates formatted file, verify by manually inspecting '
                 'the contents.') % input_coords)

        # the first column of every row of the mapping file is the sample id
        mapping_sample_ids = set(row[0] for row in mapping_data)
        coords_sample_ids = set(coords_headers)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(mapping_sample_ids & coords_sample_ids)
        # sample ids that are not mapped but are in the coords
        sids_difference = list(coords_sample_ids - mapping_sample_ids)
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            with open(taxa_fp, 'U') as taxa_f:
                otu_sample_ids, lineages, otu_table, _ = parse_otu_table(
                    taxa_f, count_map_f=float, remove_empty_rows=True)
        except ValueError as e:
            # str(e) instead of the deprecated e.message attribute
            option_parser.error(
                'There was a problem parsing the --taxa_fp: %s' % str(e))

        # make sure there are matching sample ids with the otu table
        if not set(sids_intersection) & set(otu_sample_ids):
            option_parser.error(
                'The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error(
                'Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')