Esempio n. 1
0
def main():
    """Run Emperor's test suites and report a summary.

    Unless suppressed via command line flags, this runs: the Python unit
    tests discovered under this file's directory (or a user-supplied
    glob), the script usage tests, and the JavaScript unit tests driven
    by phantomjs.

    Returns 0 when every selected suite passed, 1 otherwise.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # unpack the command line options for convenience
    unittest_glob = opts.unittest_glob
    temp_filepath = opts.temp_filepath
    script_usage_tests = opts.script_usage_tests
    suppress_unit_tests = opts.suppress_unit_tests
    suppress_script_usage_tests = opts.suppress_script_usage_tests
    suppress_javascript_unit_tests = opts.suppress_javascript_unit_tests

    # since the test data is in the tests folder just add scripts_test_data
    emperor_test_data_dir = join(abspath(dirname(__file__)),
        'scripts_test_data/')

    # offer the option for the user to pass the scripts dir from the command
    # line since there is no other way to get the scripts dir. If not provided
    # the base structure of the repository will be assumed. Note that for both
    # cases we are using absolute paths, to avoid unwanted failures.
    if opts.emperor_scripts_dir is None:
        emperor_scripts_dir = abspath(join(get_emperor_project_dir(),
            'scripts/'))

        # let's try to guess cases for qiime-deploy type of installs
        if get_emperor_project_dir().endswith('/lib'):
            emperor_scripts_dir = abspath(join(get_emperor_project_dir()[:-3],
                'scripts/'))

    else:
        emperor_scripts_dir = abspath(opts.emperor_scripts_dir)

    # make a sanity check: there must be at least one suite left to run
    # NOTE(review): "suppresed" is a typo in the user-facing message
    if (suppress_unit_tests and suppress_script_usage_tests and
        suppress_javascript_unit_tests):
        option_parser.error("All tests have been suppresed. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    # NOTE(review): the pattern should ideally be a raw string (r'OK\s*$');
    # '\s' is not a recognized string escape so it happens to work as-is
    unittest_good_pattern = re.compile('OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of Emperor's unit tests, and keep track of any files which
    # fail unit tests, note that these are the unit tests only
    if not suppress_unit_tests:
        unittest_names = []
        if not unittest_glob:
            # no glob supplied: discover test_*.py files recursively
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root,name))
        else:
            # a glob was supplied: only keep matches named like test files
            for fp in glob(unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = qcli_system_call(command)
            print stderr
            # unittest's verbose runner writes its report to stderr; a
            # trailing 'OK' means the whole file passed
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    script_usage_failures = 0

    # choose to run some of the script usage tests or all the available ones
    if not suppress_script_usage_tests and exists(emperor_test_data_dir) and\
        exists(emperor_scripts_dir):
        # a comma-separated list selects a subset; None means run all of them
        if script_usage_tests != None:
            script_tests = script_usage_tests.split(',')
        else:
            script_tests = None

        initial_working_directory = getcwd()

        # Run the script usage testing functionality; note that depending on the
        # module where this was imported, the name of the arguments will change
        # that's the reason why I added the name of the arguments in here
        script_usage_result_summary, script_usage_failures = \
            run_script_usage_tests( emperor_test_data_dir,  # test_data_dir
                                    emperor_scripts_dir,    # scripts_dir
                                    temp_filepath,          # working_dir
                                    True,                   # verbose
                                    script_tests,           # tests
                                    None,                   # failure_log_fp
                                    False)                  # force_overwrite

        # running script usage tests breaks the current working directory
        chdir(initial_working_directory)

    if not suppress_javascript_unit_tests:
        # drive the JavaScript tests through phantomjs; a zero exit status
        # means every JavaScript test passed
        runner = join(test_dir, 'javascript_tests', 'runner.js')
        index = join(test_dir, 'javascript_tests', 'index.html')

        o, e, r = qcli_system_call('phantomjs %s %s' % (runner, index))

        if o:
            print o
        if e:
            print e

        # if all the tests passed
        javascript_tests_passed = True if r == 0 else False
    else:
        # a suppressed suite must not force a failing exit status
        javascript_tests_passed = True


    print "==============\nResult summary\n=============="

    if not suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print "\nFailed the following unit tests.\n%s" %'\n'.join(bad_tests)

        if missing_application_tests:
            print "\nFailed the following unit tests, in part or whole due "+\
                "to missing external applications.\nDepending on the Emperor "+\
                "features you plan to use, this may not be critical.\n%s"\
                % '\n'.join(missing_application_tests)

        if not(missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n"

    if not suppress_script_usage_tests:
        if exists(emperor_test_data_dir) and exists(emperor_scripts_dir):
            print "\nScript usage test result summary"+\
                "\n--------------------------------\n"
            print script_usage_result_summary
        else:
            print ("\nCould not run script usage tests.\nThe Emperor scripts "
                "directory could not be automatically located, try supplying "
                " it manually using the --emperor_scripts_dir option.")

    if not suppress_javascript_unit_tests:
        print ('\nJavaScript unit tests result summary\n'
               '------------------------------------\n')
        if javascript_tests_passed:
            print 'All JavaScript unit tests passed.\n'
        else:
            print 'JavaScript unit tests failed, check the summary above.'

    # In case there were no failures of any type, exit with a return code of 0
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and
        script_usage_failures == 0 and javascript_tests_passed):
        return_code = 0

    return return_code
Esempio n. 2
0
def main():
    """Parse and validate the inputs for an Emperor PCoA visualization.

    The visible portion of this entry point unpacks the command line
    options, checks the requested axes/segments ranges, parses the
    metadata mapping file, loads one coordinates file or a directory of
    them (jackknifed / coordinate-comparison plots), and validates the
    optional taxa summary table. Errors are reported through
    option_parser.error, which terminates the program.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments
    pct_variation_below_one = opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes<3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if number_of_segments<4 or number_of_segments>14:
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    # a "serial" comparison connects the coordinate sets in sequence;
    # passing/detecting a master file switches this off further below
    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes!=None and len(custom_axes.split(','))>1 and\
        isdir(input_coords):
        option_parser.error(('Jackknifed plots are limited to one custom axis, '
            'currently trying to use: %s. Make sure you use only one.' %
            custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if isdir(input_coords) == False and compare_plots:
        option_parser.error('Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp,'U'))

        # use this set variable to make presence/absensce checks faster
        lookup_header = set(header)
    # NOTE(review): bare except hides the real parse error; narrowing it to
    # the exceptions parse_mapping_file can raise would aid debugging
    except:
        option_parser.error(('The metadata mapping file \'%s\' does not seem '
            'to be formatted correctly, verify the formatting is QIIME '
            'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct=[],[],[],[]

        # iterate only over the non-hidden files and not folders and if anything
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py suffixed in procrustes_results.txt
        coord_fps = [join(input_coords, f) for f in listdir(input_coords) if
            not f.startswith('.') and not isdir(join(abspath(input_coords),f))
            and not f.endswith('procrustes_results.txt')]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and compare_plots == False:
            if master_pcoa in coord_fps: # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            # NOTE(review): the prepend/sort below only runs when the master
            # was already listed in coord_fps — confirm a master from outside
            # the input directory is meant to be silently left out here
            if master_pcoa in  coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        elif master_pcoa == None and len([f for f in coord_fps if f.endswith(
            '_transformed_reference.txt')]):
            master_pcoa = [f for f in coord_fps if f.endswith(
                '_transformed_reference.txt')][0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues,_coords_pct=\
                    parse_coords(open(fp,'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(('The following file(s): \'%s\' could not be '
                'parsed properly. Make sure the input folder only contains '
                'coordinates files.') % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globablly shared ids
        non_shared_ids = set(sum([list(set(sum(coords_headers, []))^set(e))
            for e in coords_headers],[]))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(('The following sample identifier(s): \'%s\''
                'are not shared between all the files. The files used to '
                'make a jackknifed PCoA plot or coordinate comparison plot ('
                'procustes plot) must share all the same sample identifiers'
                'between each other.')%', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of samples ids that are shared between coords and mapping files
        # (zip(*mapping_data)[0] is the first column of the mapping file,
        # i.e. the sample identifiers)
        sids_intersection=list(set(zip(*mapping_data)[0])&set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference=list(set(_coords_headers)-set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        # single coordinates file: parse it directly
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct =\
                parse_coords(open(input_coords,'U'))
        # this exception was noticed when there were letters in the coords file
        # other exeptions should be catched here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(('The PCoA file \'%s\' does not seem to be a '
                'coordinates formatted file, verify by manually inspecting '
                'the contents.') % input_coords)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(set(zip(*mapping_data)[0])&set(coords_headers))
        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(coords_headers)-set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(open(
                taxa_fp, 'U'), count_map_f=float, remove_empty_rows=True)
        except ValueError, e:
            option_parser.error('There was a problem parsing the --taxa_fp: %s'%
                e.message)

        # make sure there are matching sample ids with the otu table
        if not len(list(set(sids_intersection)&set(otu_sample_ids))):
            option_parser.error('The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error('Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')
Esempio n. 3
0
def main():
    """Run Emperor's unit tests and script usage tests, then summarize.

    This variant has no JavaScript test suite. Unless suppressed via
    command line flags it runs the Python unit tests discovered under
    this file's directory (or a user-supplied glob) and the script usage
    tests.

    Returns 0 when every selected suite passed, 1 otherwise.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # unpack the command line options for convenience
    unittest_glob = opts.unittest_glob
    temp_filepath = opts.temp_filepath
    script_usage_tests = opts.script_usage_tests
    suppress_unit_tests = opts.suppress_unit_tests
    suppress_script_usage_tests = opts.suppress_script_usage_tests

    # since the test data is in the tests folder just add scripts_test_data
    emperor_test_data_dir = join(abspath(dirname(__file__)),
                                 'scripts_test_data/')

    # offer the option for the user to pass the scripts dir from the command
    # line since there is no other way to get the scripts dir. If not provided
    # the base structure of the repository will be assumed. Note that for both
    # cases we are using absolute paths, to avoid unwanted failures.
    if opts.emperor_scripts_dir is None:
        emperor_scripts_dir = abspath(
            join(get_emperor_project_dir(), 'scripts/'))

        # let's try to guess cases for qiime-deploy type of installs
        if get_emperor_project_dir().endswith('/lib'):
            emperor_scripts_dir = abspath(
                join(get_emperor_project_dir()[:-3], 'scripts/'))

    else:
        emperor_scripts_dir = abspath(opts.emperor_scripts_dir)

    # make a sanity check: there must be at least one suite left to run
    # NOTE(review): "suppresed" is a typo in the user-facing message
    if (suppress_unit_tests and suppress_script_usage_tests):
        option_parser.error("All tests have been suppresed. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    # NOTE(review): the pattern should ideally be a raw string (r'OK\s*$');
    # '\s' is not a recognized string escape so it happens to work as-is
    unittest_good_pattern = re.compile('OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of Emperor's unit tests, and keep track of any files which
    # fail unit tests, note that these are the unit tests only
    if not suppress_unit_tests:
        unittest_names = []
        if not unittest_glob:
            # no glob supplied: discover test_*.py files recursively
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root, name))
        else:
            # a glob was supplied: only keep matches named like test files
            for fp in glob(unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = qcli_system_call(command)
            print stderr
            # unittest's verbose runner writes its report to stderr; a
            # trailing 'OK' means the whole file passed
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    script_usage_failures = 0

    # choose to run some of the script usage tests or all the available ones
    if not suppress_script_usage_tests and exists(emperor_test_data_dir) and\
        exists(emperor_scripts_dir):
        # a comma-separated list selects a subset; None means run all of them
        if script_usage_tests != None:
            script_tests = script_usage_tests.split(',')
        else:
            script_tests = None
        # Run the script usage testing functionality; note that depending on the
        # module where this was imported, the name of the arguments will change
        # that's the reason why I added the name of the arguments in here
        # NOTE(review): run_script_usage_tests changes the current working
        # directory and nothing restores it here — confirm nothing after
        # this point relies on the original cwd
        script_usage_result_summary, script_usage_failures = \
            run_script_usage_tests( emperor_test_data_dir,  # test_data_dir
                                    emperor_scripts_dir,    # scripts_dir
                                    temp_filepath,          # working_dir
                                    True,                   # verbose
                                    script_tests,           # tests
                                    None,                   # failure_log_fp
                                    False)                  # force_overwrite

    print "==============\nResult summary\n=============="

    if not suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print "\nFailed the following unit tests.\n%s" % '\n'.join(
                bad_tests)

        if missing_application_tests:
            print "\nFailed the following unit tests, in part or whole due "+\
                "to missing external applications.\nDepending on the Emperor "+\
                "features you plan to use, this may not be critical.\n%s"\
                % '\n'.join(missing_application_tests)

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n\n"

    if not suppress_script_usage_tests:
        if exists(emperor_test_data_dir) and exists(emperor_scripts_dir):
            print "\nScript usage test result summary"+\
                "\n------------------------------------\n"
            print script_usage_result_summary
        else:
            print(
                "\nCould not run script usage tests.\nThe Emperor scripts "
                "directory could not be automatically located, try supplying "
                " it manually using the --emperor_scripts_dir option.")

    # In case there were no failures of any type, exit with a return code of 0
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0
            and script_usage_failures == 0):
        return_code = 0

    return return_code
Esempio n. 4
0
def main():
    """Run ili's test suites and report a summary.

    Unless suppressed via command line flags, this runs: the Python unit
    tests discovered under this file's directory (or a user-supplied
    glob), the script usage tests, and the JavaScript unit tests driven
    by phantomjs.

    Returns 0 when every selected suite passed, 1 otherwise.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # unpack the command line options for convenience
    unittest_glob = opts.unittest_glob
    temp_filepath = opts.temp_filepath
    script_usage_tests = opts.script_usage_tests
    suppress_unit_tests = opts.suppress_unit_tests
    suppress_script_usage_tests = opts.suppress_script_usage_tests
    suppress_javascript_unit_tests = opts.suppress_javascript_unit_tests

    # since the test data is in the tests folder just add scripts_test_data
    ili_test_data_dir = join(abspath(dirname(__file__)), 'scripts_test_data/')

    # offer the option for the user to pass the scripts dir from the command
    # line since there is no other way to get the scripts dir. If not provided
    # the base structure of the repository will be assumed. Note that for both
    # cases we are using absolute paths, to avoid unwanted failures.
    if opts.ili_scripts_dir is None:
        ili_scripts_dir = abspath(join(get_ili_project_dir(), 'scripts/'))
    else:
        ili_scripts_dir = abspath(opts.ili_scripts_dir)

    # make a sanity check: there must be at least one suite left to run
    # NOTE(review): "suppresed" is a typo in the user-facing message
    if (suppress_unit_tests and suppress_script_usage_tests
            and suppress_javascript_unit_tests):
        option_parser.error("All tests have been suppresed. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    # NOTE(review): the pattern should ideally be a raw string (r'OK\s*$');
    # '\s' is not a recognized string escape so it happens to work as-is
    unittest_good_pattern = re.compile('OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of ili's unit tests, and keep track of any files
    # which fail unit tests, note that these are the unit tests only
    if not suppress_unit_tests:
        unittest_names = []
        if not unittest_glob:
            # no glob supplied: discover test_*.py files recursively
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root, name))
        else:
            # a glob was supplied: only keep matches named like test files
            for fp in glob(unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = qcli_system_call(command)
            print stderr
            # unittest's verbose runner writes its report to stderr; a
            # trailing 'OK' means the whole file passed
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    script_usage_failures = 0

    # choose to run some of the script usage tests or all the available ones
    if (not suppress_script_usage_tests and exists(ili_test_data_dir)
            and exists(ili_scripts_dir)):
        # a comma-separated list selects a subset; None means run all of them
        if script_usage_tests is not None:
            script_tests = script_usage_tests.split(',')
        else:
            script_tests = None

        initial_working_directory = getcwd()

        # Run the script usage testing functionality; note that depending on
        # the module where this was imported, the name of the arguments will
        # change that's the reason why I added the name of the arguments in
        # here
        script_usage_result_summary, script_usage_failures = \
            run_script_usage_tests(ili_test_data_dir,      # test_data_dir
                                   ili_scripts_dir,        # scripts_dir
                                   temp_filepath,          # working_dir
                                   True,                   # verbose
                                   script_tests,           # tests
                                   None,                   # failure_log_fp
                                   False)                  # force_overwrite

        # running script usage tests breaks the current working directory
        chdir(initial_working_directory)

    if not suppress_javascript_unit_tests:
        # drive the JavaScript tests through phantomjs; a zero exit status
        # means every JavaScript test passed
        runner = join(test_dir, 'javascript_tests', 'runner.js')
        index = join(test_dir, 'javascript_tests', 'index.html')

        o, e, r = qcli_system_call('phantomjs %s %s' % (runner, index))

        if o:
            print o
        if e:
            print e

        # if all the tests passed
        javascript_tests_passed = True if r == 0 else False
    else:
        # a suppressed suite must not force a failing exit status
        javascript_tests_passed = True

    print "==============\nResult summary\n=============="

    if not suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print("\nFailed the following unit tests.\n%s" %
                  '\n'.join(bad_tests))

        if missing_application_tests:
            print(
                "\nFailed the following unit tests, in part or whole due "
                "to missing external applications.\nDepending on the "
                "ili features you plan to use, this may not be "
                "critical.\n%s" % '\n'.join(missing_application_tests))

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n"

    if not suppress_script_usage_tests:
        if exists(ili_test_data_dir) and exists(ili_scripts_dir):
            print(
                "\nScript usage test result summary"
                "\n--------------------------------\n")
            print script_usage_result_summary
        else:
            print(
                "\nCould not run script usage tests.\nThe ili scripts "
                "directory could not be automatically located, try "
                "supplying it manually using the --ili_scripts_dir "
                "option.")

    if not suppress_javascript_unit_tests:
        print(
            '\nJavaScript unit tests result summary\n'
            '------------------------------------\n')
        if javascript_tests_passed:
            print 'All JavaScript unit tests passed.\n'
        else:
            print 'JavaScript unit tests failed, check the summary above.'

    # In case there were no failures of any type, exit with a return code of 0
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0
            and script_usage_failures == 0 and javascript_tests_passed):
        return_code = 0

    return return_code
Esempio n. 5
0
def main():
    """Validate and collect the inputs needed to build an Emperor plot.

    Parses the command line options, then verifies that the metadata
    mapping file, the coordinates file (or directory of coordinates
    files, for jackknifed/comparison plots) and the optional taxa
    summary are consistent with each other.  Any invalid input aborts
    via ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments
    pct_variation_below_one = opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        option_parser.error('You need to plot at least 3 axes.')

    # verifying that the number of segments is between the desired range
    if not (4 <= number_of_segments <= 14):
        option_parser.error('number_of_segments should be between 4 and 14.')

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and isdir(input_coords):
        if custom_axes.count(',') > 0:
            option_parser.error('Jackknifed plots are limited to one custom '
                                'axis, currently trying to use: %s. Make '
                                'sure you use only one.' % custom_axes)

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error("Cannot use the '--compare_plots' flag unless the "
                            "input path is a directory.")

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))

    # catch Exception (not a bare 'except:') so KeyboardInterrupt/SystemExit
    # are not swallowed while any parsing failure still yields the
    # user-facing error message below
    except Exception:
        option_parser.error(("The metadata mapping file '%s' does not seem "
                             "to be formatted correctly, verify the "
                             "formatting is QIIME compliant by using "
                             "validate_mapping_file.py") % map_fp)
    else:
        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
        mapping_ids = {row[0] for row in mapping_data}

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers = []
        coords_data = []
        coords_eigenvalues = []
        coords_pct = []

        coord_fps = guess_coordinates_files(input_coords)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        trans_suf = '_transformed_reference.txt'
        transformed = [f for f in coord_fps if f.endswith(trans_suf)]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                sorted_filenames = sort_comparison_filenames(coord_fps)
                coord_fps = [master_pcoa] + sorted_filenames

        elif master_pcoa is None and len(transformed):
            master_pcoa = transformed[0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                parsed = parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue
            else:
                # pack all the data correspondingly only if it was correctly
                # parsed
                coords_headers.append(parsed[0])
                coords_data.append(parsed[1])
                coords_eigenvalues.append(parsed[2])
                coords_pct.append(parsed[3])

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            errout = ', '.join(offending_coords_fp)
            option_parser.error(("The following file(s): '%s' could not be "
                                 "parsed properly. Make sure the input folder "
                                 "only contains coordinates files.") % errout)

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globally shared ids
        _coords_headers = set(flatten(coords_headers))
        _per_file_missing = [_coords_headers - set(e) for e in coords_headers]
        non_shared_ids = set(flatten(_per_file_missing))
        if non_shared_ids:
            errout = ', '.join(non_shared_ids)
            option_parser.error(("The following sample identifier(s): '%s' "
                                 "are not shared between all the files. The "
                                 "files used to make a jackknifed PCoA plot "
                                 "or coordinate comparison plot (procustes "
                                 "plot) must share all the same sample "
                                 "identifiers between each other.") % errout)

        # number of samples ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(_coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = _coords_headers.difference(mapping_ids)

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            parsed = parse_coords(open(input_coords, 'U'))
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(
                ("The PCoA file '%s' does not seem to be a "
                 "coordinates formatted file, verify by "
                 "manually inspecting the contents.") % input_coords)
        else:
            coords_headers = parsed[0]
            coords_data = parsed[1]
            coords_eigenvalues = parsed[2]
            coords_pct = parsed[3]

        # number of samples ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = set(coords_headers).difference(mapping_ids)

        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # This should really use BIOM's Table.from_tsv
            # for summarized tables the "otu_ids" are really the "lineages"
            parsed = parse_otu_table(open(taxa_fp, 'U'),
                                     count_map_f=float,
                                     remove_empty_rows=True)
        # 'as' syntax (PEP 3110) instead of the legacy 'except ValueError, e';
        # format the exception itself instead of the deprecated e.message
        except ValueError as e:
            option_parser.error("There was a problem parsing the --taxa_fp: "
                                "%s" % e)
        else:
            otu_sample_ids = parsed[0]
            lineages = parsed[1]
            otu_table = parsed[2]

        # make sure there are matching sample ids with the otu table
        # NOTE(review): issuperset requires *every* id in the OTU table to be
        # present in the intersection, yet the message below claims only
        # "at least one match" is needed -- confirm the intended semantics
        if not sids_intersection.issuperset(otu_sample_ids):
            option_parser.error("The sample identifiers in the OTU table must "
                                "have at least one match with the data in the "
                                "mapping file and with the coordinates file. "
                                "Verify you are using input files that belong "
                                "to the same dataset.")
        if len(lineages) <= 1:
            option_parser.error("Contingency tables with one or fewer rows "
                                "are not supported, please try passing a "
                                "contingency table with more than one row.")
Esempio n. 6
0
def main():
    """Validate and collect the inputs needed to build an Emperor plot.

    Parses the command line options, then verifies that the metadata
    mapping file, the coordinates file (or directory of coordinates
    files, for jackknifed/comparison plots) and the optional taxa
    summary are consistent with each other.  Any invalid input aborts
    via ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments
    pct_variation_below_one = opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        option_parser.error('You need to plot at least 3 axes.')

    # verifying that the number of segments is between the desired range
    if not (4 <= number_of_segments <= 14):
        option_parser.error('number_of_segments should be between 4 and 14.')

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and isdir(input_coords):
        if custom_axes.count(',') > 0:
            option_parser.error('Jackknifed plots are limited to one custom '
                                'axis, currently trying to use: %s. Make '
                                'sure you use only one.' % custom_axes)

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error("Cannot use the '--compare_plots' flag unless the "
                            "input path is a directory.")

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))

    # catch Exception (not a bare 'except:') so KeyboardInterrupt/SystemExit
    # are not swallowed while any parsing failure still yields the
    # user-facing error message below
    except Exception:
        option_parser.error(("The metadata mapping file '%s' does not seem "
                             "to be formatted correctly, verify the "
                             "formatting is QIIME compliant by using "
                             "validate_mapping_file.py") % map_fp)
    else:
        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
        mapping_ids = {row[0] for row in mapping_data}

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers = []
        coords_data = []
        coords_eigenvalues = []
        coords_pct = []

        coord_fps = guess_coordinates_files(input_coords)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        trans_suf = '_transformed_reference.txt'
        transformed = [f for f in coord_fps if f.endswith(trans_suf)]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                sorted_filenames = sort_comparison_filenames(coord_fps)
                coord_fps = [master_pcoa] + sorted_filenames

        elif master_pcoa is None and len(transformed):
            master_pcoa = transformed[0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                parsed = parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue
            else:
                # pack all the data correspondingly only if it was correctly
                # parsed
                coords_headers.append(parsed[0])
                coords_data.append(parsed[1])
                coords_eigenvalues.append(parsed[2])
                coords_pct.append(parsed[3])

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            errout = ', '.join(offending_coords_fp)
            option_parser.error(("The following file(s): '%s' could not be "
                                 "parsed properly. Make sure the input folder "
                                 "only contains coordinates files.") % errout)

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globally shared ids
        _coords_headers = set(flatten(coords_headers))
        _per_file_missing = [_coords_headers - set(e) for e in coords_headers]
        non_shared_ids = set(flatten(_per_file_missing))
        if non_shared_ids:
            errout = ', '.join(non_shared_ids)
            option_parser.error(("The following sample identifier(s): '%s' "
                                 "are not shared between all the files. The "
                                 "files used to make a jackknifed PCoA plot "
                                 "or coordinate comparison plot (procustes "
                                 "plot) must share all the same sample "
                                 "identifiers between each other.") % errout)

        # number of samples ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(_coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = _coords_headers.difference(mapping_ids)

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            parsed = parse_coords(open(input_coords, 'U'))
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(("The PCoA file '%s' does not seem to be a "
                                 "coordinates formatted file, verify by "
                                 "manually inspecting the contents.") %
                                input_coords)
        else:
            coords_headers = parsed[0]
            coords_data = parsed[1]
            coords_eigenvalues = parsed[2]
            coords_pct = parsed[3]

        # number of samples ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = set(coords_headers).difference(mapping_ids)

        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # This should really use BIOM's Table.from_tsv
            # for summarized tables the "otu_ids" are really the "lineages"
            parsed = parse_otu_table(open(taxa_fp, 'U'), count_map_f=float,
                                     remove_empty_rows=True)
        # 'as' syntax (PEP 3110) instead of the legacy 'except ValueError, e';
        # format the exception itself instead of the deprecated e.message
        except ValueError as e:
            option_parser.error("There was a problem parsing the --taxa_fp: "
                                "%s" % e)
        else:
            otu_sample_ids = parsed[0]
            lineages = parsed[1]
            otu_table = parsed[2]

        # make sure there are matching sample ids with the otu table
        # NOTE(review): issuperset requires *every* id in the OTU table to be
        # present in the intersection, yet the message below claims only
        # "at least one match" is needed -- confirm the intended semantics
        if not sids_intersection.issuperset(otu_sample_ids):
            option_parser.error("The sample identifiers in the OTU table must "
                                "have at least one match with the data in the "
                                "mapping file and with the coordinates file. "
                                "Verify you are using input files that belong "
                                "to the same dataset.")
        if len(lineages) <= 1:
            option_parser.error("Contingency tables with one or fewer rows "
                                "are not supported, please try passing a "
                                "contingency table with more than one row.")
Esempio n. 7
0
def main():
    """Validate the inputs for an Emperor plot (older-style variant).

    Parses the command line options, then checks the metadata mapping
    file, the coordinates file (or directory of coordinates files) and
    the optional taxa summary for mutual consistency, aborting via
    ``option_parser.error`` on any invalid input.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes!=None and len(custom_axes.split(','))>1 and\
        isdir(input_coords):
        option_parser.error(
            ('Jackknifed plots are limited to one custom axis, '
             'currently trying to use: %s. Make sure you use only one.' %
             custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if isdir(input_coords) == False and compare_plots:
        option_parser.error(
            'Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))

        # use this set variable to make presence/absensce checks faster
        lookup_header = set(header)
    # NOTE(review): bare 'except' also catches KeyboardInterrupt/SystemExit;
    # consider narrowing to 'except Exception'
    except:
        option_parser.error(
            ('The metadata mapping file \'%s\' does not seem '
             'to be formatted correctly, verify the formatting is QIIME '
             'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct=[],[],[],[]

        # iterate only over the non-hidden files and not folders and if anything
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py suffixed in procrustes_results.txt
        coord_fps = [
            join(input_coords, f) for f in listdir(input_coords) if
            not f.startswith('.') and not isdir(join(abspath(input_coords), f))
            and not f.endswith('procrustes_results.txt')
        ]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and compare_plots == False:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa
                             ] + sort_comparison_filenames(coord_fps)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        elif master_pcoa == None and len(
            [f
             for f in coord_fps if f.endswith('_transformed_reference.txt')]):
            master_pcoa = [
                f for f in coord_fps
                if f.endswith('_transformed_reference.txt')
            ][0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues,_coords_pct=\
                    parse_coords(open(fp,'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(
                ('The following file(s): \'%s\' could not be '
                 'parsed properly. Make sure the input folder only contains '
                 'coordinates files.') % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globablly shared ids
        # (the '^' symmetric difference against the union of all headers
        # collects ids missing from at least one file)
        non_shared_ids = set(
            sum([
                list(set(sum(coords_headers, [])) ^ set(e))
                for e in coords_headers
            ], []))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(
                ('The following sample identifier(s): \'%s\''
                 'are not shared between all the files. The files used to '
                 'make a jackknifed PCoA plot or coordinate comparison plot ('
                 'procustes plot) must share all the same sample identifiers'
                 'between each other.') % ', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of samples ids that are shared between coords and mapping files
        # NOTE(review): zip(*mapping_data)[0] (first column, i.e. sample ids)
        # is Python-2-only; under Python 3 zip() is an iterator and cannot be
        # subscripted
        sids_intersection = list(
            set(zip(*mapping_data)[0]) & set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(
            set(_coords_headers) - set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct =\
                parse_coords(open(input_coords,'U'))
        # this exception was noticed when there were letters in the coords file
        # other exeptions should be catched here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(
                ('The PCoA file \'%s\' does not seem to be a '
                 'coordinates formatted file, verify by manually inspecting '
                 'the contents.') % input_coords)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(
            set(zip(*mapping_data)[0]) & set(coords_headers))
        # sample ids that are not mapped but are in the coords
        sids_difference = list(
            set(coords_headers) - set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(
                open(taxa_fp, 'U'), count_map_f=float, remove_empty_rows=True)
        # NOTE(review): legacy Python-2 'except E, e' syntax; e.message is
        # deprecated since Python 2.6 (PEP 352)
        except ValueError, e:
            option_parser.error(
                'There was a problem parsing the --taxa_fp: %s' % e.message)

        # make sure there are matching sample ids with the otu table
        # NOTE(review): unlike the issuperset variants of this script, this
        # check only requires a non-empty intersection with the OTU table ids
        if not len(list(set(sids_intersection) & set(otu_sample_ids))):
            option_parser.error(
                'The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error(
                'Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')