Example 1
class ServeHTMLInterface(Command):
    BriefDescription = "Start the HTMLInterface server"
    LongDescription = ("Start the HTMLInterface server and load the provided "
                       "interface_module and port")
    CommandIns = ParameterCollection([
        CommandIn(Name='port',
                  DataType=int,
                  Description='The port to run the server on',
                  Required=False,
                  Default=8080),
        CommandIn(Name='interface_module',
                  DataType=str,
                  Description='The module to serve the interface for',
                  Required=True)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='result',
                   DataType=str,
                   Description='Signals the termination of the HTMLInterface '
                   'server')
    ])

    def run(self, **kwargs):
        """Start the HTMLInterface server with the port and interface_module"""
        fin = start_server(kwargs['port'], kwargs['interface_module'])

        return {'result': fin}
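
A minimal invocation sketch (assuming, as in pyqi, that Command instances are callable and `__call__` validates the declared CommandIns before dispatching to `run`; the module name is a placeholder):

cmd = ServeHTMLInterface()
# 'my_project.interfaces' is a hypothetical interface module.
result = cmd(port=8080, interface_module='my_project.interfaces')
print(result['result'])
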
Example 2
class BenchResultsProcesser(Command):
    """Subclassing the pyqi.core.command.Command class"""
    BriefDescription = "Processes the benchmark suite results"
    LongDescription = ("Takes the benchmark suite output directory and "
                       "processes the benchmark measurements, creating plots "
                       "and collapsing results in a usable form.")
    CommandIns = ParameterCollection([
        CommandIn(Name='bench_results',
                  DataType=list,
                  Description='List with the benchmark results',
                  Required=True),
        CommandIn(Name='job_ids',
                  DataType=list,
                  Description='List of job ids to wait for if running in a '
'PBS cluster',
                  Required=False)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name="bench_data",
                   DataType=CompData,
                   Description="Dictionary with the benchmark results"),
    ])

    def run(self, **kwargs):
        bench_results = kwargs['bench_results']
        job_ids = kwargs['job_ids']

        if job_ids:
            wait_on(job_ids)

        data = process_benchmark_results(bench_results)

        return {'bench_data': data}
Example 3
class BashCompletion(Command):
    BriefDescription = "Construct a bash completion script"
    LongDescription = (
        "Construct a bash tab completion script that will search"
        " through available commands and options")

    CommandIns = ParameterCollection([
        CommandIn(Name='command_config_module',
                  DataType=str,
                  Description="CLI command configuration module",
                  Required=True),
        CommandIn(Name='driver_name',
                  DataType=str,
                  Description="name of the driver script",
                  Required=True)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='result',
                   DataType=str,
                   Description="result bash completion script")
    ])

    def run(self, **kwargs):
        driver = kwargs['driver_name']
        cfg_mod_path = kwargs['command_config_module']
        cfg_mod = _get_cfg_module(cfg_mod_path)
        command_names = get_command_names(cfg_mod_path)
        command_list = ' '.join(command_names)

        commands = []
        for cmd in command_names:
            cmd_cfg, _ = get_command_config(cfg_mod_path,
                                            cmd,
                                            exit_on_failure=False)

            if cmd_cfg is not None:
                command_options = []
                command_options.extend(
                    sorted(['--%s' % p.Name for p in cmd_cfg.inputs]))
                opts = ' '.join(command_options)

                commands.append(command_fmt % {
                    'command': cmd,
                    'options': opts
                })

        all_commands = ''.join(commands)
        return {
            'result': script_fmt % {
                'driver': driver,
                'commands': all_commands,
                'command_list': command_list
            }
        }
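
The `command_fmt` and `script_fmt` templates are defined elsewhere in pyqi and are not part of this excerpt; the following is a hypothetical sketch of shapes that would satisfy the format keys used above (the exact bash structure is an assumption, not pyqi's actual templates):

# Hypothetical templates; the real ones live alongside this command in pyqi.
command_fmt = """
        %(command)s)
            COMPREPLY=( $(compgen -W "%(options)s" -- $cur) )
            ;;"""

script_fmt = """_%(driver)s_complete() {
    local cur=${COMP_WORDS[COMP_CWORD]}
    local cmd=${COMP_WORDS[1]}
    case "$cmd" in%(commands)s
        *)
            COMPREPLY=( $(compgen -W "%(command_list)s" -- $cur) )
            ;;
    esac
}
complete -F _%(driver)s_complete %(driver)s"""
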
Example 4
class BenchResultsProcesser(Command):
    BriefDescription = "Processes the benchmark suite results"
    LongDescription = "Takes the benchmark suite output directory and " +\
        "processes the benchmark measurements, creating plots and collapsing" +\
        " results in a usable form."
    CommandIns = ParameterCollection([
        CommandIn(Name='input_dir',
                  DataType=str,
                  Description='Path to the directory with the time results',
                  Required=True),
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name="bench_data",
                   DataType=dict,
                   Description="Dictionary with the benchmark results"),
        CommandOut(Name="time_fig",
                   DataType=Figure,
                   Description="Figure with the execution time results"),
        CommandOut(
            Name="time_str",
            DataType=str,
            Description="String with the best polynomial fit to the benchmark "
            "execution time results"),
        CommandOut(Name="mem_fig",
                   DataType=Figure,
                   Description="Figure with the memory consumption results"),
        CommandOut(
            Name="mem_str",
            DataType=str,
            Description="String with the best polynomial fit to the benchmark "
            "memory consumption results")
    ])

    def run(self, **kwargs):
        result = {}

        input_dir = kwargs['input_dir']

        data, time_fig, time_str, mem_fig, mem_str = \
            process_benchmark_results(input_dir)

        result['bench_data'] = data
        result['time_fig'] = time_fig
        result['time_str'] = time_str
        result['mem_fig'] = mem_fig
        result['mem_str'] = mem_str

        return result
Example 5
class ReadGooderCommand(Command):
    BriefDescription = "Read gooder"
    LongDescription = "Put all the words in a box"
    CommandIns = ParameterCollection([
        CommandIn(Name='text', DataType=str,
                  Description='The input text', Required=True),
        CommandIn(Name='tickmark_in_box_index', DataType=int,
                  Description='The tick position of the box',
                  Required=False, Default=8),
        CommandIn(Name='buffer_size', DataType=int,
                  Description='Ring buffer size', Default=100),
        CommandIn(Name='box_width', DataType=int,
                  Description='Length of the box', Default=20)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name="orp_word", DataType=GeneratorType,
                   Description="Resulting formatted words"),
    ])

    def _boxify_callback(self, box_width, tick_pos, wk):
        """Put a box on it"""
        # this should be pushed to the output handler
        dash = u'\u2500'
        downtick = u'\u252c'
        uptick = u'\u2534'

        left = dash * tick_pos
        right = dash * (box_width - tick_pos - 1)

        up = left + downtick + right
        down = left + uptick + right

        return "%s\n%s\n%s" % (up, wk.state['formatted'], down)

    def run(self, **kwargs):
        text = kwargs.pop('text')
        box_width = kwargs.pop('box_width')
        tickpos = kwargs['tickmark_in_box_index']
        ring_size = kwargs.pop('buffer_size')

        state.ringbuffer = RingBuffer(ring_size)
        success_f = partial(self._boxify_callback, box_width, tickpos)

        wf = ReadGooder({}, options=kwargs)
        iter_ = wf(text, success_callback=success_f)

        return {'orp_word': iter_}
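
For illustration, the box drawn by `_boxify_callback` looks roughly like this (a standalone sketch; a plain string stands in for `wk.state['formatted']`):

# Standalone rendering of the box, without the workflow machinery.
box_width, tick_pos, word = 20, 8, u'      reading'
dash, downtick, uptick = u'\u2500', u'\u252c', u'\u2534'
left = dash * tick_pos
right = dash * (box_width - tick_pos - 1)
print(u'%s\n%s\n%s' % (left + downtick + right, word, left + uptick + right))
# prints:
# ────────┬───────────
#       reading
# ────────┴───────────
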
Example 6
    def test_init(self):
        """Jog the init"""
        params = [
            CommandIn('a', str, 'help', Required=False),
            CommandIn('b', float, 'help2', Required=True)
        ]
        obj = ParameterCollection(params)

        self.assertEqual(obj.Parameters, params)
        self.assertEqual(obj['a'], params[0])
        self.assertEqual(obj['b'], params[1])

        # Duplicate Parameter names.
        params.append(CommandIn('a', int, 'help3'))
        with self.assertRaises(IncompetentDeveloperError):
            _ = ParameterCollection(params)
Example 7
        class foo(Command):
            Parameters = ParameterCollection([
                CommandIn('a', str, 'help1', Required=True),
                CommandIn('b', str, 'help2', Required=False)
            ])

            def run(self, **kwargs):
                return {}
Example 8
class MakeCommand(CodeHeaderGenerator):
    BriefDescription = "Construct a stubbed out Command object"
    LongDescription = ("This command is intended to construct the basics of a "
        "Command object so that a developer can dive straight into the "
        "implementation of the command")

    CommandIns = ParameterCollection(
        CodeHeaderGenerator.CommandIns.Parameters + [
            CommandIn(Name='name', DataType=str,
                      Description='the name of the Command', Required=True),
            CommandIn(Name='test_code', DataType=bool,
                      Description='create stubbed out unit test code',
                      Required=False, Default=False)
        ]
    )
    CommandOuts = ParameterCollection([
        CommandOut(Name='result', DataType=list,
                   Description='The resulting template')
    ])

    def run(self, **kwargs):
        code_header_lines = super(MakeCommand, self).run(
                author=kwargs['author'], email=kwargs['email'],
                license=kwargs['license'], copyright=kwargs['copyright'],
                version=kwargs['version'], credits=kwargs['credits'])['result']

        result_lines = code_header_lines

        if kwargs['test_code']:
            result_lines.extend(
                    (test_format % {'name': kwargs['name']}).split('\n'))
        else:
            result_lines.extend(command_imports.split('\n'))
            result_lines.append('')
            result_lines.extend((command_format % (
                    kwargs['name'], kwargs['name'])).split('\n'))

        return {'result': result_lines}
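
MakeCommand layers its own inputs on top of CodeHeaderGenerator's, so a call must also supply the header fields; a sketch with illustrative values (again assuming pyqi's callable-Command convention):

cmd = MakeCommand()
res = cmd(name='Greeter', test_code=False, author='Jane Doe',
          email='jane@example.com', license='BSD', copyright=None,
          version='0.1', credits=None)
print('\n'.join(res['result']))
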
Example 9
class BenchResultsComparator(Command):
    """Subclassing the pyqi.core.command.Command class"""
    BriefDescription = "Compare different run results of the same bench suite"
    LongDescription = ("Takes the benchmark results of different runs of the "
                       "same benchmark suite and generates a plot with the "
                       "wall time and a plot with the memory consumption of "
                       "the different runs, allowing performance comparison "
                       "between them.")
    CommandIns = ParameterCollection([
        CommandIn(Name='bench_results', DataType=list,
                  Description='List with the benchmark results of the '
                              'different runs of the same benchmark suite',
                  Required=True),
        CommandIn(Name='labels', DataType=list,
                  Description='List of strings to label each data series of '
                              'the plot',
                  Required=True)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name="comp_data", DataType=dict,
                   Description="")
    ])

    def run(self, **kwargs):
        bench_results = list(kwargs['bench_results'])
        labels = kwargs['labels']

        if len(bench_results) < 2:
            raise CommandError("You should provide at least two directories "
                               "with the benchmark results")
        if len(bench_results) != len(labels):
            raise CommandError("The number of results and the number of labels"
                               " should match: %s != %s" % (len(bench_results),
                                                            len(labels)))

        data = compare_benchmark_results(bench_results, labels)

        return {'comp_data': data}
Example 10
        class stubby(Command):
            CommandIns = ParameterCollection([
                CommandIn('a', int, '', Required=True),
                CommandIn('b', int, '', Required=False, Default=5),
                CommandIn('c',
                          int,
                          '',
                          Required=False,
                          Default=10,
                          ValidateValue=lambda x: x == 10)
            ])

            def run(self, **kwargs):
                return {}
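
A hedged sketch of how `ValidateValue` behaves at call time (assuming, as in pyqi, that `Command.__call__` runs each CommandIn's validator and raises on failure):

cmd = stubby()
cmd(a=1)            # ok: b and c fall back to their defaults (5 and 10)
cmd(a=1, c=10)      # ok: the validator lambda x: x == 10 passes
try:
    cmd(a=1, c=99)  # the validator rejects anything but 10
except Exception as e:  # pyqi raises its own validation error type here
    print(e)
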
Example 11
    def setUp(self):
        self.pc = ParameterCollection([CommandIn('foo', str, 'help')])
Example 12
class ghetto(Command):
    CommandIns = ParameterCollection([CommandIn('c', str, 'b')])
    CommandOuts = ParameterCollection([CommandOut('itsaresult', str, 'x')])

    def run(self, **kwargs):
        return {'itsaresult': 10}
Example 13
class MakeOptparse(CodeHeaderGenerator):
    BriefDescription = "Consume a Command, stub out an optparse configuration"
    LongDescription = """Construct and stub out the basic optparse configuration for a given Command. This template provides comments and examples of what to fill in."""

    CommandIns = ParameterCollection(
        CodeHeaderGenerator.CommandIns.Parameters + [
        CommandIn(Name='command', DataType=Command,
                  Description='an existing Command', Required=True),
        CommandIn(Name='command_module', DataType=str,
                  Description='the Command source module', Required=True)
        ]
    )

    CommandOuts = ParameterCollection([
        CommandOut(Name='result', DataType=list,
                   Description='The resulting template configuration')
    ])

    def run(self, **kwargs):
        code_header_lines = super(MakeOptparse, self).run(
                author=kwargs['author'], email=kwargs['email'],
                license=kwargs['license'], copyright=kwargs['copyright'],
                version=kwargs['version'], credits=kwargs['credits'])['result']

        result_lines = code_header_lines

        # construct inputs based off of CommandIns
        cmdin_formatted = []
        for cmdin in sorted(kwargs['command'].CommandIns.values(),
                            key=attrgetter('Name')):
            if cmdin.Required:
                default_block = ''
            else:
                default_fmt = {
                        'default': repr(cmdin.Default),
                        'default_description': repr(cmdin.DefaultDescription)
                }
                default_block = default_block_format % default_fmt

            if cmdin.DataType is bool:
                action = 'store_true'
                data_type = None
            else:
                action = 'store'
                data_type = cmdin.DataType

            fmt = {'name': cmdin.Name,
                   'datatype': getattr(data_type, '__name__', None),
                   'action': action, 'required': str(cmdin.Required),
                   'help': cmdin.Description, 'default_block': default_block}
            cmdin_formatted.append(input_format % fmt)

        cmdout_formatted = []
        for cmdout in sorted(kwargs['command'].CommandOuts.values(),
                             key=attrgetter('Name')):
            fmt = {'name': cmdout.Name}
            cmdout_formatted.append(output_format % fmt)

        cmdin_formatted = ''.join(cmdin_formatted)
        cmdout_formatted = ''.join(cmdout_formatted)
        header_fmt = {'command_module': kwargs['command_module'],
                      'input_fmt': cmdin_formatted,
                      'output_fmt': cmdout_formatted}

        result_lines.extend((header_format % header_fmt).split('\n'))
        return {'result': result_lines}
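
As with BashCompletion above, the `input_format`, `output_format`, `default_block_format`, and `header_format` templates live elsewhere in pyqi. A rough, assumed sketch of their shapes, matching the format keys used in `run()` (the `cmd_in_lookup`/`cmd_out_lookup` helpers and exact fields are assumptions):

default_block_format = """,
                   Default=%(default)s,
                   DefaultDescription=%(default_description)s"""

input_format = """
    OptparseOption(Parameter=cmd_in_lookup('%(name)s'),
                   Type=%(datatype)s,
                   Action='%(action)s',
                   Required=%(required)s,
                   Help='%(help)s'%(default_block)s),"""

output_format = """
    OptparseResult(Parameter=cmd_out_lookup('%(name)s'),
                   Handler=None),  # e.g. write_list_of_strings"""

header_format = """from %(command_module)s import CommandConstructor

usage_examples = []

inputs = [%(input_fmt)s
]

outputs = [%(output_fmt)s
]"""
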
Example 14
class BenchSuiteMaker(Command):
    BriefDescription = "Generates a benchmark suite file"
    LongDescription = (
        "Given a command and a list of benchmarks files or a "
        "dictionary with the options to test, this command generates a shell "
        "script that executes a complete benchmark suite.")
    CommandIns = ParameterCollection([
        CommandIn(Name='command',
                  DataType=str,
                  Description='command to benchmark',
                  Required=True),
        CommandIn(
            Name='parameters',
            DataType=dict,
            Description='dictionary where the keys are the parameters to test '
            'and the values are lists of values for that parameter.',
            DefaultDescription='No parameters used',
            Default=None),
        CommandIn(
            Name='bench_files',
            DataType=list,
            Description='List of lists of paths to the benchmark files to use '
            'as input for the command. Each inner list is a test case and '
            'should have the same length as the in_opts parameter.',
            DefaultDescription='No bench_files used',
            Required=False,
            Default=None),
        CommandIn(
            Name='in_opts',
            DataType=list,
            Description='list of options used for providing the benchmark files'
            ' to the command. It should have the same length and order as the'
            ' inner lists of bench_files.',
            DefaultDescription='["-i"] is used as a default',
            Required=False,
            Default=["-i"]),
        CommandIn(
            Name='out_opt',
            DataType=str,
            Description='Option used for providing the output path to the '
            'command to benchmark.',
            DefaultDescription='"-o" is used as default',
            Required=False,
            Default="-o")
    ])
    CommandOuts = ParameterCollection([
        CommandOut(Name='bench_suite',
                   DataType=str,
                   Description='String with the benchmark suite')
    ])

    def run(self, **kwargs):
        result = {}

        command = kwargs['command']
        out_opt = kwargs['out_opt']
        parameters = kwargs['parameters']
        bench_files = kwargs['bench_files']
        in_opts = kwargs['in_opts']
        if parameters:
            if bench_files:
                raise CommandError("Parameters or bench_files should be "
                                   "provided, but not both.")
            bench_str = make_bench_suite_parameters(command, parameters,
                                                    out_opt)
        elif bench_files:
            if not all(len(x) == len(in_opts) for x in bench_files):
                raise CommandError(
                    "The length of bench_files and in_opts must "
                    "be the same.")
            bench_str = make_bench_suite_files(command, in_opts, bench_files,
                                               out_opt)
        else:
            raise CommandError("Must specify parameters or bench_files.")

        result['bench_suite'] = bench_str

        return result
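
A usage sketch (hypothetical command and paths; either parameters or bench_files must be given, never both, and pyqi is assumed to fill in declared defaults for omitted inputs):

maker = BenchSuiteMaker()

# Benchmark over parameter values:
out = maker(command='pick_otus.py -i seqs.fna',
            parameters={'similarity': ['0.94', '0.97', '0.99']})

# ...or over input files; each inner list must match len(in_opts):
out = maker(command='split_libraries.py',
            bench_files=[['100k.fna'], ['1M.fna']],
            in_opts=['-i'])
print(out['bench_suite'])
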
Example 15
class BenchSuiteMaker(Command):
    """Subclassing the pyqi.core.command.Command class"""
    BriefDescription = "Generates a benchmark suite file"
    LongDescription = ("Given a command and a list of benchmarks files or a "
                       "dictionary with the options to test, generates a shell"
                       " script that executes a complete benchmark suite.")
    CommandIns = ParameterCollection([
        CommandIn(Name='command',
                  DataType=str,
                  Description='command to benchmark',
                  Required=True),
        CommandIn(Name='parameters',
                  DataType=dict,
                  Description='dictionary where the keys are the parameters '
                  'to test and the values are lists of values for that '
                  'parameter.',
                  DefaultDescription='No parameters used'),
        CommandIn(Name='bench_files',
                  DataType=list,
                  Description='List of lists of paths to the benchmark files '
                  'to use as input for the command. Each inner list is a test '
                  'case and should have the same length as the in_opts '
                  'parameter.',
                  DefaultDescription='No bench_files used',
                  Required=False),
        CommandIn(Name='in_opts',
                  DataType=list,
                  Description='list of options used for providing the '
                  'benchmark files to the command. It should have the same '
                  'length and order as the inner lists of bench_files.',
                  DefaultDescription='["-i"] is used as a default',
                  Required=False,
                  Default=["-i"]),
        CommandIn(Name='out_opt',
                  DataType=str,
                  Description='Option used for providing the output path to '
                  'the command to benchmark.',
                  DefaultDescription='"-o" is used as default',
                  Required=False,
                  Default="-o"),
        CommandIn(Name='pbs',
                  DataType=bool,
                  Description='Flag to determine if the benchmark suite will '
                  'run in a PBS cluster environment',
                  DefaultDescription='False: run serially in bash',
                  Required=False,
                  Default=False),
        CommandIn(Name='job_prefix',
                  DataType=str,
                  Description='Prefix for the job name in case of a PBS '
                  'cluster environment',
                  DefaultDescription='"bench_" is used as a default prefix',
                  Required=False,
                  Default="bench_"),
        CommandIn(Name='queue',
                  DataType=str,
                  Description='PBS queue to submit jobs',
                  DefaultDescription='"" is used as default, which will submit'
                  ' the jobs to the system default queue',
                  Required=False,
                  Default=""),
        CommandIn(Name='pbs_extra_args',
                  DataType=str,
                  Description='Any extra arguments needed to qsub',
                  DefaultDescription='No extra arguments are used',
                  Required=False,
                  Default="")
    ])
    CommandOuts = ParameterCollection([
        CommandOut(Name='bench_suite',
                   DataType=str,
                   Description='String with the benchmark suite')
    ])

    def run(self, **kwargs):
        # Get command parameters
        command = kwargs['command']
        out_opt = kwargs['out_opt']
        parameters = kwargs['parameters']
        bench_files = kwargs['bench_files']
        in_opts = kwargs['in_opts']
        pbs = kwargs['pbs']
        job_prefix = kwargs['job_prefix']
        queue = kwargs['queue']
        pbs_extra_args = kwargs['pbs_extra_args']

        # Check which type of bench suite are we generating
        if parameters:
            # We are generating a benchmark suite based on different parameter
            # values. In such case, the user should not provide any bench file
            if bench_files:
                raise CommandError("Parameters or bench_files should be "
                                   "provided, but not both.")
            bench_str = make_bench_suite_parameters(command, parameters,
                                                    out_opt, pbs, job_prefix,
                                                    queue, pbs_extra_args)
        elif bench_files:
            # We are generating a benchmark suite based on input files,
            # Check that the number of benchmark files for test case match
            # the number of options to provide the input files
            if not all(len(x) == len(in_opts) for x in bench_files):
                raise CommandError("The length of bench_files and in_opts "
                                   "must be the same.")
            bench_str = make_bench_suite_files(command, in_opts, bench_files,
                                               out_opt, pbs, job_prefix, queue,
                                               pbs_extra_args)
        else:
            # Not enough parameters!
            raise CommandError("Must specify parameters or bench_files.")

        return {'bench_suite': bench_str}
Example 16
class biomtocorediversityanalyses(Command):
    BriefDescription = "This command allows to run core diversity analysis using as input a biom table (i.e. output from fasta_to_closed_reference_otu_picking.py script)"
    LongDescription = "A command for running core diversity analyses in order to obtain the alpha and beta diversity using a miRNAs biom table as input. Alpha diversity is performed with observed species metric while the beta diversity with Bray-curtis metric. THIS CODE IS CURRENTLY UNTESTED. YOU SHOULD NOT USE THIS VERSION OF THE CODE. THIS MESSAGE WILL BE REMOVED WHEN TESTS ARE ADDED."

    CommandIns = ParameterCollection([
        CommandIn(Name='input_file',
                  DataType=str,
                  Description='the path to the input BIOM table',
                  Required=True),
        CommandIn(Name='output_dir',
                  DataType=str,
                  Description='the path where the output of core diversity '
                  'analysis should be written',
                  Required=True),
        CommandIn(Name='mapping_file',
                  DataType=str,
                  Description='the path where the mapping file is located',
                  Required=True),
        CommandIn(Name='sampling_depth',
                  DataType=int,
                  Description='Sequencing depth to use for even sub-sampling '
                  'and maximum rarefaction depth. You should review the '
                  'output of print_biom_table_summary.py on the miRNA BIOM '
                  'table to decide on this value',
                  Required=True),
        CommandIn(Name='jobs_to_start',
                  DataType=int,
                  Description='the number of jobs you want to run in parallel',
                  Default=1),
        CommandIn(Name='category',
                  DataType=str,
                  Description='The metadata category or categories to compare '
                  '(i.e. column headers in the mapping file)',
                  Required=False)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='status', DataType=str,
                   Description='the final result'),
        CommandOut(Name='error', DataType=str, Description='the error result')
    ])

    # QIIME must be installed so that its scripts can be called from the
    # command line within the user's $HOME.

    # Scripts included in Qiime
    core_diversity_analyses_path = "core_diversity_analyses.py"

    # Temporary folder to store the files:
    temp_dir = gettempdir()
    verbose = True

    def run(self, **kwargs):

        input_fp = kwargs['input_file']

        output_dir = kwargs['output_dir']

        # Mapping file
        mapping_file_fp = kwargs['mapping_file']
        input_mapping_file_pattern = join(mapping_file_fp, 'mapping_file.txt')

        temp_files_to_remove = []
        temp_dirs_to_remove = []
        input_filename = split(input_fp)[1]
        input_basename = splitext(input_filename)[0]

        # Build the core_diversity_analyses.py command and run it on the
        # miRNA BIOM table
        command = "%s -i %s -m %s -e %s -o %s -a -O %s -c %s --suppress_otu_category_significance --nonphylogenetic_diversity" % (
            self.core_diversity_analyses_path, input_fp, mapping_file_fp,
            int(kwargs["sampling_depth"]), output_dir,
            int(kwargs["jobs_to_start"]), str(kwargs["category"]))
        if self.verbose:
            print(command)
        stdout, stderr, ret_val = pyqi_system_call(command)
        if ret_val != 0:
            return {"status": str(ret_val), "error": stderr}
        # The original example returned nothing on success; returning the
        # declared outputs here is an assumed fix.
        return {"status": "success", "error": ""}
Example 17
class MetadataAdder(Command):
    BriefDescription = "Add metadata to a BIOM table"
    LongDescription = ("Add sample and/or observation metadata to "
                       "BIOM-formatted files. Detailed usage examples can be "
                       "found here: http://biom-format.org/documentation/add"
                       "ing_metadata.html")

    CommandIns = ParameterCollection([
        CommandIn(Name='table',
                  DataType=Table,
                  Description='the input BIOM table',
                  Required=True),
        # sample_metadata and observation_metadata are currently files (or
        # file-like) because of the existing metadata map / processing function
        # support. Ideally, these two parameters should be MetadataMap
        # instances.
        CommandIn(Name='sample_metadata',
                  DataType=file,
                  Description='the sample metadata map (will add sample '
                  'metadata to the input BIOM table, if provided)'),
        CommandIn(Name='observation_metadata',
                  DataType=file,
                  Description='the observation metadata map (will add '
                  'observation metadata to the input BIOM table, if '
                  'provided)'),
        CommandIn(Name='sc_separated',
                  DataType=list,
                  Description='list of the metadata fields to split on '
                  'semicolons. This is useful for hierarchical data such as '
                  'taxonomy or functional categories'),
        CommandIn(Name='sc_pipe_separated',
                  DataType=list,
                  Description='list of the metadata fields to split on '
                  'semicolons and pipes ("|"). This is useful for '
                  'hierarchical data such as functional categories with '
                  'one-to-many mappings (e.g. x;y;z|x;y;w)'),
        CommandIn(Name='int_fields',
                  DataType=list,
                  Description='list of the metadata fields to cast to '
                  'integers. This is useful for integer data such as '
                  '"DaysSinceStart"'),
        CommandIn(Name='float_fields',
                  DataType=list,
                  Description='list of the metadata fields to cast to '
                  'floating point numbers. This is useful for real number '
                  'data such as "pH"'),
        CommandIn(Name='sample_header',
                  DataType=list,
                  Description='list of the sample metadata field names. This '
                  'is useful if a header line is not provided with the '
                  'metadata, if you want to rename the fields, or if you want '
                  'to include only the first n fields where n is the number '
                  'of entries provided here',
                  DefaultDescription='use header from sample metadata map'),
        CommandIn(Name='observation_header',
                  DataType=list,
                  Description='list of the observation metadata field names. '
                  'This is useful if a header line is not provided with the '
                  'metadata, if you want to rename the fields, or if you want '
                  'to include only the first n fields where n is the number '
                  'of entries provided here',
                  DefaultDescription='use header from observation metadata '
                  'map'),
        CommandIn(Name='output_as_json',
                  DataType=bool,
                  Description='Output as JSON',
                  Default=False)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='table',
                   DataType=tuple,
                   Description='Table with added metadata, and the output '
                   'format')
    ])

    def run(self, **kwargs):
        table = kwargs['table']
        sample_metadata = kwargs['sample_metadata']
        observation_metadata = kwargs['observation_metadata']
        sc_separated = kwargs['sc_separated']
        sc_pipe_separated = kwargs['sc_pipe_separated']
        int_fields = kwargs['int_fields']
        float_fields = kwargs['float_fields']
        sample_header = kwargs['sample_header']
        observation_header = kwargs['observation_header']
        output_as = 'json' if kwargs['output_as_json'] else 'hdf5'

        # define metadata processing functions, if any
        process_fns = {}
        if sc_separated is not None:
            process_fns.update(
                dict.fromkeys(sc_separated, self._split_on_semicolons))

        if sc_pipe_separated is not None:
            process_fns.update(
                dict.fromkeys(sc_pipe_separated,
                              self._split_on_semicolons_and_pipes))

        if int_fields is not None:
            process_fns.update(dict.fromkeys(int_fields, self._int))

        if float_fields is not None:
            process_fns.update(dict.fromkeys(float_fields, self._float))

        # parse mapping files
        if sample_metadata is not None:
            sample_metadata = MetadataMap.from_file(sample_metadata,
                                                    process_fns=process_fns,
                                                    header=sample_header)

        if observation_metadata is not None:
            observation_metadata = MetadataMap.from_file(
                observation_metadata,
                process_fns=process_fns,
                header=observation_header)

        if sample_metadata is None and observation_metadata is None:
            raise CommandError('Must specify sample_metadata and/or '
                               'observation_metadata.')

        # NAUGHTY: this is modifying the input table IN PLACE!!! And then
        # RETURNING IT! MetadataAdder is angry!

        # add metadata as necessary
        if sample_metadata:
            table.add_metadata(sample_metadata, axis='sample')

        if observation_metadata:
            table.add_metadata(observation_metadata, axis='observation')

        return {'table': (table, output_as)}

    def _split_on_semicolons(self, x):
        return [e.strip() for e in x.split(';')]

    def _split_on_semicolons_and_pipes(self, x):
        return [[e.strip() for e in y.split(';')] for y in x.split('|')]

    def _int(self, x):
        try:
            return int(x)
        except ValueError:
            return x

    def _float(self, x):
        try:
            return float(x)
        except ValueError:
            return x
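
The processing helpers are easy to exercise on their own; for example (values chosen for illustration):

adder = MetadataAdder()
print(adder._split_on_semicolons('k__Bacteria; p__Firmicutes'))
# ['k__Bacteria', 'p__Firmicutes']
print(adder._split_on_semicolons_and_pipes('x;y;z|x;y;w'))
# [['x', 'y', 'z'], ['x', 'y', 'w']]
print(adder._int('42'))   # 42
print(adder._int('n/a'))  # non-integer values are returned unchanged
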
Example 18
class CodeHeaderGenerator(Command):
    BriefDescription = "Generate header code for use in a Python file"
    LongDescription = ("Generate valid Python code containing header "
                       "information, such as author, email address, "
                       "maintainer, version, etc.. This code can be placed at "
                       "the top of a Python file.")

    CommandIns = ParameterCollection([
        CommandIn(Name='author',
                  DataType=str,
                  Description='author/maintainer name',
                  Required=False,
                  Default=None),
        CommandIn(Name='email',
                  DataType=str,
                  Description='maintainer email address',
                  Required=False,
                  Default=None),
        CommandIn(Name='license',
                  DataType=str,
                  Description='license (e.g., BSD)',
                  Required=False,
                  Default=None),
        CommandIn(Name='copyright',
                  DataType=str,
                  Description='copyright (e.g., Copyright 2013, The pyqi '
                  'project)',
                  Required=False,
                  Default=None),
        CommandIn(Name='version',
                  DataType=str,
                  Description='version (e.g., 0.1)',
                  Required=False,
                  Default=None),
        CommandIn(Name='credits',
                  DataType=list,
                  Description='list of other authors',
                  Required=False,
                  Default=None)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='result',
                   DataType=list,
                   Description='the resulting header')
    ])

    def run(self, **kwargs):
        # Build a string formatting dictionary for the file header.
        head = {}
        head['author'] = kwargs['author']
        head['email'] = kwargs['email']
        head['license'] = kwargs['license']
        head['copyright'] = kwargs['copyright']
        head['version'] = kwargs['version']

        # Credits always includes the author. Note that even if neither author
        # nor credits is passed, credits will be an empty list and will still
        # be written out.
        credits = [head['author']]
        if kwargs['credits'] is not None:
            credits.extend(kwargs['credits'])
        credits = filter(lambda x: x is not None, credits)
        f = lambda x: '"%s"' % x
        head['credits'] = ', '.join(map(f, credits))

        header_lines = []
        header_lines.append("#!/usr/bin/env python")
        header_lines.append("from __future__ import division")
        header_lines.append("")

        if head['author'] is not None:
            header_lines.append('__author__ = "%s"' % head['author'])
        if head['copyright'] is not None:
            header_lines.append('__copyright__ = "%s"' % head['copyright'])
        if head['credits'] is not None:
            header_lines.append('__credits__ = [%s]' % head['credits'])
        if head['license'] is not None:
            header_lines.append('__license__ = "%s"' % head['license'])
        if head['version'] is not None:
            header_lines.append('__version__ = "%s"' % head['version'])
        if head['author'] is not None:
            header_lines.append('__maintainer__ = "%s"' % head['author'])
        if head['email'] is not None:
            header_lines.append('__email__ = "%s"' % head['email'])

        header_lines.append("")
        header_format = ''.join(header_lines)

        return {'result': header_lines}
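
Invoked with a couple of fields set (again via pyqi's callable-Command convention), the generated header looks like this (illustrative values):

gen = CodeHeaderGenerator()
res = gen(author='Jane Doe', email='jane@example.com', license=None,
          copyright=None, version='0.1', credits=None)
print('\n'.join(res['result']))
# #!/usr/bin/env python
# from __future__ import division
#
# __author__ = "Jane Doe"
# __credits__ = ["Jane Doe"]
# __version__ = "0.1"
# __maintainer__ = "Jane Doe"
# __email__ = "jane@example.com"
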
Example 19
class TableConverter(Command):
    ObservationMetadataTypes = {
        'sc_separated': lambda x: [e.strip() for e in x.split(';')],
        'naive': lambda x: x
    }

    ObservationMetadataFormatters = {
        'sc_separated': lambda x: '; '.join(x),
        'naive': lambda x: x
    }

    ObservationMetadataTypes['taxonomy'] = \
        ObservationMetadataTypes['sc_separated']

    BriefDescription = "Convert to/from the BIOM table format"
    LongDescription = ("Convert between BIOM and 'classic' (tab-delimited) "
                       "table formats. Detailed usage examples can be found "
                       "here: http://biom-format.org/documentation/biom_conver"
                       "sion.html")

    CommandIns = ParameterCollection([
        # This is not an ideal usage of the pyqi framework because we are
        # expecting a file-like object here, and a lot of the parameters deal
        # with I/O-ish things, like converting between file formats. Even
        # though no I/O is forced here, it would be better to have rich objects
        # as input and output, instead of lines of data. However, this will
        # likely require a refactoring/redesign of our interface for table
        # conversions because the primary input here can be either a BIOM table
        # or a classic table. One possible solution is to split out different
        # types of conversions into their own (smaller and simpler) commands,
        # which would allow us to avoid some of this I/O-ish stuff.
        CommandIn(Name='table', DataType=Table,
                  Description='the input table (file-like object), either in '
                  'BIOM or classic format', Required=True),
        CommandIn(Name='to_json', DataType=bool,
                  Description='Output as a JSON table', Default=False),
        CommandIn(Name='to_hdf5', DataType=bool,
                  Description='Output as a HDF5 table', Default=False),
        CommandIn(Name='to_tsv', DataType=bool,
                  Description='Output as a TSV table', Default=False),
        CommandIn(Name='sample_metadata', DataType=MetadataMap,
                  Description='the sample metadata map (will add sample '
                  'metadata to the BIOM table, if provided). Only applies '
                  'when converting from classic table file to BIOM table '
                  'file'),
        CommandIn(Name='observation_metadata', DataType=MetadataMap,
                  Description='the observation metadata map (will add '
                  'observation metadata to the BIOM table, if provided). Only '
                  'applies when converting from classic table file to BIOM '
                  'table file'),
        CommandIn(Name='header_key', DataType=str,
                  Description='pull this key from observation metadata within '
                  'a BIOM table file when creating a classic table file',
                  DefaultDescription='no observation metadata will be '
                  'included'),
        CommandIn(Name='output_metadata_id', DataType=str,
                  Description='the name to be given to the observation '
                  'metadata column when creating a classic table from a BIOM-'
                  'formatted table', DefaultDescription='same name as in the '
                  'BIOM-formatted table'),
        CommandIn(Name='process_obs_metadata', DataType=str,
                  Description='process metadata associated with observations '
                  'when converting from a classic table. Must be one of: %s' %
                  ', '.join(ObservationMetadataTypes.keys()), Default='naive'),
        CommandIn(Name='tsv_metadata_formatter', DataType=str,
                  Description='Method for formatting the observation '
                  'metadata, must be one of: %s' %
                  ', '.join(ObservationMetadataFormatters),
                  Default='sc_separated')
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='table', DataType=tuple,
                   Description='The resulting table and format')
    ])

    def run(self, **kwargs):
        table = kwargs['table']
        sample_metadata = kwargs['sample_metadata']
        observation_metadata = kwargs['observation_metadata']
        header_key = kwargs['header_key']
        output_metadata_id = kwargs['output_metadata_id']
        process_obs_metadata = kwargs['process_obs_metadata']
        obs_md_fmt = kwargs['tsv_metadata_formatter']
        to_tsv = kwargs['to_tsv']
        to_hdf5 = kwargs['to_hdf5']
        to_json = kwargs['to_json']

        if sum([to_tsv, to_hdf5, to_json]) == 0:
            raise CommandError("Must specify an output format")
        elif sum([to_tsv, to_hdf5, to_json]) > 1:
            raise CommandError("Can only specify a single output format")

        if obs_md_fmt not in self.ObservationMetadataFormatters:
            raise CommandError("Unknown tsv_metadata_formatter: %s" %
                               obs_md_fmt)
        else:
            obs_md_fmt_f = self.ObservationMetadataFormatters[obs_md_fmt]

        if sample_metadata is not None:
            table.add_metadata(sample_metadata)

        # if the user does not specify a name for the output metadata column,
        # set it to the same as the header key
        output_metadata_id = output_metadata_id or header_key

        if process_obs_metadata not in self.ObservationMetadataTypes:
            raise CommandError(
                "Unknown observation metadata processing method, must be "
                "one of: %s" %
                ', '.join(self.ObservationMetadataTypes.keys()))
        else:
            # assume we had a table coming in as TSV
            if table.observation_metadata is None:
                raise CommandError("Obseration metadata processing requested "
                                   "but it doesn't appear that there is any "
                                   "metadata to operate on!")

            # and if this came in as TSV, then we expect only a single type of
            # metadata
            md_key = table.observation_metadata[0].keys()[0]

            process_f = self.ObservationMetadataTypes[process_obs_metadata]
            it = zip(table.observation_ids, table.observation_metadata)
            new_md = {id_: {md_key: process_f(md[md_key])} for id_, md in it}

            if observation_metadata:
                for k, v in observation_metadata.items():
                    new_md[k].update(v)
            table.add_metadata(new_md, 'observation')

        if to_tsv:
            result = table.to_tsv(header_key=header_key,
                                  header_value=output_metadata_id,
                                  metadata_formatter=obs_md_fmt_f)
            fmt = 'tsv'
        elif to_json:
            result = table.to_json(generatedby())
            fmt = 'json'
        elif to_hdf5:
            result = table
            fmt = 'hdf5'

        return {'table': (result, fmt)}
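
A sketch of a TSV conversion call; exactly one to_* flag may be set:

converter = TableConverter()
# biom_table is a placeholder for an already-parsed biom Table; note that
# run() as written always processes observation metadata, so the table must
# carry some (e.g. a 'taxonomy' entry per observation).
out = converter(table=biom_table, to_tsv=True, to_json=False, to_hdf5=False,
                sample_metadata=None, observation_metadata=None,
                header_key='taxonomy', output_metadata_id='Taxonomy',
                process_obs_metadata='taxonomy',
                tsv_metadata_formatter='sc_separated')
tsv_str, fmt = out['table']  # fmt == 'tsv'
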
Example 20
class MakeRelease(Command):
    BriefDescription = "Make the release"
    LongDescription = "Do all the things for a release"
    CommandIns = ParameterCollection([
        CommandIn(Name='package_name',
                  DataType=str,
                  Description='The name of the package to release',
                  Required=True),
        CommandIn(Name='real_run',
                  DataType=bool,
                  Description='Perform a real run',
                  Required=False,
                  Default=False)
    ])

    CommandOuts = ParameterCollection([])
    RealRun = False
    _date_clean_re = re.compile(r'(\d+)(st|nd|rd|th)')

    def _parse_changelog(self, pkg_name):
        with open('ChangeLog.md') as f:
            lineiter = iter(f)
            for line in lineiter:
                match = re.search('^%s\s+(.*)' % pkg_name, line.strip())
                if match is None:
                    continue
                version = match.group(1).strip()
                if lineiter.next().count('-') != len(match.group(0)):
                    continue
                while 1:
                    change_info = lineiter.next().strip()
                    if change_info:
                        break

                match = re.search(r'released on (\w+\s+\d+\w+\s+\d+)',
                                  change_info)
                if match is None:
                    continue

                datestr = match.group(1)
                return version, self._parse_date(datestr)

    def _bump_version(self, version):
        try:
            parts = map(int, version.split('.'))
        except ValueError:
            self._fail('Current version is not numeric')
        parts[-1] += 1
        return '.'.join(map(str, parts))

    def _parse_date(self, string):
        string = self._date_clean_re.sub(r'\1', string)
        return datetime.strptime(string, '%B %d %Y')

    def _set_filename_version(self, filename, version_number, pattern):
        changed = []

        def inject_version(match):
            before, old, after = match.groups()
            changed.append(True)
            return before + version_number + after

        with open(filename) as f:
            contents = re.sub(
                r"""^(\s*%s\s*=\s*(?:'|"))(.+?)((?:'|"))(?sm)""" % pattern,
                inject_version, f.read())

        if not changed:
            self._fail('Could not find %s in %s', pattern, filename)

        if self.RealRun:
            with open(filename, 'w') as f:
                f.write(contents)

    def _set_init_version(self, pkg_name, version):
        self._info('Setting __init__.py version to %s', version)
        self._set_filename_version('%s/__init__.py' % pkg_name, version,
                                   '__version__')

    def _set_setup_version(self, version):
        self._info('Setting setup.py version to %s', version)
        self._set_filename_version('setup.py', version, '__version__')

    def _set_doc_version(self, version):
        self._info('Setting doc/conf.py version to %s', version)
        self._set_filename_version('doc/conf.py', version, 'release')

    def _build_and_upload(self):
        cmd = [sys.executable, 'setup.py', 'sdist', 'upload']
        stdout, stderr, retval = pyqi_system_call(cmd,
                                                  shell=False,
                                                  dry_run=not self.RealRun)
        if retval != 0:
            self._fail("build and upload failed,\nSTDOUT:\n%s\n\nSTDERR:\n%s",
                       stdout, stderr)

    def _fail(self, message, *args):
        sys.stderr.write('Error: ')
        sys.stderr.write(message % args)
        sys.stderr.write('\n')
        sys.exit(1)

    def _info(self, message, *args):
        sys.stderr.write(message % args)
        sys.stderr.write('\n')

    def _get_git_tags(self):
        cmd = ['git', 'tag']
        stdout, stderr, retval = pyqi_system_call(cmd,
                                                  shell=False,
                                                  dry_run=not self.RealRun)
        if retval != 0:
            self._fail("Could not git tag, \nSTDOUT:\n%s\n\nSTDERR:\n%s",
                       stdout, stderr)

        return stdout.splitlines()

    def _git_is_clean(self):
        cmd = ['git', 'diff', '--quiet']

        # always execute, even in dry run
        stdout, stderr, retval = pyqi_system_call(cmd, shell=False)
        return retval == 0

    def _make_git_commit(self, message, *args):
        message = message % args
        cmd = ['git', 'commit', '-am', message]
        stdout, stderr, retval = pyqi_system_call(cmd,
                                                  shell=False,
                                                  dry_run=not self.RealRun)
        if retval != 0:
            self._fail("Could not git commit, \nSTDOUT:\n%s\n\nSTDERR:\n%s",
                       stdout, stderr)

    def _make_git_tag(self, tag):
        self._info('Tagging "%s"', tag)
        cmd = ['git', 'tag', tag]
        stdout, stderr, retval = pyqi_system_call(cmd,
                                                  shell=False,
                                                  dry_run=not self.RealRun)
        if retval != 0:
            self._fail("Could not git tag, \nSTDOUT:\n%s\n\nSTDERR:\n%s",
                       stdout, stderr)

    def _get_git_branch(self):
        cmd = ['git', 'rev-parse', '--abbrev-ref', 'HEAD']

        # ignoring self.RealRun, always execute
        stdout, stderr, retval = pyqi_system_call(cmd, shell=False)
        if retval != 0:
            self._fail(
                "Could not get git branch, \nSTDOUT:\n%s\n\nSTDERR:\n%s",
                stdout, stderr)
        return stdout.strip()

    def _git_push_branch(self):
        branch = self._get_git_branch()
        self._info('Pushing branch %s to upstream', branch)
        cmd = ['git', 'push', 'upstream', branch]
        stdout, stderr, retval = pyqi_system_call(cmd,
                                                  shell=False,
                                                  dry_run=not self.RealRun)
        if retval != 0:
            self._fail(
                "Could not push branch %s, \nSTDOUT:\n%s\n\nSTDERR:\n%s",
                branch, stdout, stderr)

    def _git_push_tag(self, tag):
        self._info('Pushing tag "%s"', tag)
        cmd = ['git', 'push', 'upstream', tag]
        stdout, stderr, retval = pyqi_system_call(cmd,
                                                  shell=False,
                                                  dry_run=not self.RealRun)
        if retval is not 0:
            self._fail("Could not push tag %s, \nSTDOUT:\n%s\n\nSTDERR:\n%s",
                       stdout, stderr, tag)

    def run(self, **kwargs):
        pkg_name = kwargs['package_name']
        self.RealRun = kwargs['real_run']

        try:
            pkg_module = importlib.import_module(pkg_name)
        except ImportError:
            sys.stderr.write("Could not import %s!\n" % pkg_name)
            sys.exit(1)

        os.chdir(os.path.join(os.path.dirname(pkg_module.__file__), '..'))

        rv = self._parse_changelog(pkg_name)
        if rv is None:
            self._fail('Could not parse changelog')

        version, release_date = rv
        dev_version = version + '-dev'

        self._info('Releasing %s (release date %s)', version,
                   release_date.strftime('%m/%d/%Y'))
        tags = self._get_git_tags()

        if version in tags:
            self._fail('Version "%s" is already tagged', version)
        if release_date.date() != date.today():
            self._fail('Release date is not today (%s != %s)',
                       release_date.strftime('%Y-%m-%d'), date.today())

        if not self._git_is_clean():
            self._fail('You have uncommitted changes in git')

        self._set_init_version(pkg_name, version)
        self._set_setup_version(version)
        self._set_doc_version(version)
        self._make_git_commit('Bump version number to %s', version)
        self._make_git_tag(version)
        self._build_and_upload()
        self._set_init_version(pkg_name, dev_version)
        self._set_setup_version(dev_version)
        self._set_doc_version(dev_version)
        self._make_git_commit('Bump version number to %s', dev_version)
        self._git_push_branch()
        self._git_push_tag(version)

        return {}
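
Since `real_run` defaults to False, file writes and the git/upload commands are dry-run; a sketch of the intended workflow:

# Dry-run first; repeat with real_run=True once the output looks right.
MakeRelease()(package_name='pyqi', real_run=False)
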
Example 21
class InstallationInformer(Command):
    BriefDescription = ("Provide information about the biom-format "
                        "installation")
    LongDescription = ("Provide information about the biom-format "
                       "installation, including settings pulled from the "
                       "configuration file. For more details, see "
                       "http://biom-format.org")
    CommandIns = ParameterCollection([])
    CommandOuts = ParameterCollection([
        CommandOut(Name='install_info_lines',
                   DataType=str,
                   Description='Installation info')
    ])

    def run(self, **kwargs):
        lines = []

        lines.extend(self.get_formatted_system_info())
        lines.extend(self.get_formatted_dependency_version_info())
        lines.extend(self.get_formatted_package_info())
        lines.append('')

        return {'install_info_lines': lines}

    def get_formatted_system_info(self):
        return self._format_info(self.get_system_info(), 'System information')

    def get_formatted_dependency_version_info(self):
        return self._format_info(self.get_dependency_version_info(),
                                 'Dependency versions')

    def get_formatted_package_info(self):
        return self._format_info(self.get_package_info(),
                                 'biom-format package information')

    def get_system_info(self):
        return (("Platform", platform), ("Python/GCC version",
                                         python_version.replace('\n', ' ')),
                ("Python executable", executable))

    def get_dependency_version_info(self):
        not_installed_msg = "Not installed"

        try:
            from pyqi import __version__ as pyqi_lib_version
        except ImportError:
            pyqi_lib_version = not_installed_msg

        try:
            from numpy import __version__ as numpy_lib_version
        except ImportError:
            numpy_lib_version = ("ERROR: Not installed - this is required! "
                                 "(This will also cause the BIOM library to "
                                 "not be importable.)")

        try:
            from scipy import __version__ as scipy_lib_version
        except ImportError:
            scipy_lib_version = not_installed_msg

        try:
            from h5py import __version__ as h5py_lib_version
        except ImportError:
            h5py_lib_version = ("WARNING: Not installed - this is an optional "
                                "dependency. It is strongly recommended for "
                                "large datasets.")

        return (("pyqi version", pyqi_lib_version), ("NumPy version",
                                                     numpy_lib_version),
                ("SciPy version", scipy_lib_version), ("h5py version",
                                                       h5py_lib_version))

    def get_package_info(self):
        import_error_msg = ("ERROR: Can't find the BIOM library code (or "
                            "numpy) - is it installed and in your "
                            "$PYTHONPATH?")
        try:
            from biom import __version__ as biom_lib_version
        except ImportError:
            biom_lib_version = import_error_msg

        return (("biom-format version", biom_lib_version), )

    def _format_info(self, info, title):
        max_len = self._get_max_length(info)

        lines = ['']
        lines.append(title)
        lines.append('=' * len(title))
        for e in info:
            lines.append("%*s:\t%s" % (max_len, e[0], e[1]))

        return lines

    def _get_max_length(self, info):
        return max([len(e[0]) for e in info])
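
# Usage sketch: pyqi Command objects are callable, and calling one returns
# its CommandOuts as a dict (Python 2 print syntax, matching the examples).
informer = InstallationInformer()
result = informer()
print '\n'.join(result['install_info_lines'])
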
Example #22
class TableSubsetter(Command):
    Axes = ['sample', 'observation']

    BriefDescription = "Subset a BIOM table"
    LongDescription = ("Subset a BIOM table, over either observations or "
                       "samples, without fully parsing it. This command is "
                       "intended to assist in working with very large tables "
                       "when tight on memory, or as a lightweight way to "
                       "subset a full table. Currently, it is possible to "
                       "produce tables with rows or columns (observations or "
                       "samples) that are fully zeroed.")

    CommandIns = ParameterCollection([
        CommandIn(Name='json_table_str',
                  DataType=str,
                  Description='the input BIOM table as an unparsed json '
                  'string',
                  Required=False),
        CommandIn(Name='hdf5_table',
                  DataType=str,
                  Description='the file path to the input HDF5 BIOM table',
                  Required=False),
        CommandIn(Name='axis',
                  DataType=str,
                  Description='the axis to subset over, either ' +
                  ' or '.join(Axes),
                  Required=True),
        CommandIn(Name='ids',
                  DataType=list,
                  Description='the IDs to retain (either sample IDs or '
                  'observation IDs, depending on the axis)',
                  Required=True)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='subsetted_table',
                   DataType=tuple,
                   Description='The subset generator')
    ])

    def run(self, **kwargs):
        json_table_str = kwargs['json_table_str']
        hdf5_biom = kwargs['hdf5_table']
        axis = kwargs['axis']
        ids = kwargs['ids']

        if axis not in self.Axes:
            raise CommandError(
                "Invalid axis '%s'. Must be either %s." %
                (axis, ' or '.join(map(lambda e: "'%s'" % e, self.Axes))))

        if hdf5_biom is None and json_table_str is None:
            raise CommandError("Must specify an input table")
        elif hdf5_biom is not None and json_table_str is not None:
            raise CommandError("Can only specify one input table")

        if json_table_str is not None:
            idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
            new_data = direct_slice_data(json_table_str, idxs, axis)

            # multiple walks over the string. bad form, but easy right now
            # ...should add a yield_and_ignore parser or something.
            def subset_generator():
                yield "{"
                yield direct_parse_key(json_table_str, "id")
                yield ","
                yield direct_parse_key(json_table_str, "format")
                yield ","
                yield direct_parse_key(json_table_str, "format_url")
                yield ","
                yield direct_parse_key(json_table_str, "type")
                yield ","
                yield direct_parse_key(json_table_str, "generated_by")
                yield ","
                yield direct_parse_key(json_table_str, "date")
                yield ","
                yield direct_parse_key(json_table_str, "matrix_type")
                yield ","
                yield direct_parse_key(json_table_str, "matrix_element_type")
                yield ","
                yield new_data
                yield ","
                yield new_axis_md
                yield ","

                if axis == "observation":
                    yield direct_parse_key(json_table_str, "columns")
                else:
                    yield direct_parse_key(json_table_str, "rows")
                yield "}"

            format_ = 'json'
            table = subset_generator()
        else:
            with biom_open(hdf5_biom) as f:
                table = Table.from_hdf5(f, ids=ids, axis=axis)
            format_ = 'hdf5'

        return {'subsetted_table': (table, format_)}
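
# Usage sketch (file path and IDs are illustrative): subset an HDF5 table to
# two samples. The output is a (table, format) tuple; the HDF5 branch yields
# a biom Table while the JSON branch yields a string generator.
subsetter = TableSubsetter()
result = subsetter(hdf5_table='table.biom', json_table_str=None,
                   axis='sample', ids=['Sample1', 'Sample2'])
table, fmt = result['subsetted_table']
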
Example #23
class FastaToParallelPickOtusUclustRef(Command):
    BriefDescription = ("Run parallel closed-reference OTU picking in QIIME "
                        "on a fasta file of miRNA sequences (i.e., the "
                        "output of the sra_to_qiime.py script)")
    LongDescription = ("A command for running parallel closed-reference OTU "
                       "picking in QIIME to obtain a final BIOM table with "
                       "miRNA annotations for downstream analysis. THIS "
                       "CODE IS CURRENTLY UNTESTED. YOU SHOULD NOT USE THIS "
                       "VERSION OF THE CODE. THIS MESSAGE WILL BE REMOVED "
                       "WHEN TESTS ARE ADDED.")

    CommandIns = ParameterCollection([
        CommandIn(Name='input_file', DataType=str,
                  Description='the input miRNA fasta file', Required=True),
        CommandIn(Name='output_dir', DataType=str,
                  Description='the path where the output biom table with '
                  'the miRNA annotations should be written', Required=True),
        CommandIn(Name='ncRNAdb_file', DataType=str,
                  Description='the path to the non-coding RNA database file',
                  Required=True),
        CommandIn(Name='jobs_to_start', DataType=int,
                  Description='the number of jobs to run in parallel',
                  Default=1),
        CommandIn(Name='mature_miRNAs_database_file', DataType=str,
                  Description='the path to the miRBase mature miRNA '
                  'database file', Required=True)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='status', DataType=str,
                   Description='the final result'),
        CommandOut(Name='error', DataType=str,
                   Description='the error result')
    ])

    # QIIME must be installed by the user so that its scripts can be called
    # on the command line from the user's $HOME.
    # The modified version of the Ensembl 'all non-coding except miRNAs
    # (nc_ex_mirna)' database needs to be downloaded by the user.
    # The human miRBase database needs to be downloaded by the user.

    # Scripts included in QIIME
    parallel_pick_otus_uclust_ref_path = "parallel_pick_otus_uclust_ref.py"
    filter_fasta = "filter_fasta.py"
    make_otu_table = "make_otu_table.py"

    # Temporary folder to store the files:
    temp_dir = gettempdir()
    verbose = True

    def run(self, **kwargs):
        input_fp = kwargs['input_file']
        output_dir = kwargs['output_dir']
        ncrnadb_fp = kwargs['ncRNAdb_file']
        maturemirnadb_fp = kwargs['mature_miRNAs_database_file']

        input_filename = split(input_fp)[1]
        input_basename = splitext(input_filename)[0]

        # Build and run the parallel_pick_otus_uclust_ref.py command against
        # the Ensembl nc_ex_mirna (all non-coding except miRNAs) database
        command = ("%s -i %s -r %s -o %s -O %d --enable_rev_strand_match "
                   "--max_accepts 1 --max_rejects 8 --stepwords 8 "
                   "--word_length 8"
                   % (self.parallel_pick_otus_uclust_ref_path, input_fp,
                      ncrnadb_fp, self.temp_dir,
                      int(kwargs['jobs_to_start'])))
        if self.verbose:
            print command
        stdout, stderr, ret_val = pyqi_system_call(command)
        if ret_val != 0:
            raise Exception(stderr)

        # Filter out the sequences from the previous closed-reference OTU
        # picking step that did not hit the database (i.e., the
        # *_failures.txt output of parallel_pick_otus_uclust_ref.py) using
        # 'filter_fasta.py -f input_fasta -s index_list -o output'
        temp_fasta_index_list_failing_to_hit_database_fp = join(
            self.temp_dir, '%s_failures.txt' % input_basename)
        temp_fasta_filtered_fp = join(self.temp_dir,
                                      '%s_filtered.fasta' % input_basename)
        command = "%s -f %s -s %s -o %s" % (
            self.filter_fasta, input_fp,
            temp_fasta_index_list_failing_to_hit_database_fp,
            temp_fasta_filtered_fp)
        if self.verbose:
            print command
        stdout, stderr, ret_val = pyqi_system_call(command)
        if ret_val != 0:
            raise Exception(stderr)

        # Build and run the parallel_pick_otus_uclust_ref.py command against
        # the miRBase human mature miRNA database
        temp_index_of_otus_hitting_miRbase_fp = join(
            self.temp_dir, '%s_otus.txt' % input_basename)
        command = ("%s -i %s -r %s -o %s --enable_rev_strand_match "
                   "--max_accepts 1 --max_rejects 8 --stepwords 8 "
                   "--word_length 8"
                   % (self.parallel_pick_otus_uclust_ref_path,
                      temp_fasta_filtered_fp, maturemirnadb_fp,
                      self.temp_dir))
        if self.verbose:
            print command
        stdout, stderr, ret_val = pyqi_system_call(command)
        if ret_val != 0:
            raise Exception(stderr)

        # Create an OTU table from the OTU map produced in the previous step
        mirna_final_biom_table = join(output_dir, '%s.biom' % input_basename)
        command = "%s -i %s -o %s" % (self.make_otu_table,
                                      temp_index_of_otus_hitting_miRbase_fp,
                                      mirna_final_biom_table)
        if self.verbose:
            print command
        stdout, stderr, ret_val = pyqi_system_call(command)
        if ret_val != 0:
            raise Exception(stderr)

        return {'status': 'is ok', 'error': None}
Example #24
class TableNormalizer(Command):
    Axes = ['sample', 'observation']

    BriefDescription = "Normalize a BIOM table"
    LongDescription = ("Normalize the values of a BIOM table through various "
                       "methods. Relative abundance will take the relative "
                       "abundance of each observation in terms of samples or "
                       "observations.  Presence absensece will convert "
                       "observations to 1's and 0's based on presence of the "
                       "observation")

    CommandIns = ParameterCollection([
        CommandIn(Name='biom_table',
                  DataType=str,
                  Description='the input BIOM table'),
        CommandIn(Name='axis',
                  DataType=str,
                  Description='the axis to normalize over, either ' +
                  ' or '.join(Axes),
                  Required=False),
        CommandIn(Name='relative_abund',
                  DataType=bool,
                  Description='normalize the table by relative abundance',
                  Required=False),
        CommandIn(Name='presence_absence',
                  DataType=bool,
                  Description='convert table to presence/absence values',
                  Required=False)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='table',
                   DataType=tuple,
                   Description='The resulting table and format')
    ])

    def run(self, **kwargs):
        biom_table = kwargs['biom_table']
        axis = kwargs['axis']
        relative_abund = kwargs['relative_abund']
        p_a = kwargs['presence_absence']

        if axis not in self.Axes:
            raise CommandError(
                "Invalid axis '%s'. Must be either %s." %
                (axis, ' or '.join(map(lambda e: "'%s'" % e, self.Axes))))

        if biom_table is None:
            raise CommandError("Must specify an input table")

        if not relative_abund and not p_a:
            raise CommandError("Must specify a normalization type")
        elif relative_abund and p_a:
            raise CommandError("Must specify only one normalization type")

        table = load_table(biom_table)

        if relative_abund:
            table.norm(axis=axis)
        else:
            table.pa()

        if HAVE_H5PY:
            return {'table': (table, 'hdf5')}
        else:
            return {'table': (table, 'json')}
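
# Usage sketch (path is illustrative): compute relative abundance per
# sample; exactly one of relative_abund / presence_absence should be set.
normalizer = TableNormalizer()
result = normalizer(biom_table='table.biom', axis='sample',
                    relative_abund=True, presence_absence=False)
table, fmt = result['table']
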
Example #25
class TableSummarizer(Command):
    """
     Example usage:
      from biom.commands.table_summarizer import TableSummarizer
      from biom.parse import parse_biom_table
      c = TableSummarizer()
      table_f = open("table.biom")
      t = parse_biom_table(table_f)
      table_f.seek(0)
      result = c(table=(t,None))
      result = c(table=(t,None),qualitative=True)
      result = c(table=(t,table_f),qualitative=True)
      table_f.close()
    """
    BriefDescription = "Summarize sample or observation data in a BIOM table"
    LongDescription = ("Provides details on the observation counts per sample,"
                       " including summary statistics, as well as metadata "
                       "categories associated with samples and observations.")

    CommandIns = ParameterCollection([
        CommandIn(Name='table',
                  DataType=tuple,
                  Description='the input BIOM table',
                  Required=True),
        CommandIn(Name='qualitative',
                  DataType=bool,
                  Description=('Present counts as number of unique '
                               'observation ids per sample, rather than '
                               'counts of observations per sample.'),
                  Required=False,
                  Default=False),
        CommandIn(Name='observations',
                  DataType=bool,
                  Default=False,
                  Description=('Summarize over observations'))
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='biom_summary',
                   DataType=list,
                   Description='The table summary')
    ])

    def run(self, **kwargs):
        result = {}
        qualitative = kwargs['qualitative']
        by_observations = kwargs['observations']
        table, table_lines = kwargs['table']

        if by_observations:
            table = table.transpose()

        min_counts, max_counts, median_counts, mean_counts, counts_per_samp =\
            compute_counts_per_sample_stats(table, qualitative)
        num_observations = len(table.ids(axis='observation'))

        counts_per_sample_values = counts_per_samp.values()

        if table.metadata() is None:
            sample_md_keys = ["None provided"]
        else:
            sample_md_keys = table.metadata()[0].keys()

        if table.metadata(axis='observation') is None:
            observation_md_keys = ["None provided"]
        else:
            observation_md_keys = table.metadata(axis='observation')[0].keys()

        lines = []

        num_samples = len(table.ids())

        if by_observations:
            # as this is a transpose of the original table...
            lines.append('Num samples: %d' % num_observations)
            lines.append('Num observations: %d' % num_samples)
        else:
            lines.append('Num samples: %d' % num_samples)
            lines.append('Num observations: %d' % num_observations)

        if not qualitative:
            total_count = sum(counts_per_sample_values)
            lines.append('Total count: %d' % total_count)
            lines.append('Table density (fraction of non-zero values): %1.3f' %
                         table.get_table_density())

        lines.append('')

        if qualitative:
            if by_observations:
                lines.append('Sample/observations summary:')
            else:
                lines.append('Observations/sample summary:')
        else:
            lines.append('Counts/sample summary:')

        lines.append(' Min: %r' % min_counts)
        lines.append(' Max: %r' % max_counts)
        lines.append(' Median: %1.3f' % median_counts)
        lines.append(' Mean: %1.3f' % mean_counts)
        lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))

        if by_observations:
            # since this is a transpose...
            lines.append(' Sample Metadata Categories: %s' %
                         '; '.join(observation_md_keys))
            lines.append(' Observation Metadata Categories: %s' %
                         '; '.join(sample_md_keys))
            lines.append('')
        else:
            lines.append(' Sample Metadata Categories: %s' %
                         '; '.join(sample_md_keys))
            lines.append(' Observation Metadata Categories: %s' %
                         '; '.join(observation_md_keys))
            lines.append('')

        if qualitative:
            lines.append('Observations/sample detail:')
        else:
            lines.append('Counts/sample detail:')

        for k, v in sorted(counts_per_samp.items(), key=itemgetter(1)):
            lines.append(' %s: %r' % (k, v))

        result['biom_summary'] = lines
        return result
Example #26
class SraToQiime(Command):
    BriefDescription = "post split libraries format: This script allows to convert .sra miRNA sequence data into a QIIME compatible format"
    LongDescription = "A script for converting SRA miRNA sequence data into a format that can be used with QIIME's closed reference OTU picking workflows. THIS CODE IS CURRENTLY UNTESTED. YOU SHOULD NOT USE THIS VERSION OF THE CODE. THIS MESSAGE WILL BE REMOVED WHEN TESTS ARE ADDED."
    CommandIns = ParameterCollection([
        CommandIn(Name='input_dir',
                  DataType=str,
                  Description='directory containing input .sra files',
                  Required=True),
        CommandIn(Name='output_fp',
                  DataType=str,
                  Description='the path where the output fasta file '
                  'should be written',
                  Required=True)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='result', DataType=str, Description='the final result')
    ])

    # sratoolkit and SCHIRMP must be installed by the user so that the
    # sra_dump and fastq_to_fasta tools can be called on the command line
    # from the user's $HOME.

    sra_dump_path = "fastq-dump.2.3.1"
    fastq_to_fasta = "fastq_to_fasta"
    temp_dir = "/tmp/"
    verbose = True

    def run(self, **kwargs):

        input_dir = kwargs['input_dir']
        input_sra_pattern = join(input_dir, '*.sra')
        input_filepaths = glob(input_sra_pattern)
        output_fp = kwargs['output_fp']

        for input_filepath in input_filepaths:
            temp_files_to_remove = []
            temp_dirs_to_remove = []
            input_filename = split(input_filepath)[1]
            input_basename = splitext(input_filename)[0]

            # create and call the sra-dump command
            temp_fastq_dir = join(self.temp_dir, '%s_fastq' % input_basename)
            command = "%s %s -O %s" % (self.sra_dump_path, input_filepath,
                                       temp_fastq_dir)
            if self.verbose:
                print command
            stdout, stderr, ret_val = pyqi_system_call(command)
            temp_dirs_to_remove.append(temp_fastq_dir)
            temp_fastq_fp = glob(join(temp_fastq_dir, '*.fastq'))[0]

            # convert fastq to fasta
            temp_fasta_fp = join(self.temp_dir, '%s.fasta' % input_basename)
            command = "%s %s > %s" % (self.fastq_to_fasta, temp_fastq_fp,
                                      temp_fasta_fp)
            if self.verbose:
                print command
            stdout, stderr, ret_val = pyqi_system_call(command)
            temp_files_to_remove.append(temp_fasta_fp)

            # convert fasta headers to a qiime-compatible format
            command = "sed 's/\./_/g;s/ .*$//g' %s >> %s" % (temp_fasta_fp,
                                                             output_fp)
            if self.verbose:
                print command
            stdout, stderr, ret_val = pyqi_system_call(command)

            # clean up
            # if self.verbose:
            #     print "Removing files: %s" % " ".join(temp_files_to_remove)
            #     print "Removing directories: %s" % " ".join(
            #         temp_dirs_to_remove)
            # remove_files(temp_files_to_remove)
            # for temp_dir_to_remove in temp_dirs_to_remove:
            #     rmtree(temp_dir_to_remove)

        return {'result': output_fp}
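
# Usage sketch (paths are illustrative; the command's own warning about
# untested code applies here as well):
converter = SraToQiime()
result = converter(input_dir='sra_files', output_fp='seqs.fasta')
print result['result']
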
Example #27
class TableValidator(Command):
    BriefDescription = "Validate a BIOM-formatted file"
    LongDescription = ("Test a file for adherence to the Biological "
                       "Observation Matrix (BIOM) format specification. This "
                       "specification is defined at http://biom-format.org")

    CommandIns = ParameterCollection([
        CommandIn(Name='table',
                  DataType=object,
                  Description='the file path to the input BIOM table (it is '
                  'opened with biom_open at runtime)',
                  Required=True),
        CommandIn(Name='is_json',
                  DataType=bool,
                  Description='whether the input table is in the JSON (as '
                  'opposed to HDF5) format',
                  Required=False,
                  Default=False),
        CommandIn(Name='format_version',
                  DataType=str,
                  Description='the specific format version to validate '
                  'against',
                  Required=False,
                  Default='1.0.0'),
        CommandIn(Name='detailed_report',
                  DataType=bool,
                  Description='include more details in the output report',
                  Required=False,
                  Default=False)
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='valid_table',
                   Description='Is the table valid?',
                   DataType=bool),
        CommandOut(Name='report_lines',
                   Description='Detailed report',
                   DataType=list)
    ])

    FormatURL = "http://biom-format.org"
    TableTypes = set([
        'otu table', 'pathway table', 'function table', 'ortholog table',
        'gene table', 'metabolite table', 'taxon table'
    ])
    MatrixTypes = set(['sparse', 'dense'])
    ElementTypes = {'int': int, 'str': str, 'float': float, 'unicode': unicode}
    HDF5FormatVersions = set([(2, 0)])

    def run(self, **kwargs):
        is_json = kwargs['is_json']

        # this is not pyqi-appropriate, but how we parse this thing is
        # dependent on runtime options :(
        with biom_open(kwargs['table']) as f:
            if is_json:
                kwargs['table'] = json.load(f)
                return self._validate_json(**kwargs)
            elif HAVE_H5PY:
                kwargs['table'] = f
                return self._validate_hdf5(**kwargs)
            else:
                raise IOError("h5py is not installed, can only validate JSON "
                              "tables")

    def _validate_hdf5(self, **kwargs):
        table = kwargs['table']

        # Need to make this an attribute so that we have this info during
        # validation.
        detailed_report = kwargs['detailed_report']

        report_lines = []
        valid_table = True

        if detailed_report:
            report_lines.append("Validating BIOM table...")

        required_attrs = [('format-url', self._valid_format_url),
                          ('format-version', self._valid_hdf5_format_version),
                          ('type', self._valid_type),
                          ('shape', self._valid_shape),
                          ('nnz', self._valid_nnz),
                          ('generated-by', self._valid_generated_by),
                          ('id', self._valid_nullable_id),
                          ('creation-date', self._valid_creation_date)]

        required_groups = [
            'observation', 'sample', 'observation/matrix', 'sample/matrix'
        ]

        required_datasets = [
            'observation/ids', 'observation/matrix/data',
            'observation/matrix/indices', 'observation/matrix/indptr',
            'sample/ids', 'sample/matrix/data', 'sample/matrix/indices',
            'sample/matrix/indptr'
        ]

        for required_attr, attr_validator in required_attrs:
            if required_attr not in table.attrs:
                valid_table = False
                report_lines.append("Missing attribute: '%s'" % required_attr)
                continue

            if detailed_report:
                report_lines.append("Validating '%s'..." % required_attr)

            status_msg = attr_validator(table)

            if len(status_msg) > 0:
                valid_table = False
                report_lines.append(status_msg)

        for group in required_groups:
            if group not in table:
                valid_table = False
                if detailed_report:
                    report_lines.append("Missing group: %s" % group)

        for dataset in required_datasets:
            if dataset not in table:
                valid_table = False
                if detailed_report:
                    report_lines.append("Missing dataset: %s" % dataset)

        if 'shape' in table.attrs:
            if detailed_report:
                report_lines.append("Validating 'shape' versus number of "
                                    "samples and observations...")

            n_obs, n_samp = table.attrs['shape']
            obs_ids = table.get('observation/ids', None)
            samp_ids = table.get('sample/ids', None)

            if obs_ids is None:
                valid_table = False
                report_lines.append("observation/ids does not exist, cannot "
                                    "validate shape")
            elif n_obs != len(obs_ids):
                valid_table = False
                report_lines.append("Number of observation IDs is not equal "
                                    "to the described shape")

            if samp_ids is None:
                valid_table = False
                report_lines.append("sample/ids does not exist, cannot "
                                    "validate shape")
            elif n_samp != len(samp_ids):
                valid_table = False
                report_lines.append("Number of sample IDs is not equal "
                                    "to the described shape")

        return {'valid_table': valid_table, 'report_lines': report_lines}

    def _validate_json(self, **kwargs):
        table_json = kwargs['table']

        # Need to make this an attribute so that we have this info during
        # validation.
        self._format_version = kwargs['format_version']
        detailed_report = kwargs['detailed_report']

        report_lines = []
        valid_table = True

        if detailed_report:
            report_lines.append("Validating BIOM table...")

        required_keys = [
            ('format', self._valid_format),
            ('format_url', self._valid_format_url), ('type', self._valid_type),
            ('rows', self._valid_rows), ('columns', self._valid_columns),
            ('shape', self._valid_shape), ('data', self._valid_data),
            ('matrix_type', self._valid_matrix_type),
            ('matrix_element_type', self._valid_matrix_element_type),
            ('generated_by', self._valid_generated_by),
            ('id', self._valid_nullable_id), ('date', self._valid_datetime)
        ]

        for key, method in required_keys:
            if key not in table_json:
                valid_table = False
                report_lines.append("Missing field: '%s'" % key)
                continue

            if detailed_report:
                report_lines.append("Validating '%s'..." % key)

            status_msg = method(table_json)

            if len(status_msg) > 0:
                valid_table = False
                report_lines.append(status_msg)

        if 'shape' in table_json:
            if detailed_report:
                report_lines.append("Validating 'shape' versus number of rows "
                                    "and columns...")

            if ('rows' in table_json
                    and len(table_json['rows']) != table_json['shape'][0]):
                valid_table = False
                report_lines.append("Number of rows in 'rows' is not equal to "
                                    "'shape'")

            if ('columns' in table_json
                    and len(table_json['columns']) != table_json['shape'][1]):
                valid_table = False
                report_lines.append("Number of columns in 'columns' is not "
                                    "equal to 'shape'")

        return {'valid_table': valid_table, 'report_lines': report_lines}

    def _json_or_hdf5_get(self, table, key):
        if hasattr(table, 'attrs'):
            return table.attrs.get(key, None)
        else:
            return table.get(key, None)

    def _json_or_hdf5_key(self, table, key):
        if hasattr(table, 'attrs'):
            return key.replace('_', '-')
        else:
            return key

    def _is_int(self, x):
        """Return True if x is an int"""
        return isinstance(x, int)

    def _valid_nnz(self, table):
        """Check if nnz seems correct"""
        if not isinstance(table.attrs['nnz'], int):
            return "nnz is not an integer!"
        if table.attrs['nnz'] < 0:
            return "nnz is negative!"
        return ''

    def _valid_format_url(self, table):
        """Check if format_url is correct"""
        key = self._json_or_hdf5_key(table, 'format_url')
        value = self._json_or_hdf5_get(table, key)

        if value != self.FormatURL:
            return "Invalid '%s'" % key
        else:
            return ''

    def _valid_shape(self, table):
        """Matrix header is (int, int) representing the size of a 2D matrix"""
        a, b = self._json_or_hdf5_get(table, 'shape')

        if not (self._is_int(a) and self._is_int(b)):
            return "'shape' values do not appear to be integers"
        else:
            return ''

    def _valid_matrix_type(self, table_json):
        """Check if a valid matrix type exists"""
        if table_json['matrix_type'] not in self.MatrixTypes:
            return "Unknown 'matrix_type'"
        else:
            return ''

    def _valid_matrix_element_type(self, table_json):
        """Check if a valid element type exists"""
        if table_json['matrix_element_type'] not in self.ElementTypes:
            return "Unknown 'matrix_element_type'"
        else:
            return ''

    def _check_date(self, val):
        valid_times = [
            "%Y-%m-%d", "%Y-%m-%dT%H:%M", "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%S.%f"
        ]
        valid_time = False
        for fmt in valid_times:
            try:
                datetime.strptime(val, fmt)
                valid_time = True
                break
            except ValueError:
                pass

        if valid_time:
            return ''
        else:
            return "Timestamp does not appear to be ISO 8601"

    def _valid_creation_date(self, table):
        """Verify datetime can be parsed

        Expects ISO 8601 datetime format (for example, 2011-12-19T19:00:00
                                          note that a 'T' separates the date
                                          and time)
        """
        return self._check_date(table.attrs['creation-date'])

    def _valid_datetime(self, table):
        """Verify datetime can be parsed

        Expects ISO 8601 datetime format (for example, 2011-12-19T19:00:00
                                          note that a 'T' separates the date
                                          and time)
        """
        return self._check_date(table['date'])

    def _valid_sparse_data(self, table_json):
        """All index positions must be integers and values are of dtype"""
        dtype = self.ElementTypes[table_json['matrix_element_type']]
        n_rows, n_cols = table_json['shape']
        n_rows -= 1  # adjust for 0-based index
        n_cols -= 1  # adjust for 0-based index

        for idx, coord in enumerate(table_json['data']):
            try:
                x, y, val = coord
            except (TypeError, ValueError):
                return "Bad matrix entry idx %d: %s" % (idx, repr(coord))

            if not self._is_int(x) or not self._is_int(y):
                return "Bad x or y type at idx %d: %s" % (idx, repr(coord))

            if not isinstance(val, dtype):
                return "Bad value at idx %d: %s" % (idx, repr(coord))

            if x < 0 or x > n_rows:
                return "x out of bounds at idx %d: %s" % (idx, repr(coord))

            if y < 0 or y > n_cols:
                return "y out of bounds at idx %d: %s" % (idx, repr(coord))

        return ''

    def _valid_dense_data(self, table_json):
        """All elements must be of dtype and correspond to shape"""
        dtype = self.ElementTypes[table_json['matrix_element_type']]
        n_rows, n_cols = table_json['shape']

        for row in table_json['data']:
            if len(row) != n_cols:
                return "Incorrect number of cols: %s" % repr(row)

            if not reduce(and_, [isinstance(v, dtype) for v in row]):
                return "Bad datatype in row: %s" % repr(row)

        if len(table_json['data']) != n_rows:
            return "Incorrect number of rows in matrix"

        return ''

    def _valid_hdf5_format_version(self, table):
        """Format must be the expected version"""
        ver = table.attrs['format-version']
        if tuple(ver) not in self.HDF5FormatVersions:
            return "Invalid format version '%s'" % str(ver)
        else:
            return ""

    def _valid_format(self, table_json):
        """Format must be the expected version"""
        if table_json['format'] != self._format_version:
            return "Invalid format '%s', must be '%s'" % (table_json['format'],
                                                          self._format_version)
        else:
            return ''

    def _valid_type(self, table):
        """Table must be a known table type"""
        key = self._json_or_hdf5_key(table, 'type')
        value = self._json_or_hdf5_get(table, key)
        if value.lower() not in self.TableTypes:
            return "Unknown BIOM type: %s" % value
        else:
            return ''

    def _valid_generated_by(self, table):
        """Validate the generated_by field"""
        key = self._json_or_hdf5_key(table, 'generated_by')
        value = self._json_or_hdf5_get(table, key)
        if not value:
            return "'generated_by' is not populated"

        return ''

    def _valid_nullable_id(self, table_json):
        """Validate the table id"""
        # this is nullable and don't actually care what is in here
        return ''

    def _valid_id(self, record):
        """Validate id for a row or column"""
        if not record['id']:
            return "'id' in %s appears empty" % record
        else:
            return ''

    def _valid_metadata(self, record):
        """Validate the metadata field for a row or column"""
        # this is nullable and don't actually care what is in here
        if record['metadata'] is None:
            return ''
        if isinstance(record['metadata'], dict):
            return ''

        return "metadata is neither null or an object"

    def _valid_rows(self, table_json):
        """Validate the 'rows' under 'table'"""
        required_keys = [('id', self._valid_id),
                         ('metadata', self._valid_metadata)]
        required_by_type = {}
        required_keys.extend(
            required_by_type.get(table_json['type'].lower(), []))

        for idx, row in enumerate(table_json['rows']):
            for key, method in required_keys:
                if key not in row:
                    return "ROW IDX %d MISSING '%s' FIELD" % (idx, key)

                result = method(row)
                if len(result) > 0:
                    return result
        return ''

    def _valid_columns(self, table_json):
        """Validate the 'columns' under 'table'"""
        required_keys = [('id', self._valid_id),
                         ('metadata', self._valid_metadata)]
        required_by_type = {}
        required_keys.extend(
            required_by_type.get(table_json['type'].lower(), []))

        for idx, col in enumerate(table_json['columns']):
            for key, method in required_keys:
                if key not in col:
                    return "COL IDX %d MISSING '%s' FIELD" % (idx, key)

                result = method(col)
                if len(result) > 0:
                    return result
        return ''

    def _valid_data(self, table_json):
        """Validate the 'matrix' under 'table'"""
        if table_json['matrix_type'].lower() == 'sparse':
            return self._valid_sparse_data(table_json)
        elif table_json['matrix_type'].lower() == 'dense':
            return self._valid_dense_data(table_json)
        else:
            return "Unknown matrix type"
Example #28
class stubby:
    CommandIns = ParameterCollection([pc, bool_param])
    CommandOuts = ParameterCollection([])