def main():
    """
    Command-line entry point: train one or more backoff builder models.

    Expects three positional arguments (model type, model name, input data
    file). The model class is taken from the backoff builder registered for
    the given model type. If C{--partitions} is given, one model is trained
    per item returned by C{holdout_partition}, each named using the
    builder's C{partition_model_name}; otherwise a single model is trained
    on the whole input. Every trained model is saved and its name printed.

    Exits with status 1 if fewer than three arguments are given, and with
    status 0 after printing the option help for C{--opts help}.
    """
    parser = OptionParser(
        usage="%prog [options] <model-type> <model_name> <in-file>",
        description="Trains a backoff builder model using the given "
                    "input data. Specify a model type (ngram, etc) and a name to "
                    "identify it. The data file should be a stored SequenceIndex file.")
    parser.add_option(
        '-p', '--partitions', dest="partitions", action="store", type="int",
        help="train a number of partitions of the given data. Trains a model "
             "on the complement of each partition, so it can be tested on the "
             "partition. The models will be named <NAME>n, where <NAME> is the "
             "model name and n the partition number.")
    parser.add_option(
        '--opts', dest="training_opts", action="store",
        help="options to pass to the model trainer. Type '--opts help' for a "
             "list of options for a particular model type.")
    # Options describing the input file
    parser.add_option(
        "--filetype", "--ft", dest="filetype", action="store",
        help="select the file type for the input file. Same filetypes as "
             "jazzparser",
        default='bulk-db')
    parser.add_option(
        "--file-options", "--fopt", dest="file_options", action="store",
        help="options for the input file. Type '--fopt help', using "
             "'--ft <type>' to select file type, for a list of available "
             "options.")
    options, arguments = parse_args_with_config(parser)

    # All three positional arguments are required
    if len(arguments) < 3:
        sys.stderr.write("You must specify a model type, a model name and "
                         "an input data file as arguments\n")
        sys.exit(1)
    model_type, model_name, data_path = arguments[:3]
    filename = os.path.abspath(data_path)

    # The builder class tells us which model class to train
    builder_cls = get_backoff_builder(model_type)
    model_cls = builder_cls.MODEL_CLASS

    # Read the sequence data: only the db bulk types are accepted here
    input_data = command_line_input(
        filename=filename,
        filetype=options.filetype,
        options=options.file_options,
        allowed_types=['bulk-db', 'bulk-db-annotated'])

    # Interpret the training options string, if there is one
    opt_string = options.training_opts
    if opt_string is None:
        training_opts = {}
    elif opt_string.lower() == "help":
        print(options_help_text(model_cls.TRAINING_OPTIONS,
                                intro="Training options for %s" % model_cls.__name__))
        sys.exit(0)
    else:
        parsed_opts = ModuleOption.process_option_string(opt_string)
        training_opts = ModuleOption.process_option_dict(
            parsed_opts, model_cls.TRAINING_OPTIONS)

    # Work out the (model name, training data) pairs to train
    if options.partitions is None:
        models = [(model_name, input_data)]
    else:
        parts = holdout_partition(input_data, options.partitions)
        models = []
        for num, seqs in enumerate(parts):
            models.append(
                (builder_cls.partition_model_name(model_name, num), seqs))

    for part_name, seqs in models:
        # A fresh model per name: train it on its data, then store it
        model = model_cls(part_name, options=training_opts)
        model.train(seqs)
        model.save()
        print("Trained model %s" % (part_name))
def check_options(cls, options):
    """
    Validate a tagger options dictionary ahead of instantiation.

    Options are normally validated when the tagger is instantiated; this
    gives access to the same check beforehand.

    @return: the result of L{ModuleOption.process_option_dict} applied to
        C{options} and the class's C{TAGGER_OPTIONS}.

    """
    option_specs = cls.TAGGER_OPTIONS
    return ModuleOption.process_option_dict(options, option_specs)
def main():
    """
    Command-line entry point: parse the script's options and validate the
    gold standard parser options.

    C{--popt} may be given several times; the values are joined with ":"
    before being processed. If any value is "help", the available
    L{DirectedCkyParser} options are printed and the script exits with
    status 0. Invalid parser options are logged as an error and the script
    exits with status 1.
    """
    parser = OptionParser(
        usage="%prog [options] <song-set> <results-file0> [<results-file1> ...]")
    parser.add_option(
        "--popt", "--parser-options", dest="popts", action="append",
        help="specify options for the parser that interprets the gold "
             "standard annotations. Type '--popt help' to get a list of "
             "options (we use a DirectedCkyParser)")
    parser.add_option(
        "-m", "--metric", dest="metric", action="store",
        help="semantics distance metric to use. Use '-m help' for a list of "
             "available metrics")
    parser.add_option(
        "--mopt", "--metric-options", dest="mopts", action="append",
        help="options to pass to the semantics metric. Use with '--mopt help' "
             "with -m to see available options")
    parser.add_option(
        "-r", "--print-results", dest="print_results", action="store",
        default=5, type="int",
        help="number of top search results to print for each query (parse "
             "result). Default: 5. Use -1 to print distances from all songs "
             "in the corpus")
    parser.add_option(
        "-g", "--gold-only", dest="gold_only", action="store_true",
        help="skip results that have no gold standard sequence associated "
             "with them (we can't tell which is the right answer for these)")
    parser.add_option(
        "--mc", "--metric-computation", dest="metric_computation",
        action="store_true",
        help="output the computation information for the metric between the "
             "parse result and each top search result")
    options, arguments = parser.parse_args()

    # The music_halfspan formalism is hard-wired for this script. Making it
    # generic would just mean choosing the formalism from a command-line
    # option instead
    formalism = Formalism

    # Turn the list of --popt values into a single option string
    popt_values = options.popts
    if popt_values is None:
        poptstr = ""
    else:
        if any(val.strip().lower() == "help" for val in popt_values):
            # Show the help for this parser's options and stop
            print(options_help_text(
                DirectedCkyParser.PARSER_OPTIONS,
                intro="Available options for gold standard interpreter"))
            sys.exit(0)
        poptstr = ":".join(popt_values)
    popts = ModuleOption.process_option_string(poptstr)

    # Make sure the parser will accept these options
    try:
        DirectedCkyParser.check_options(popts)
    except ModuleOptionError as err:
        logger.error("Problem with parser options (--popt): %s" % err)
        sys.exit(1)
def process_option_dict(cls, options):
    """
    Check and process a dictionary of option values against the class's
    C{OPTIONS}.

    @return: the processed dictionary, as returned by
        L{ModuleOption.process_option_dict}.

    """
    option_specs = cls.OPTIONS
    processed = ModuleOption.process_option_dict(options, option_specs)
    return processed
def main():
    """
    Command-line entry point: parse the script's options and validate the
    gold standard parser options.

    C{--popt} may be given several times; the values are joined with ":"
    before being processed. If any value is "help", the available
    L{DirectedCkyParser} options are printed and the script exits with
    status 0. Invalid parser options are logged as an error and the script
    exits with status 1.
    """
    usage = "%prog [options] <chord-corpus-file> <chord-labeling-model> <midi-file>"
    description = "Like findsong, but searches by chord label sequence "\
        "similarity. The input is not a results file, but a midi file, or "\
        "a midi bulk input (CSV)."
    # The description was previously built but never handed to the option
    # parser, so it did not appear in the --help output
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--popt", "--parser-options", dest="popts", action="append", help="specify options for the parser that interprets the gold standard annotations. Type '--popt help' to get a list of options (we use a DirectedCkyParser)")
    parser.add_option("-r", "--print-results", dest="print_results", action="store", default=5, type="int", help="number of top search results to print for each query (parse result). Default: 5. Use -1 to print distances from all songs in the corpus")
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", default="bulk-segmidi", help="filetype to read in. Use 'segmidi' to read a single midi file, or 'bulk-segmidi' (default) to read many from a CSV")
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    parser.add_option("--labeler-options", "--lopt", dest="labeler_options", action="store", help="options for the labeler. Type '--lopt help' for a list of available options.")
    parser.add_option("-g", "--gold-only", dest="gold_only", action="store_true", help="skip results that have no gold standard sequence associated with them (we can't tell which is the right answer for these)")
    parser.add_option("--align", "--print-alignment", dest="print_alignment", action="store_true", help="print out the full alignment between the labeling and the top match")
    options, arguments = parser.parse_args()

    # Process parser options
    if options.popts is not None:
        poptstr = options.popts
        if "help" in [s.strip().lower() for s in poptstr]:
            # Output this parser's option help
            print(options_help_text(DirectedCkyParser.PARSER_OPTIONS, intro="Available options for gold standard interpreter"))
            sys.exit(0)
        # --popt uses action="append": merge the values into one string
        poptstr = ":".join(poptstr)
    else:
        poptstr = ""
    popts = ModuleOption.process_option_string(poptstr)
    # Check that the options are valid
    try:
        DirectedCkyParser.check_options(popts)
    except ModuleOptionError as err:
        logger.error("Problem with parser options (--popt): %s" % err)
        sys.exit(1)
class ModelBackoffBuilder(BackoffBuilder):
    """
    L{BackoffBuilder} subclass that takes care of loading a trained model.

    Subclasses must set C{MODEL_CLASS}. On instantiation the model named by
    the C{model} option (combined with the C{partition} option, if given)
    is loaded through C{MODEL_CLASS.load_model} and stored in C{self.model}.

    """
    # Subclasses are expected to override this
    MODEL_CLASS = None
    BUILDER_OPTIONS = BackoffBuilder.BUILDER_OPTIONS + [
        ModuleOption(
            'model',
            filter=str,
            help_text="Model name. This model must have been previously "
                      "trained. Required",
            usage="model=X, where X is the name of a trained model",
            required=True),
        ModuleOption(
            'partition',
            filter=int,
            help_text="If given, the numbered partition of the partitioned "
                      "model will be used. (This generally involves appending the "
                      "partition number to the model name.)",
            usage="partition=P, where P is an int",
            default=None),
    ]

    def __init__(self, *args, **kwargs):
        BackoffBuilder.__init__(self, *args, **kwargs)
        builder_cls = type(self)
        # Refuse to go on if the subclass never set a model class
        if builder_cls.MODEL_CLASS is None:
            raise NotImplementedError(
                "BackoffBuilder subclass %s does not define a model class"
                % builder_cls.__name__)

        partition = self.options['partition']
        if partition is None:
            self.model_name = self.options['model']
        else:
            self.model_name = builder_cls.partition_model_name(
                self.options['model'], partition)
        self.logger.info("Backoff model: %s" % self.model_name)

        # Load the trained model data through the model class
        self.model = builder_cls.MODEL_CLASS.load_model(self.model_name)

    @staticmethod
    def partition_model_name(model_name, partition_number):
        """
        Name of the model to use for a given partition number.

        By default the number is simply appended to the model name.
        Subclasses may override this to use a different naming scheme.

        """
        return "%s%d" % (model_name, partition_number)
def __init__(self, grammar, tagger, options=None, backoff=None,
             backoff_options=None, logger=None):
    """
    @param grammar: the L{jazzparser.grammar.Grammar} instance to use
        for parsing
    @param tagger: the L{jazzparser.taggers.tagger.Tagger} subclass
        instance to use to tag the input
    @type options: dict
    @param options: parser-specific options, validated using the class's
        C{check_options}. Defaults to an empty dict.
    @param backoff: an optional
        L{jazzparser.backoff.base.BackoffBuilder} class to use as a
        fallback if the parser returns no parses. Whether this is used
        and in what circumstances depends on the type of parser.
    @type backoff_options: dict
    @param backoff_options: dictionary of options to pass to the backoff
        model if it gets used. Defaults to an empty dict.
    @type logger: C{logging.Logger}
    @param logger: a logger to which all progress information during
        parsing will be written. By default, outputs to stderr.

    """
    # Create the default dicts per call. A literal {} default would be one
    # shared object stored on every instance as self.backoff_options
    if options is None:
        options = {}
    if backoff_options is None:
        backoff_options = {}

    self.grammar = grammar
    self.tagger = tagger
    self.backoff_options = backoff_options
    if backoff is not None:
        # Look up the backoff model if one is requested
        self.backoff = backoff
        # Pre-check the options dict
        # This will be done again by the module when instantiated, but
        #  we do it now to verify the options
        ModuleOption.process_option_dict(backoff_options,
                                         backoff.BUILDER_OPTIONS)
    else:
        self.backoff = None

    # Initialize using parser-specific options
    self.options = type(self).check_options(options)

    if logger is None:
        # Output to stderr instead
        self.logger = create_plain_stderr_logger()
    else:
        self.logger = logger

    # Flag intended to record whether a parse timed out; unset initially
    self.timed_out = False
def check_options(cls, options):
    """
    Validate a parser options dictionary ahead of instantiation.

    The options dictionary is normally checked when the parser is
    instantiated; this method gives access to the same check before that
    point.

    @return: the result of L{ModuleOption.process_option_dict} applied to
        C{options} and the class's C{PARSER_OPTIONS}.

    """
    option_specs = cls.PARSER_OPTIONS
    return ModuleOption.process_option_dict(options, option_specs)
def process_training_options(self):
    """
    Check and process the training option values.

    Runs C{self._options_dict} through L{ModuleOption.process_option_dict}
    using C{self.TRAINING_OPTIONS} and stores the result in
    C{self._options}. The processed values are meant to be read through
    C{self.options}.

    """
    raw_options = self._options_dict
    option_specs = self.TRAINING_OPTIONS
    self._options = ModuleOption.process_option_dict(raw_options, option_specs)
def get_chord_mapping_module_option(name="chord_mapping"):
    """
    Build a L{ModuleOption} for choosing a chord type mapping.

    @type name: str
    @param name: name to give the option (also shown in its usage text)
    @return: a L{ModuleOption} whose filter picks from C{NAMED_MAPPINGS} and
        whose default is the result of C{get_chord_mapping()}.

    """
    mapping_filter = choose_from_dict(NAMED_MAPPINGS)
    available = ", ".join(MAPPINGS)
    usage_text = "%s=M, where M is one of %s. Default: %s" % (
        name, available, DEFAULT_MAPPING)
    return ModuleOption(
        name,
        filter=mapping_filter,
        help_text="Choose a mapping to apply to chord types to reduce the "
                  "chord vocabulary",
        usage=usage_text,
        default=get_chord_mapping())
def process_training_options(self):
    """
    Check and process the training option values.

    Runs C{self._options_dict} through L{ModuleOption.process_option_dict}
    using C{self.TRAINING_OPTIONS} and stores the result in
    C{self._options}. The processed values are meant to be read through
    C{self.options}.

    """
    raw_options = self._options_dict
    option_specs = self.TRAINING_OPTIONS
    self._options = ModuleOption.process_option_dict(raw_options, option_specs)
def __init__(self, model, options={}):
    """
    Set up a trainer for the given model.

    @param model: the model to train. Its exact type must be listed in
        C{self.MODEL_TYPES}.
    @type options: dict
    @param options: trainer options, processed against C{self.OPTIONS}
    @raise RaphstoHmmParameterError: if the model's type is not one this
        trainer can train.

    """
    model_cls = type(model)
    self.model = model
    # Only models of the listed types can be trained by this trainer
    if model_cls not in self.MODEL_TYPES:
        raise RaphstoHmmParameterError(
            "trainer %s cannot train a model of type %s"
            % (type(self).__name__, model_cls.__name__))
    self.options = ModuleOption.process_option_dict(options, self.OPTIONS)
    self.model_cls = model_cls
def __init__(cls, name, bases, dict):
    """
    Register default output options for a newly created formalism class.

    Nothing is done for the class named "FormalismBase". For every other
    class, the defaults of C{cls.output_options} are processed and stored
    in C{settings.OPTIONS.OUTPUT} under the formalism's name, so they are
    available even if no script ever sets them.

    """
    # The base class itself has nothing to register
    if name == "FormalismBase":
        return
    formalism_name = cls.get_name()
    # Processing an empty dict yields the options' default values
    default_opts = ModuleOption.process_option_dict({}, cls.output_options)
    # Keep them where the whole formalism can get at them
    settings.OPTIONS.OUTPUT[formalism_name] = default_opts
def process_output_options(cls, optdict):
    """
    Process a dictionary of output options and make the result globally
    available.

    The processed options are stored in C{settings.OPTIONS.OUTPUT} under
    the formalism's name.

    @see: L{output_options}.

    """
    formalism_name = cls.get_name()
    processed = ModuleOption.process_option_dict(optdict, cls.output_options)
    settings.OPTIONS.OUTPUT[formalism_name] = processed
class MidiTaggerTrainingBulkInput(SegmentedMidiBulkInput):
    """
    Subclass of L{SegmentedMidiBulkInput} for taking training input for
    midi supertaggers. This is identical to L{SegmentedMidiBulkInput}, but
    has an additional option C{chords} to specify a path from which to read
    a L{AnnotatedDbBulkInput}. This may be used by the training procedure to
    initialize or train parameters, in addition to the main midi training
    input.

    Accepts additionally all options accepted by L{AnnotatedDbBulkInput}.
    When C{chords} is given, these are passed on to
    L{AnnotatedDbBulkInput.from_file} when the chord data is read in.

    """
    FILE_INPUT_OPTIONS = \
        SegmentedMidiBulkInput.FILE_INPUT_OPTIONS + \
        [ModuleOption('chords',
            help_text="path from which to read a bulk-db input, "\
                "which may be used in addition to the midi training "\
                "data by the training procedure",
            usage="chords=F, where F is an filename")] + \
        AnnotatedDbBulkInput.FILE_INPUT_OPTIONS

    def __init__(self, inputs, chords=None):
        """
        @param inputs: the midi training inputs
        @param chords: optional chord data read from the C{chords} file
            (C{None} if none was given)
        """
        self.inputs = inputs
        self.chords = chords

    @staticmethod
    def from_file(filename, options=None):
        """
        Read the midi training data from C{filename} and, if the C{chords}
        option is set, the additional chord data from that path.

        The caller's C{options} dictionary is left untouched: the work is
        done on a copy.

        @type options: dict
        @param options: file input options. Defaults to no options.
        @rtype: L{MidiTaggerTrainingBulkInput}

        """
        # Work on a private copy. The db options get popped out below, and
        # that must not alter the dict the caller handed in
        options = dict(options) if options is not None else {}

        if options.get('chords') is not None:
            # Read in the AnnotatedDbBulkInput from this file
            # Take AnnotatedDbBulkInput's options out of the option dict
            dboptions = {}
            for dbopt in AnnotatedDbBulkInput.FILE_INPUT_OPTIONS:
                if dbopt.name in options:
                    dboptions[dbopt.name] = options.pop(dbopt.name)
            chords = AnnotatedDbBulkInput.from_file(options['chords'],
                                                    options=dboptions)
        else:
            chords = None

        # Read the main midi data just as SegmentedMidiBulkInput does
        main_data = SegmentedMidiBulkInput.from_file(filename, options)
        return MidiTaggerTrainingBulkInput(main_data.inputs, chords=chords)

    def subset(self, *ranges):
        """
        Build a new input containing only the given C{(start, end)} slices
        of the inputs, concatenated in the order given. The chord data is
        carried over unchanged.

        """
        # Custom implementation so subsets get the chord input
        return MidiTaggerTrainingBulkInput(
            sum([self.inputs[start:end] for (start, end) in ranges], []),
            chords=self.chords)
def cl_output_options(cls, string):
    """
    Set the output options from a command-line option string.

    Convenience for command-line scripts, which should be the only users of
    this method. If the string is "help" (in any case), the available
    output options are printed and the process exits with status 0.
    Otherwise the string is parsed and handed to
    L{process_output_options}.

    """
    wants_help = string is not None and string.lower() == "help"
    if wants_help:
        print("Available output options")
        print("========================")
        print(options_help_text(cls.output_options))
        sys.exit(0)
    parsed_options = ModuleOption.process_option_string(string)
    cls.process_output_options(parsed_options)
class SongSelfSimilarityTool(Tool):
    """
    Experimental tool that compares a song with every song in the loaded
    song set and prints the local and global distances.

    """
    name = "Self similarity"
    commands = ['selfsim']
    usage = ('selfsim <song-num>', "")
    help = ""
    tool_options = Tool.tool_options + [
        ModuleOption(
            'local',
            filter=str_to_bool,
            usage="local=B, where B is true or false",
            default=False,
            help_text="Sort results by local alignment score, not global"),
    ]

    def run(self, args, state):
        """
        Compare the song numbered C{args[0]} with each song in the
        "songset" data of C{state} and print the results, sorted by local
        distance if the C{local} option is set, otherwise by global
        distance.

        """
        from jazzparser.formalisms.music_halfspan.evaluation import \
            tonal_space_local_alignment, tonal_space_distance

        name, song = get_song(int(args[0]), state)
        songset = state.get_data("songset")

        # One (name, local distance, global distance) row per corpus song
        rows = []
        for other_name, other_song in songset.analyses:
            # Only the distance is needed from the local alignment
            _ops, _steps1, _steps2, local_distance = \
                tonal_space_local_alignment(other_song.lf, song.lf)
            global_distance = tonal_space_distance(other_song.lf, song.lf)
            rows.append((other_name, local_distance, global_distance))

        # Column 1 holds the local distance, column 2 the global one
        sort_column = 1 if self.options['local'] else 2
        rows.sort(key=lambda row: row[sort_column])

        print("Aligned %s with:" % name)
        for row in rows:
            print("%s: local: %s, global: %s" % row)
class SongDependencyGraphTool(Tool):
    """
    Debugging tool that prints the dependency graph built from the
    semantics of a corpus song, or of a parse result if the C{res} option
    is set.

    """
    name = "Song dependency graph"
    commands = ['depgraph', 'dep']
    usage = ('depgraph <song-num>',
             "converts the semantics of the song to a dependency graph "
             "representation")
    tool_options = Tool.tool_options + [
        ModuleOption(
            'res',
            filter=str_to_bool,
            usage="res=B, where B is true or false",
            default=False,
            help_text="Show a result, instead of a corpus song"),
    ]
    help = """\
Converts the semantics of the numbered song to its tree representation that
will be used for comparison to other logical forms. This is mainly for
debugging and has no use in itself.
"""

    def run(self, args, state):
        """
        Print the semantics selected by C{args[0]} followed by its
        dependency graph. The number indexes C{state.results} when the
        C{res} option is set, and the corpus songs otherwise.

        """
        from jazzparser.formalisms.music_halfspan.harmstruct import \
            semantics_to_dependency_graph

        use_result = self.options['res']
        number = int(args[0])
        if use_result:
            song = state.results[number].semantics
            print("Dependency graph for result %d\n" % number)
        else:
            name, song = get_song(number, state)
            print("Dependency graph for '%s'\n" % name)

        print("Semantics:")
        print(song)
        print("")
        # The second returned value is not needed here
        graph, _times = semantics_to_dependency_graph(song)
        print(graph)
class CkyParser(Parser):
    """
    CkyParser is the central class for the jazz chord sequence
    recogniser parsing mechanism.
    It constitutes the "algorithm" module of the system.
    It begins with a set of signs assigned to the input by the
    tagger and parses to produce a chart, from which the resultant
    signs can be extracted.

    """
    shell_tools = [
        ChartTool(),
        InteractiveChartTool(),
    ]
    PARSER_OPTIONS = Parser.PARSER_OPTIONS + [
        ModuleOption('max_iter', filter=int,
            help_text="Maximum number of parser iterations to perform "\
                "before giving up. If 0 or unspecified, continues "\
                "until parse is complete.",
            usage="max_iter=X, where X is an integer.",
            default=0,
        ),
        ModuleOption('min_iter', filter=int,
            help_text="Usually, the parser will stop as soon as it finds a "\
                "full parse. Use min_iter to make it continue parsing until "\
                "it has done min_iter iterations or the tagger has ceased to "\
                "return any categories. Use -1 to keep going until the tagger "\
                "gives no more categories.",
            usage="min_iter=X, where X is an integer.",
            default=0,
        ),
        ModuleOption('parses', filter=int,
            help_text="Number of parses to require before we terminate. "\
                "Default is 1: the parser will terminate as soon as it finds "\
                "at least one full parse (unless another option, like "\
                "min_iter, prevents it",
            usage="parses=X, where X is an integer",
            default=1,
        ),
        ModuleOption('timeout', filter=int,
            help_text="Maximum time allowed for the main parse loop, in "\
                "minutes. If this is exceded, the backoff will kick "\
                "in, if one is specified. Otherwise, no results will be "\
                "returned. The parser will not stop as soon as the timeout "\
                "expires, but after finishing processing the current input "\
                "word. 0 (default) imposes no timeout.",
            # The value is in minutes (parse() multiplies it by 60), so the
            #  usage text must not claim seconds
            usage="timeout=X, where X is an integer number of minutes.",
            default=0,
        ),
        ModuleOption('inspect', filter=str_to_bool,
            help_text="If true, the graphical chart inspector will be "\
                "displayed during parsing.",
            usage="inspect=X, where X is a boolean value.",
            default=False
        ),
        ModuleOption('inspect_persist', filter=str_to_bool,
            help_text="Makes the chart inspector window persist after parsing "\
                "is completed. By default, it will be killed",
            usage="inspect_persist=X, where X is a boolean value.",
            default=False
        ),
        ModuleOption('dump_chart', filter=new_file_option,
            help_text="A file to dump the chart state to during parsing. "\
                "The first dump will be when the chart is created and "\
                "new dumps will be made throughout the parse.",
            usage="dump_chart=X, where X is a filename."
        ),
        ModuleOption('derivations', filter=str_to_bool,
            help_text="Store derivation traces along with the results",
            usage="derivations=X, where X is a boolean value",
            default=None,
        ),
    ]

    def _create_chart(self, *args, **kwargs):
        """
        Create the chart for this parse from the parser's grammar, store it
        in C{self.chart} and return it. All arguments are passed on to
        L{Chart}.

        """
        self.chart = Chart(self.grammar, *args, **kwargs)
        return self.chart

    def _add_signs(self, offset=0, prob_adder=None):
        """
        Adds new signs to the chart from the supertagger, using the given
        offset when requesting them from the tagger.

        @type offset: int
        @param offset: offset to pass to the tagger's C{get_signs}
        @param prob_adder: optional callable, called for each sign as
            C{prob_adder(start, end, signtup, word_list)} before the sign
            is added to the chart
        @rtype: list of tuples
        @return: all the signs that were actually added. Each is represented
            by a tuple (start_node, end_node, sign)

        """
        signs = self.tagger.get_signs(offset)
        words = self.tagger.get_string_input()
        if not signs:
            # The tagger has nothing (more) to offer
            return []
        # Add each new sign to the chart
        added = []
        for (start, end, signtup) in signs:
            word_list = words[start:end]
            word = " ".join(word_list)
            # Each sign comes as a (category, tag, probability) triple
            cat, _tag, _prob = signtup

            if prob_adder is not None:
                prob_adder(start, end, signtup, word_list)

            # Add the signs to the chart
            newadd = self.chart.add_word_signs([cat], start, word,
                                               end_node=end)
            # Keep a record of those that got added
            if newadd:
                added.append((start, end, signtup))
        return added

    def parse(self, derivations=False, summaries=False, inspect=False):
        """
        Run the parser on the input, using the specified tagger. Runs
        the CKY parsing algorithm to do chart parsing. For details of
        chart parsing, see Chart class.

        If the parser was given a maximum number of iterations, the
        routine will return as usual after this number is completed,
        even if no parses have been found.

        If the timeout expires, C{self.timed_out} is set to True. If no
        parses were found and a backoff model was given, the results of
        C{self.run_backoff()} are wrapped in signs with a dummy category and
        returned instead.

        @type derivations: bool
        @param derivations: store derivation traces, which
            can subsequently be used to trace all the derivations that
            led to any given sign in the chart. Overridden by the module
            option if it's given
        @type summaries: int/bool
        @param summaries: output chart summary information to stderr during
            parsing to track progress. Set to 2 to output some info,
            but not the full chart.
        @type inspect: bool
        @param inspect: launch a graphical chart inspector during the
            parse to display interactive chart information.

        @return: a list of signs that span the full input.

        """
        if 'derivations' in self.options and \
                self.options['derivations'] is not None:
            derivations = self.options['derivations']

        # Time execution if we're showing any summaries
        timing = bool(summaries)
        # Find out from the tagger how long the input it read in was
        input_length = self.tagger.input_length
        # Create and initialise a chart for parsing
        # Don't initialise the chart with signs - we'll add signs gradually
        #  instead
        chart = self._create_chart(
                        [[]] * input_length,
                        derivations=derivations)

        # Launch a chart inspector if requested
        if self.options['inspect'] or inspect:
            # Get a string form of the input to display
            input_strs = self.tagger.get_string_input()
            chart.launch_inspector(input=input_strs)
        # Start dumping the chart if requested
        if self.options['dump_chart']:
            # Make the first dump of the empty chart
            from .chart import dump_chart
            dump_chart(chart, self.options['dump_chart'])

        # Stop after a given number of iterations (0 means no limit)
        if self.options['max_iter'] == 0:
            max_iter = None
        else:
            max_iter = self.options['max_iter']

        if self.options['min_iter'] == -1:
            # Special case: never stop until we've got all the categories
            min_iter = None
        else:
            min_iter = self.options['min_iter']

        required_parses = self.options['parses']

        # The option is given in minutes; it is compared below against
        #  timeout_timer.get_time() (presumably seconds - TODO confirm)
        timeout = 60 * self.options['timeout']
        check_timeout = timeout > 0
        # Make sure the timed out flag is unset to start with
        self.timed_out = False

        # This is where progress output will go
        # Note that it's not the same as logger, which is the main system
        #  logger
        prog_logger = self.logger

        if check_timeout:
            prog_logger.info("Due to timeout after %d mins" %
                             self.options['timeout'])

        ##################################################
        ### Here is the parser itself.
        # Keep track of how long since we started for timing out
        timeout_timer = ExecutionTimer(clock=True)

        offset = 0

        try:
            # Keep adding signs until none left, or we get a full parse,
            #  or we complete the maximum iterations allowed
            # Keep going if min_iter is None (special value meaning don't
            #  stop when we get a parse)
            while (min_iter is None or (offset < min_iter) \
                        or len(chart.parses) < required_parses):
                if max_iter is not None and offset >= max_iter:
                    # Exceeded maximum number of iterations: give up
                    prog_logger.info("Reached maximum number of iterations: "\
                        "continuing to backoff/fail")
                    break
                prog_logger.info(">>> Parsing iteration: %d" % (offset + 1))
                # Get new signs from the tagger
                added = self._add_signs(offset=offset)
                # Note whether we added anything new
                if added:
                    # Apply unary rules to these new signs
                    added_spans = set([(start, end)
                                       for (start, end, sign) in added])
                    for (start, end) in added_spans:
                        chart.apply_unary_rules(start, end)
                else:
                    # No new signs added by the tagger: no point in
                    #  continuing
                    prog_logger.info("No new signs added: ending parse")
                    break

                ##### Main parser loop: produce all possible results
                # Set end point to each node
                for end in range(1, input_length + 1):
                    if timing:
                        # Start a timer
                        timer = ExecutionTimer()
                    chart.apply_unary_rules(end - 1, end)

                    # Set start point to each node before the end, in
                    #  reverse order
                    for start in range(end - 2, -1, -1):
                        for middle in range(start + 1, end):
                            chart.apply_binary_rules(start, middle, end)

                            # Check whether the timeout has expired and
                            #  don't process any more if it has
                            if check_timeout:
                                # Check whether the timeout has passed
                                if int(timeout_timer.get_time()) > timeout:
                                    # Move on to post-parse stuff
                                    raise ParserTimeout

                        # Check for new unary rule applications
                        chart.apply_unary_rules(start, end)

                    if summaries:
                        prog_logger.info(
                            "Completed parsing up to node %d / %d (%.2f secs)" %
                            (end, input_length, timer.get_time()))
                        if summaries != 2:
                            prog_logger.info(chart.summary)
                    if self.options['dump_chart']:
                        # Dump an update of the chart to the file
                        dump_chart(chart, self.options['dump_chart'])

                if summaries:
                    prog_logger.info("Completed parsing to end of sequence")
                    if summaries != 2:
                        prog_logger.info(chart.summary)

                offset += 1
        except ParserTimeout:
            # The given timeout elapsed: just continue with no parses
            prog_logger.info("Parse timeout (%d mins) expired: continuing "\
                            "to backoff/fail" % self.options['timeout'])
            # Set the timed_out flag so we can check later whether we timed
            #  out
            self.timed_out = True
        except KeyboardInterrupt:
            # We pass the interrupt on to a higher level, but first kill
            #  the inspector window, so it doesn't hang around and mess up
            self.chart.kill_inspector()
            raise

        parses = chart.parses
        if len(parses) == 0 and self.backoff is not None:
            prog_logger.info("Using backoff model")
            backoff_results = self.run_backoff()
            if len(backoff_results) > 0:
                for res in backoff_results:
                    # Put the semantics result into a sign, with a dummy
                    #  syntactic category
                    sign = self.grammar.formalism.Syntax.Sign(
                                self.grammar.formalism.Syntax.DummyCategory(),
                                res)
                    # If the semantics has a probability, put this on the
                    #  sign
                    if hasattr(res, "probability"):
                        sign.probability = res.probability
                    parses.append(sign)
        elif len(parses):
            prog_logger.info("Parse finished with %d results" % len(parses))
        else:
            prog_logger.info("Parse finished with no results")

        # Close the inspector window if one was opened
        if not self.options['inspect_persist']:
            self.chart.kill_inspector()

        return parses
def main():
    """
    Command-line entry point: train and save supertagger models.

    Expects three positional arguments (model type, model name, input data
    file). The model type must be in C{TRAINABLE_MODELS} and C{TAGGERS}, and
    its tagger class must be a L{ModelTagger} subclass; the model class is
    taken from the tagger's C{MODEL_CLASS}. If C{--partitions} is greater
    than 1, one model is trained per partition, named using the tagger's
    C{partition_model_name}; otherwise a single model is trained on the
    whole input. With C{--log}, each model's training is logged to
    C{<log><modelname>.log}.

    Exits with status 1 on any argument error and with status 0 after
    printing the option help for C{--opts help} (which only needs the model
    type argument).
    """
    usage = "%prog [options] <model-type> <model_name> <in-file>"
    description = "Trains a supertagging model using the given "\
        "input data. Specify a model type (baseline1, etc) and a name to "\
        "identify it. The data file may be a stored SequenceIndex file, or "\
        "any other type of bulk data file. "\
        "This can only be used with the follow types of models: %s" % ", ".join(TRAINABLE_MODELS)
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='bulk-db')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Logging output
    parser.add_option('--log', dest="log", action="store", help="file to output training logs to. Specify a base filename; <modelname>.log will be added to the end")
    options, arguments = parse_args_with_config(parser)

    # NOTE(review): the result is not used below; kept in case constructing
    #  the grammar has side effects the rest of the script relies on
    grammar = Grammar()

    # Get the model type first: we might not need the other args
    if len(arguments) == 0:
        sys.stderr.write("You must specify a model type, a model name and an input data file as arguments\n")
        # Without this exit, the lookup of arguments[0] below raised an
        #  IndexError straight after the message
        sys.exit(1)
    model_type = arguments[0]

    if model_type not in TRAINABLE_MODELS:
        sys.stderr.write("'%s' is not a valid model type. Available taggers are: %s\n" %
                         (model_type, ", ".join(TRAINABLE_MODELS)))
        sys.exit(1)
    if model_type not in TAGGERS:
        sys.stderr.write("'%s' isn't a registered model type. Check that "
                         "the name in TRAINABLE_MODELS is correct\n" % model_type)
        sys.exit(1)

    tagger_cls = get_tagger(model_type)
    if not issubclass(tagger_cls, ModelTagger):
        sys.stderr.write("'%s' tagger cannot be trained with this script. Only model taggers can be.\n" % (tagger_cls.__name__))
        sys.exit(1)
    model_cls = tagger_cls.MODEL_CLASS

    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif options.training_opts.lower() == "help":
        print(options_help_text(model_cls.TRAINING_OPTIONS, intro="Training options for %s" % model_cls.__name__))
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_dict(
                            ModuleOption.process_option_string(options.training_opts),
                            model_cls.TRAINING_OPTIONS)

    # Get the rest of the args
    if len(arguments) < 3:
        sys.stderr.write("You must specify a model type, a model name and an input data file as arguments\n")
        sys.exit(1)
    filename = os.path.abspath(arguments[2])
    model_name = arguments[1]

    # Load the sequence data
    # Only allow bulk types
    input_data = command_line_input(filename=filename,
                                    filetype=options.filetype,
                                    options=options.file_options,
                                    allowed_types=get_input_type_names(single=False, bulk=True))

    if options.partitions is not None and options.partitions > 1:
        # The training sets are the second item returned by get_partitions
        parts = input_data.get_partitions(options.partitions)[1]
        models = [(tagger_cls.partition_model_name(model_name, num), seqs)
                  for num, seqs in enumerate(parts)]
    else:
        models = [(model_name, input_data)]

    for part_name, seqs in models:
        # Instantiate a fresh model with this name
        model = model_cls(part_name, options=training_opts)
        if options.log is not None:
            # Prepare a logger
            logfile = "%s%s.log" % (options.log, part_name)
            print("Logging output to file %s" % logfile)
            logger = create_logger(filename=logfile)
        else:
            logger = None

        # Train the model with the loaded data
        model.train(seqs, logger=logger)
        model.save()
        print("Trained model %s" % (part_name))
class Formalism(FormalismBase):
    """
    Formalism definition, expressed entirely as class attributes of a
    L{FormalismBase} subclass: the grammar rule classes, the syntax and
    semantics types, interactive shell tools, output options, distance
    metrics and evaluation functions that make up this formalism.

    """
    # Rule names mapped to the classes that implement them
    rules = {
        'application': rules.ApplicationRule,
        'composition': rules.CompositionRule,
        'development': rules.DevelopmentRule,
        'coordination': rules.CoordinationRule,
        'tonicrepetition': rules.TonicRepetitionRule,
        'cadencerepetition': rules.CadenceRepetitionRule,
    }

    lexicon_builder = staticmethod(domxml.build_sign_from_node)
    # We don't need to do anything to distinguish variables
    distinguish_categories = staticmethod(lambda x, y: None)
    unify = staticmethod(syntax.unify)
    # This doesn't need to do anything for now
    clean_results = staticmethod(lambda x: x)

    # Tool instances made available under the name shell_tools
    shell_tools = [
        TimeOutputTool(),
        songtools.LoadCorpusTool(),
        songtools.ListSongsTool(),
        songtools.PrintAnalysisTool(),
        songtools.ResultSongTSEditDistanceTool(),
        songtools.ResultSongDependencyRecoveryTool(),
        songtools.RecogniseSongTool(),
        songtools.SongSelfSimilarityTool(),
        songtools.SongTreeTool(),
        songtools.SongDependencyGraphTool(),
    ]

    # Options controlling how results are output: just the tonal space
    # output format, defaulting to 'coord'
    output_options = [
        ModuleOption('tsformat', choose_from_list(['coord', 'xycoord', 'roman','alpha']),
            help_text="Tonal space output format",
            default="coord",
            usage="tsformat=X, where X is one of 'coord', 'xycoord', "\
                "'alpha' or 'roman'"),
    ]

    # Semantics conversion functions, taken straight from the semantics module
    backoff_states_to_lf = staticmethod(semantics.backoff_states_to_lf)
    semantics_to_coordinates = staticmethod(semantics.semantics_to_coordinates)
    semantics_to_functions = staticmethod(semantics.semantics_to_functions)
    semantics_to_keys = staticmethod(semantics.semantics_to_keys)

    # Distance metric classes available for comparing semantics
    semantics_distance_metrics = [
        distance.TonalSpaceEditDistance,
        distance.LargestCommonEmbeddedSubtrees,
        distance.RandomDistance,
        distance.DependencyGraphSize,
        distance.OptimizedDependencyRecovery,
        distance.DependencyRecovery,
    ]

    # PCFG model class for this formalism
    PcfgModel = pcfg.HalfspanPcfgModel

    class Syntax(FormalismBase.Syntax):
        """
        Syntactic types and helper functions of the formalism.

        """
        Sign = syntax.Sign
        ComplexCategory = syntax.ComplexCategory
        AtomicCategory = syntax.AtomicCategory
        Slash = syntax.Slash
        DummyCategory = syntax.DummyCategory
        merge_equal_signs = staticmethod(syntax.merge_equal_signs)
        # Unlike previous formalisms, we can't use the normal category
        # structure abstraction, so we inject our own handling of
        # half categories
        pre_generalize_category = staticmethod(syntax.pre_generalize_category)

        @classmethod
        def is_complex_category(cls, obj):
            """
            For the sake of efficiency, override this and don't use
            isinstance. This gets called a LOT of times!

            Returns True when C{obj.ATOMIC} compares equal to False.

            """
            return obj.ATOMIC == False

        @classmethod
        def is_atomic_category(cls, obj):
            """
            For the sake of efficiency, override this and don't use
            isinstance. This gets called a LOT of times!

            This works because the category classes in this formalism
            all define ATOMIC, so we don't need to check the type.

            """
            return obj.ATOMIC == True

    class Semantics(FormalismBase.Semantics):
        """
        Semantic type and the application/composition operations on it.

        """
        Semantics = semantics.Semantics
        apply = staticmethod(semantics.apply)
        compose = staticmethod(semantics.compose)

    class PcfgParser(object):
        """
        Formalism interface for the PcfgParser parser module.

        """
        # Function to generate the representation of a category to
        # be used to index the model
        category_representation = staticmethod(pcfg.model_category_repr)
        # Mapping between the short names used for rules in annotated
        # trees and the rule instantiations
        rule_short_names = {
            'compf': ('composition', { 'dir': 'forward' }),
            'compb': ('composition', { 'dir': 'backward' }),
            'appf': ('application', { 'dir': 'forward' }),
            'appb': ('application', { 'dir': 'backward' }),
            'cont': ('development', {}),
            'coord': ('coordination', {}),
        }
        category_relative_chord = staticmethod(pcfg.category_relative_chord)

    class Evaluation(FormalismBase.Evaluation):
        """
        Evaluation functions of the formalism, all taken from the
        evaluation module.

        """
        tonal_space_alignment_costs = staticmethod(
            evaluation.tonal_space_alignment_costs)
        tonal_space_distance = staticmethod(evaluation.tonal_space_distance)
        tonal_space_f_score = staticmethod(evaluation.tonal_space_f_score)
        tonal_space_alignment_score = staticmethod(
            evaluation.tonal_space_alignment_score)
        tonal_space_alignment = staticmethod(evaluation.tonal_space_alignment)

        tonal_space_length = staticmethod(evaluation.tonal_space_length)
        """ Number of points on the tonal space path represented by the semantics """
def main():
    """
    Command-line entry point for training a chord labeling model.

    Parses the options and the two positional arguments (model name and
    input file), loads the input through L{command_line_input} (bulk
    types only) and calls C{HPChordLabeler.train} once per model. With
    C{--partitions}, the chord data is partitioned and one model, named
    C{<model_name><n>}, is trained per partition; this requires the
    input to be a C{MidiTaggerTrainingBulkInput} with chord data.

    Exits with status 1 on bad arguments and 0 after printing option
    help.

    """
    parser = OptionParser(
        usage="%prog [options] <model_name> <in-file>",
        description="Trains a chord labeling model using the given "
            "input data. The data file may be a stored SequenceIndex file, or "
            "any other type of bulk data file.")
    parser.add_option(
        '-p', '--partitions', dest="partitions", action="store", type="int",
        help="train a number of partitions of the given data. Trains a "
            "model on the complement of each partition, so it can be "
            "tested on the partition. The models will be named <NAME>n, "
            "where <NAME> is the model name and n the partition number.")
    parser.add_option(
        '--opts', dest="training_opts", action="append",
        help="options to pass to the model trainer. Type '--opts help' "
            "for a list of options for a particular model type.")
    # File input options
    parser.add_option(
        "--filetype", "--ft", dest="filetype", action="store",
        help="select the file type for the input file. Same filetypes "
            "as jazzparser",
        default='bulk-db')
    parser.add_option(
        "--file-options", "--fopt", dest="file_options", action="store",
        help="options for the input file. Type '--fopt help', using "
            "'--ft <type>' to select file type, for a list of available "
            "options.")
    # Logging output
    parser.add_option(
        '--log', dest="log", action="store",
        help="file to output training logs to. Specify a base filename; "
            "<modelname>.log will be added to the end")
    options, arguments = parse_args_with_config(parser)

    def open_log(name):
        # One log file per trained model, or no logger at all if --log
        # wasn't given
        if options.log is None:
            return None
        logfile = "%s%s.log" % (options.log, name)
        print("Logging output to file %s" % logfile)
        return create_logger(filename=logfile)

    grammar = Grammar()

    # Handle any training options that were given on the command line
    raw_training_opts = options.training_opts
    if raw_training_opts is not None and \
            "help" in [opt.lower() for opt in raw_training_opts]:
        print(options_help_text(HPChordLabeler.TRAINING_OPTIONS,
                                intro="Training options:"))
        sys.exit(0)
    if raw_training_opts is None:
        training_opts = {}
    else:
        training_opts = ModuleOption.process_option_string(raw_training_opts)

    if len(arguments) < 2:
        sys.stderr.write("You must specify a model name and an input data "
                         "file as arguments\n")
        sys.exit(1)
    model_name = arguments[0]
    filename = os.path.abspath(arguments[1])

    # Load the sequence data
    # Only allow bulk types
    bulk_types = get_input_type_names(single=False, bulk=True)
    input_data = command_line_input(filename=filename,
                                    filetype=options.filetype,
                                    options=options.file_options,
                                    allowed_types=bulk_types)

    # Only partition the chord data, not the MIDI data
    if options.partitions is not None:
        has_chords = isinstance(input_data, MidiTaggerTrainingBulkInput) \
                        and input_data.chords is not None
        if not has_chords:
            sys.stderr.write("Can only partition chord data and no chord "
                             "data was supplied\n")
            sys.exit(1)

    # Work out the (model name, chord data) pairs to train
    models = []
    if options.partitions:
        # The input includes chord training data
        parts = input_data.chords.get_partitions(options.partitions)[1]
        for num, chord_data in enumerate(parts):
            models.append(("%s%d" % (model_name, num), chord_data))
    else:
        models.append((model_name, None))

    for part_name, chord_data in models:
        logger = open_log(part_name)
        # Create a fresh model with this name
        model = HPChordLabeler.train(input_data, part_name,
                                     logger=logger,
                                     options=training_opts,
                                     chord_data=chord_data)
        print("Trained model %s" % (part_name))
def check_options(cls, options):
    """
    Checks a dictionary of options against the class' C{BUILDER_OPTIONS}
    by passing both to C{ModuleOption.process_option_dict}.

    @param options: the option values to check
    @return: whatever C{process_option_dict} returns for these options

    """
    option_specs = cls.BUILDER_OPTIONS
    processed = ModuleOption.process_option_dict(options, option_specs)
    return processed
class NgramTagger(ModelTagger):
    """
    Tagger that uses a trained L{NgramTaggerModel} to assign a
    probability to every tag for every input chord, and returns the
    resulting signs in batches of decreasing probability.

    """
    MODEL_CLASS = NgramTaggerModel
    TAGGER_OPTIONS = ModelTagger.TAGGER_OPTIONS + [
        ModuleOption('decode', filter=choose_from_list(DECODERS),
            help_text="Decoding method for inference.",
            usage="decode=X, where X is one of %s" % \
                ", ".join("'%s'" % d for d in DECODERS),
            default="forward-backward"),
    ]
    INPUT_TYPES = ['db', 'chords']

    def __init__(self, grammar, input, options=None, *args, **kwargs):
        """
        Tags using an ngram model backed by NLTK.

        All tagging is done here, at construction time; L{get_signs} only
        reads the stored results.

        @param grammar: grammar from which signs are read for each
            word/tag pair
        @param input: the input chord sequence
        @type options: dict
        @param options: tagger options. Defaults to a new empty dict
            (not a shared mutable default)

        """
        if options is None:
            options = {}
        super(NgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
        process_chord_input(self)

        #### Tag the input sequence ####
        self._tagged_data = []
        self._batch_ranges = []
        # Group the input into pairs to get observations
        inpairs = group_pairs(self.input, none_final=True)
        # Convert the pairs into observations
        observations = [
            observation_from_chord_pair(first, second, self.model.chordmap)
            for (first, second) in inpairs]

        # Use the ngram model to get tag probabilities for each input,
        # using whichever decoder was selected
        decoder = self.options['decode']
        if decoder == "viterbi":
            probabilities = self.model.viterbi_probabilities(observations)
        elif decoder == "forward":
            probabilities = self.model.forward_probabilities(observations)
        else:
            probabilities = self.model.forward_backward_probabilities(
                                                            observations)

        word_tag_probs = []
        for index, probs in enumerate(probabilities):
            features = {
                'duration': self.durations[index],
                'time': self.times[index],
            }
            word_signs = []
            # Now assign a probability to each tag, given the observation
            for tag in self.model.tags:
                # Read a full sign out of the grammar
                sign = self.grammar.get_sign_for_word_by_tag(
                            self.input[index], tag, extra_features=features)
                if sign is not None:
                    # Read off the probability from the matrix
                    word_signs.append((sign, tag, probs[tag]))

            # Randomly order the list first to make sure equal
            # probabilities are randomly ordered
            random.shuffle(word_signs)
            # Now sort by probability, highest first. Sort ascending and
            # reverse, rather than sorting with reverse=True, so that ties
            # come out in exactly the same order as before
            word_signs.sort(key=lambda entry: entry[2])
            word_signs.reverse()
            self._tagged_data.append(word_signs)

            # Store the list of probabilities for tags, which we'll use
            # after we've tagged every word to work out the sizes
            # of the tag batches
            word_tag_probs.append([p for __, __, p in word_signs])

        if self.options['best']:
            # Only return one for each word
            self._batch_ranges = [[(0, 1)] for __ in range(len(self.input))]
        else:
            # Work out the number of tags to return in each batch
            batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
            # So far, this has assigned a probability to every possible
            # tag. We don't want the tagger ever to return the least
            # probable batch of tags, unless it's the only one.
            #batch_sizes = [batches[:-1] if len(batches) > 1 else batches for batches in batch_sizes]
            # Transform these into (start, end) index ranges, which are
            # easier to use for getting the signs
            self._batch_ranges = []
            for batches in batch_sizes:
                ranges = []
                start = 0
                for size in batches:
                    ranges.append((start, start + size))
                    start += size
                self._batch_ranges.append(ranges)

    def get_signs(self, offset=0):
        """
        Gets the signs in one batch for every word.

        @type offset: int
        @param offset: index of the batch to return (0 is the first,
            i.e. most probable, batch)
        @rtype: list of (start, end, sign-tuple) tuples
        @return: for every word that still has a batch at this offset,
            each C{(sign, tag, probability)} entry of that batch together
            with the word's start and end node

        """
        all_signs = []
        for start_node in range(len(self.input)):
            # Get the indices of the signs to return in this offset batch
            ranges = self._batch_ranges[start_node]
            if offset >= len(ranges):
                # No more batches left for this word
                continue
            start, end = ranges[offset]
            # Add each sign to the output list along with its node values
            for sign in self._tagged_data[start_node][start:end]:
                all_signs.append((start_node, start_node + 1, sign))
        return all_signs

    def get_word(self, index):
        """ Returns the input item at the given index. """
        return self.input[index]
logger.error("The tagger '%s' could not be loaded. Possible "\ "taggers are: %s" % (options.supertagger, ", ".join(TAGGERS))) return 1 # Get supertagger options before initializing the tagger if options.topts is not None: toptstr = options.topts if "help" in [s.strip().lower() for s in toptstr]: # Output this tagger's option help from jazzparser.utils.options import options_help_text print options_help_text(tagger_cls.TAGGER_OPTIONS, intro="Available options for selected tagger") return 0 toptstr = ":".join(toptstr) else: toptstr = "" topts = ModuleOption.process_option_string(toptstr) # Check that the options are valid try: tagger_cls.check_options(topts) except ModuleOptionError, err: logger.error("Problem with tagger options (--topt): %s" % err) return 1 ######## Backoff ######## # Load the requested backoff model, if any if options.backoff is not None: from jazzparser.backoff import BUILDERS if options.backoff.lower() == "help": print "Available backoff model types are: %s" % ", ".join(BUILDERS) return 0 try:
def process_labeling_options(opts):
    """
    Verifies and processes a dict of labeling option values by checking
    it against C{HPChordLabeler.LABELING_OPTIONS}.

    @type opts: dict
    @param opts: labeling option values
    @return: the result of C{ModuleOption.process_option_dict}

    """
    option_specs = HPChordLabeler.LABELING_OPTIONS
    processed = ModuleOption.process_option_dict(opts, option_specs)
    return processed
def process_training_options(opts):
    """
    Verifies and processes the training option values by checking them
    against C{HPChordLabeler.TRAINING_OPTIONS}.

    @param opts: training option values
    @return: the result of C{ModuleOption.process_option_dict}

    """
    option_specs = HPChordLabeler.TRAINING_OPTIONS
    processed = ModuleOption.process_option_dict(opts, option_specs)
    return processed
else: # No metric found matching this name print "No metric '%s'" % options.metric sys.exit(1) print >>sys.stderr, "Using distance metric: %s" % metric_cls.name # Now process the metric options if options.mopts is not None: moptstr = options.mopts if "help" in [s.strip().lower() for s in moptstr]: # Output this parser's option help print options_help_text(metric_cls.OPTIONS, intro="Available options for metric '%s'" % metric_cls.name) sys.exit(0) moptstr = ":".join(moptstr) else: moptstr = "" mopts = ModuleOption.process_option_string(moptstr) # Instantiate the metric with these options metric = metric_cls(options=mopts) if len(arguments) < 2: print >>sys.stderr, "Specify a song corpus name and one or more files to read results from" sys.exit(1) # First argument is an TonalSpaceAnalysisSet corpus_name = arguments[0] # Load the corpus file corpus = TonalSpaceAnalysisSet.load(corpus_name) # The rest of the args are result files to analyze res_files = arguments[1:]
def main():
    """
    Command-line entry point for chord labeling.

    Loads the named chord labeling model and uses it to label the given
    segmented MIDI input (a single file or a bulk input), printing the
    labels for each time segment. With C{-r}, the best label sequence for
    each input is additionally realized as a MIDI file: to the given
    filename for a single input, or to C{<filename>-<n>} for the nth
    input of a bulk input.

    Exits with status 1 if too few arguments are given and 0 after
    printing labeler option help.

    """
    usage = "%prog [options] <model_name> <in-file>"
    description = "Loads a chord labeling model and uses it to assign chord "\
        "labels to the given MIDI file."
    parser = OptionParser(usage=usage, description=description)
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='segmidi')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Labeling options
    parser.add_option("--labeler-options", "--lopt", dest="labeler_options", action="append", help="options for the labeler. Type '--lopt help' for a list of available options.")
    parser.add_option("--no-key", "--nk", dest="no_key", action="store_true", help="merge together labels with the same key (same as --lopt nokey)")
    # Output options
    parser.add_option("--single", "-1", dest="single", action="store_true", help="show only one chord per time segment (same as --lopt n=1, but formats the output in a simpler way)")
    parser.add_option('-r', '--realize', dest="realize", action="store", help="realize the chord sequence as a midi file, overlaid on the input")
    parser.add_option('--chords-only', dest="chords_only", action="store_true", help="only realize the chords: don't overlay on the input midi (only works with -r)")
    options, arguments = parse_args_with_config(parser)

    if options.labeler_options is not None and "help" in options.labeler_options:
        print(options_help_text(HPChordLabeler.LABELING_OPTIONS, intro="Options for HP chord labeler"))
        sys.exit(0)

    if len(arguments) < 2:
        sys.stderr.write("You must specify a model name and an input "
                         "(MIDI) data file as arguments\n")
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]

    # Process the labeler options
    lopt_dict = ModuleOption.process_option_string(options.labeler_options)
    if options.single:
        # No point in getting more than one label, since we only display one
        lopt_dict['n'] = 1
    if options.no_key:
        # Just set the nokey option
        lopt_dict['nokey'] = True
    # Check they're valid before doing anything else
    HPChordLabeler.process_labeling_options(lopt_dict)

    input_data = command_line_input(filename,
                                    filetype=options.filetype,
                                    options=options.file_options,
                                    allowed_types=['segmidi','bulk-segmidi'])
    # This flag used to be inverted, which meant that a single input was
    # written to "<name>-0" and every input of a bulk input overwrote the
    # same output file
    bulk = is_bulk_type(type(input_data))
    if not bulk:
        # Wrap a single input so that we can treat both cases alike
        input_data = [input_data]

    # Load the model once: it doesn't depend on the input being labeled
    model = HPChordLabeler.load_model(model_name)

    for i, data in enumerate(input_data):
        input_stream = data.stream
        print("Read midi data in %d segments" % len(data))

        # Perform labeling
        labels = model.label(data, options=lopt_dict)
        # Try labeling as it will be passed to the tagger. The result
        # isn't used: this just checks that the call goes through
        model.label_lattice(data, options=lopt_dict)

        if options.single:
            # Special output for single label output
            print(", ".join(["%s" % timelabs[0][0] for timelabs in labels]))
        else:
            # Print out the labels for each timestep
            for step, timelabs in enumerate(labels):
                print("%d: %s" % (step,
                    ", ".join(["%s (%.2e)" % (label, prob)
                               for (label, prob) in timelabs])))

        if options.realize is not None:
            # Get the single best chord label for each time
            best_labels = [timelabs[0][0] for timelabs in labels]
            # Realize as a midi file
            print("Realizing output chord sequence")
            real = ChordSequenceRealizer(best_labels,
                                         model.chord_vocab,
                                         resolution=input_stream.resolution,
                                         chord_length=data.time_unit,
                                         text_events=True)
            if options.chords_only:
                # Don't overlay
                stream = real.generate(offset=data.tick_offset)
            else:
                stream = real.generate(overlay=input_stream,
                                       offset=data.tick_offset)

            if bulk:
                # One output file per input: number them
                out_filename = "%s-%d" % (options.realize, i)
            else:
                out_filename = options.realize
            write_midifile(stream, out_filename)
class CandcMultiTagger(CandcTagger):
    """
    Uses the C&C supertagger component to get multiple tags for each word.

    """
    command = "msuper"
    # Use a very low beta, so we get loads of tags, even improbable ones
    extra_args = ["--beta", "0.0"]

    TAGGER_OPTIONS = CandcTagger.TAGGER_OPTIONS + [
        ModuleOption('ignore-unknown', filter=str_to_bool,
            help_text="Ignore any tags that the tagger returns but which "\
                "are not found in the grammar. By default, an error will "\
                "be thrown.",
            usage="ignore-unknown=True (default False)",
            default=False),
    ]

    def __init__(self, *args, **kwargs):
        super(CandcMultiTagger, self).__init__(*args, **kwargs)

    def _tags_from_output(self, output):
        """
        Reads the tags and their probabilities out of the tagger's text
        output and gets a sign from the grammar for each word/tag pair.

        Each non-empty line of output is read as tab-separated columns:
        column 2 holds the number of results and (tag, probability) pairs
        follow from column 3 onwards. Also sets C{self.batch_sizes}, with
        one entry per word.

        @type output: str
        @param output: raw output of the tagger
        @rtype: list of lists of (sign, tag, probability) tuples
        @return: the signs for each word, in decreasing order of
            probability
        @raise CandcTaggingError: if the number of lines of tags read
            doesn't match the input length

        """
        tags = []
        # Split up the output text to extract tags and probabilities
        for line in output.split("\n"):
            line = line.strip()
            if not len(line):
                continue
            cols = line.split("\t")
            num_results = int(cols[2])
            results = []
            seen_tags = set()
            # Get the tags and probs from the output
            for result_num in range(num_results):
                cat = cols[3+result_num*2]
                prob = float(cols[4+result_num*2])
                results.append((cat, prob))
                seen_tags.add(cat)

            # Check all the tags are covered and add them with 0 prob if not
            for tag in self.tag_list:
                if tag not in seen_tags:
                    results.append((tag, 0.0))

            # Highest probability first. Sort ascending and reverse (not
            # reverse=True) to keep the original ordering of ties
            results = sorted(results, key=lambda x: x[1])
            results.reverse()
            tags.append(results)

        if len(tags) != self.input_length:
            raise CandcTaggingError("C&C output did not give a correct "
                                    "set of tags: %s" % output)

        # Redistribute the tag probability to account for unseen tags
        unseen_prob = self.options['unseen_tag_prob']
        if unseen_prob > 0.0:
            # Scale down everything that has a probability
            prob_scale = 1.0 - unseen_prob
            for i, word_tags in enumerate(tags):
                # Add reserved mass equally to every tag
                prob_add = unseen_prob / len(word_tags)
                tags[i] = [(tag, (prob*prob_scale+prob_add))
                           for tag, prob in word_tags]

        # Work out what tags we're going to ignore altogether.
        # A set means that each unknown tag gets reported only once, not
        # once for every word it was suggested for, and makes the
        # membership checks below cheap
        skip_tags = set()
        if self.options['ignore-unknown']:
            for tag_sequence in tags:
                for tag, prob in tag_sequence:
                    if tag not in self.grammar.families and \
                            tag not in skip_tags:
                        # This tag's not in the grammar: just ignore it
                        skip_tags.add(tag)
                        logger.warn("Ignoring tag '%s', which is not in "\
                            "the grammar." % tag)

        # A separate smoothing step for zero-probability tags used to
        # follow here. It had been disabled as already done above (see
        # the unseen_tag_prob redistribution), so it is not reproduced

        signs = [[] for i in range(self.input_length)]
        # Get an actual sign for each word/tag combination
        for index, word in enumerate(self.tokens):
            for (tag, prob) in tags[index]:
                if tag in skip_tags:
                    continue
                # Consult the grammar to get a suitable sign if we can
                sign = self.grammar.get_sign_for_word_by_tag(
                            word, tag,
                            extra_features={
                                'time' : self.times[index],
                                'duration' : self.durations[index]
                            })
                signs[index].append((sign, tag, prob))

        self.batch_sizes = []
        for results in signs:
            # Work out the batches that these should be returned in
            self.batch_sizes.append(
                batch_sizes([p for __, __, p in results],
                            self.tag_batch_ratio))
        return signs
options=options.file_options, allowed_types=['segmidi', 'bulk-segmidi']) if isinstance(input_data, SegmentedMidiInput): # Single input input_data = [input_data] # Work out how many results to print out if options.print_results == -1: print_up_to = None else: print_up_to = options.print_results # Process the labeler options lopt_dict = ModuleOption.process_option_string(options.labeler_options) # No point in getting more than one label, since we'll only use one lopt_dict['viterbi'] = True lopt_dict['nokey'] = True # Load the chord labeling model model_name = arguments[1] model = HPChordLabeler.load_model(model_name) ranks = [] num_ranked = 0 for midi_file in input_data: # Skip any inputs that don't have a gold sequence associated with them # We won't know what the correct answer is if options.gold_only and midi_file.gold is None: continue
class NgramTaggerModel(TaggerModel):
    """
    Tagging model that wraps a C{PrecomputedNgramModel}, together with
    the chord mapping that was used to produce its observations.

    """
    MODEL_TYPE = 'ngram'
    # Set up possible options for training
    TRAINING_OPTIONS = [
        ModuleOption('n', filter=int,
            help_text="Length of the n-grams which this model will use.",
            usage="n=N, where N is an integer. Defaults to bigrams",
            default=2),
        ModuleOption('backoff', filter=int,
            help_text="Number of orders of backoff to use. This must be "\
                "less than n. E.g. if using a trigram model (n=3) you can "\
                "set backoff=2 to back off to bigrams and from bigrams "\
                "to unigrams. Set to 0 to use no backoff at all (default).",
            usage="backoff=X, where X is an integer < n",
            default=0),
        ModuleOption('cutoff', filter=int,
            help_text="In estimating probabilities, treat any counts below "\
                "cutoff as zero",
            usage="cutoff=X, where X is an integer",
            default=0),
        ModuleOption('backoff_cutoff', filter=int,
            help_text="Apply a different cutoff setting to the backoff model. "\
                "Default is to use the same as the main model",
            usage="backoff_cutoff=X, where X is an integer"),
        ModuleOption('estimator', filter=choose_from_dict(ESTIMATORS),
            help_text="A way of constructing a probability model given "\
                "the set of counts from the data. Default is to use "\
                "laplace (add-one) smoothing.",
            usage="estimator=X, where X is one of: %s" % \
                ", ".join(ESTIMATORS.keys()),
            default=laplace_estimator),
        # Add the standard chord mapping option ("chord_mapping")
        get_chord_mapping_module_option(),
    ] + TaggerModel.TRAINING_OPTIONS

    def __init__(self, model_name, model=None, chordmap=None, *args, **kwargs):
        """
        An n-gram model to be used as a tagging model. Uses NLTK to
        represent, train and evaluate the n-gram model.

        @param model_name: name of the model
        @param model: the underlying n-gram model, if there already is
            one (e.g. when loading a stored model)
        @param chordmap: name of the chord mapping, from which the mapping
            itself is got using C{get_chord_mapping}
        @raise TaggingModelError: if the C{backoff} option is not smaller
            than the C{n} option

        """
        super(NgramTaggerModel, self).__init__(model_name, *args, **kwargs)
        self.model = model

        self.chordmap = get_chord_mapping(chordmap)
        self.chordmap_name = chordmap

        if self.options['n'] <= self.options['backoff']:
            # This is not allowed
            # We can only back off n-1 orders for an n-gram model
            raise TaggingModelError("tried to load an n-gram model with "
                "more orders of backoff than are possible (backing off "
                "%d orders on a %d-gram model)" % \
                    (self.options['backoff'], self.options['n']))

    def train(self, sequences, grammar=None, logger=None):
        """
        Trains the n-gram model on the given sequences, replacing
        C{self.model}, and stores a description of how it was trained in
        C{self.model_description}.

        @param sequences: training sequences. Each is paired up with its
            C{categories} attribute to get (observation, tag) pairs
        @param grammar: grammar to take the set of possible tags from.
            The default grammar is loaded if none is given
        @param logger: accepted, but not used by this method

        """
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()

        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name

        # Get data in the form of lists of (observation,tag) pairs
        training_data = [
            [(observation_from_chord_pair(c1, c2, chordmap), c1cat)
                for ((c1, c2), c1cat) in
                    zip(group_pairs(seq, none_final=True), seq.categories)]
            for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that
        # theoretically could occur, not just those that are seen -
        # we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = ["%d-%s" % (interval, chord)
                        for interval in range(12)
                        for chord in chord_types]

        # Ignore unlabelled data
        ignores = ['']

        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff': self.options['backoff_cutoff']}

        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
                            self.options['n'],
                            training_data,
                            label_dom,
                            emission_dom=emission_dom,
                            cutoff=self.options['cutoff'],
                            backoff_order=self.options['backoff'],
                            estimator=self.options['estimator'],
                            ignore_list=ignores,
                            backoff_kwargs=backoff_kwargs)

        # Add some model-specific info into the descriptive text
        # so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(training_data),
                # Count the samples without building one big
                # concatenated list
                'samples' : sum(len(seq_data) for seq_data in training_data),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.chordmap_name,
            }

    @staticmethod
    def _load_model(data):
        """
        Builds a model from the dictionary form produced by
        L{_get_model_data}. The C{chordmap} entry is optional.

        """
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel

        model = PrecomputedNgramModel.from_picklable_dict(data['model'])
        name = data['name']
        chordmap = data.get("chordmap", None)
        return NgramTaggerModel(name, model=model, chordmap=chordmap)

    def _get_model_data(self):
        """
        Returns a dictionary containing the model's name, the picklable
        form of the n-gram model and the chord mapping's name.

        """
        data = {
            'name': self.model_name,
            'model': self.model.to_picklable_dict(),
            'chordmap': self.chordmap_name,
        }
        return data

    def generate_chord_sequence(self, length=20):
        """
        Just for a laugh, use the trained n-gram to generate a chord
        sequence and output it in a playable form.
        Returns a tuple: (chords, tags)

        @todo: this isn't implemented yet for n-grams. It's not a high
            priority, but would be fun.

        @raise NotImplementedError: always, for now

        """
        # Easily done, because the NgramModel already implements it itself
        raise NotImplementedError("not yet done generation for n-grams")
        # NOTE: everything below is unreachable. It's kept for reference
        # only, until generation is implemented
        # This is what the other tagger did:
        from jazzparser.utils.chords import int_to_chord_numeral
        # Use the model to generate randomly
        rand_seq = self.model.random_sample(random.Random(), length)
        pitch = 0
        chords = []
        prochords, tags = zip(*rand_seq)
        # Convert the generated observations into readable chords
        for chord in prochords:
            interval, __, ctype = chord.partition("-")
            chords.append("%s%s" % (int_to_chord_numeral(pitch), ctype))
            pitch = (pitch + int(interval)) % 12
        return (chords, tags)

    def forward_probabilities(self, sequence):
        """ Interface to the NgramModel's forward_probabilities """
        return self.model.forward_probabilities(sequence)

    def forward_backward_probabilities(self, sequence):
        """
        Interface to the NgramModel's gamma_probabilities, requested in
        dictionary form.

        """
        return self.model.gamma_probabilities(sequence, dictionary=True)

    def viterbi_probabilities(self, sequence):
        """ Interface to the NgramModel's viterbi_selector_probabilities """
        return self.model.viterbi_selector_probabilities(sequence)

    def _get_tags(self):
        return self.model.label_dom
    tags = property(_get_tags)

    #### Readable output of the parameters ####
    def _get_readable_params(self):
        """
        Builds a human-readable text description of the model: the stored
        description, the chord mapping and the emission and transition
        distributions.

        @raise ValueError: if an AttributeError occurs while building
            the text

        """
        try:
            text = ""

            # Include the stored model description
            text += self.model_description

            text += "\nNum emissions: %d\n" % self.model.num_emissions

            text += "\nShowing only probs for non-zero counts. "\
                "Others may have a non-zero prob by smoothing\n"
            text += "\nChord mapping: %s:\n" % self.chordmap.name
            for (crdin, crdout) in self.chordmap.items():
                text += " %s -> %s\n" % (crdin, crdout)
            # Emission distribution
            text += "\nEmission dist:\n"
            for label in sorted(self.model.label_dom):
                text += " %s:\n" % label
                emission_dist = self.model.emission_dist[label]
                probs = reversed(sorted(
                            [(emission_dist.prob(em), em)
                             for em in emission_dist.samples()]))
                for (prob, em) in probs:
                    text += " %s: %s\n" % (em, prob)

            text += "\n\nTransition dist:\n"
            for history in sorted(self.model.label_dist.conditions()):
                text += " %s\n" % str(history)
                history_dist = self.model.label_dist[history]
                dist = [(history_dist.prob(lab), lab)
                        for lab in history_dist.samples()]
                for prob, label in reversed(sorted(dist)):
                    text += " %s: %s\n" % (str(label), prob)

            return text
        except AttributeError as err:
            # Catch this, because otherwise it just looks like the attribute
            # (readable_parameters) doesn't exist (stupid Python behaviour)
            raise ValueError("error generating model description "
                             "(attribute error): %s" % err)
def command_line_input(filename=None, filetype=None, options="",
                       allowed_types=None, default_type=None):
    """
    Utility function for processing file input options from the command
    line. Pass in as args the values straight from the command line
    options to select a filename, filetype and list of options.

    Typical command-line options for this purpose (for an optparse
    option parser C{op})::
      op.add_option("--file", "-f", dest="file", action="store", help="use a file to get input from")
      op.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Use '--filetype help' for a list of available types")
      op.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Use '--fopt help', with '--ft <type>', for a list of available options")

    Then you can call this function as::
      command_line_input(filename=options.file, filetype=options.filetype, options=options.file_options)

    Note that a filetype of "help" or an option string of "help" causes
    help text to be printed and the program to exit (status 0).

    @type filename: str
    @param filename: file to read the input from
    @type filetype: str
    @param filetype: name of the input type, or "help"
    @type options: str
    @param options: option string for the input type, or "help"
    @type allowed_types: list of strs
    @param allowed_types: types of input you want the user to be able to
        give. If not given, all types are allowed
    @type default_type: str
    @param default_type: filetype to assume if no other filetype is given
    @rtype: L{InputReader} subclass
    @return: the input wrapper of appropriate type, or None if no input
        file was given
    @raise InputTypeError: if the filetype is not known, or is not one of
        the allowed types

    """
    if allowed_types is None:
        allowed_types = get_input_type_names()

    if filetype is None and default_type is not None:
        filetype = default_type

    # Catch a request for filetype help
    if filetype is not None and filetype.lower() == "help":
        # Output possible file types
        print("Allowed input types: %s" % ", ".join(allowed_types))
        sys.exit(0)

    # Check that the filetype is valid and get the input type class if it is
    input_type = get_input_type(filetype)
    # Check for an unknown type before asking for the type's name, so that
    # the user always gets this error and not whatever the name lookup
    # might do when it's given None
    if input_type is None:
        raise InputTypeError("Unknown filetype '%s'. Allowed types are: %s" % \
            (filetype, ", ".join(allowed_types)))
    type_name = input_type_name(input_type)
    if type_name not in allowed_types:
        raise InputTypeError("Cannot accept input of type '%s'. Allowed "
            "types are: %s" % (filetype, ", ".join(allowed_types)))

    if options is not None and options.lower() == "help":
        # Output help text
        from jazzparser.utils.options import options_help_text
        print(options_help_text(input_type.FILE_INPUT_OPTIONS,
                    intro="Available options for input type %s" % type_name))
        sys.exit(0)

    if filename is None:
        return None

    # First get a dict of the options
    file_options = ModuleOption.process_option_string(options)
    # Process the options as appropriate for this type
    file_options = input_type.process_option_dict(file_options)

    # Instantiate the input from the file as appropriate for the input type
    input_data = input_type.from_file(filename, file_options)
    return input_data
def main(): usage = "%prog [<options>] <model-name> <training-input>" description = "Training of PCFG models." parser = OptionParser(usage=usage, description=description) parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \ help="Number of partitions to divide the data into. "\ "For train, divides the input file, trains a model on each "\ "partition's complement and appends partition number to "\ "the model names. For del, appends partition numbers to model "\ "names and deletes all the models. Recache does similarly. "\ "Has no effect for parse.") parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options") parser.add_option("--debug", dest="debug", action="store_true", help="Output verbose logging information to stderr") parser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.") options, arguments = parse_args_with_config(parser) if options.debug: log_level = logging.DEBUG else: log_level = logging.WARN # Create a logger for training logger = create_logger(log_level = log_level, name = "training", stderr = True) # Load a grammar grammar = get_grammar(options.grammar) # Get the pcfg model class for the formalism PcfgModel = grammar.formalism.PcfgModel # Parse the option string if options.training_opts is None: opts = {} elif options.training_opts.lower() == "help": print options_help_text(PcfgModel.TRAINING_OPTIONS, intro="Training options for PCFGs") sys.exit(0) else: opts = ModuleOption.process_option_dict( ModuleOption.process_option_string(options.training_opts), PcfgModel.TRAINING_OPTIONS) if len(arguments) == 0: print >>sys.stderr, "Specify a model name" models = PcfgModel.list_models() print >>sys.stderr, "Available models: %s" % ", ".join(models) sys.exit(1) model_name = arguments[0] print "Model base name:", model_name if options.partitions is not None: parts = [(i, 
"%s%d" % (model_name, i)) for i in range(options.partitions)] else: parts = [(None, model_name)] if len(arguments) < 2: print >>sys.stderr, "Specify an input file to read sequence data from" sys.exit(1) # Read in the training data from the given file seqs = SequenceIndex.from_file(arguments[1]) if options.partitions is not None: # Prepare each training partition datasets = holdout_partition(seqs.sequences, options.partitions) else: datasets = [seqs.sequences] for dataset,(parti,part_model) in zip(datasets,parts): # Train the named model on the sequence data model = PcfgModel.train(part_model, dataset, opts, grammar=grammar, logger=logger) model.save() print "Trained model", part_model
class ChordInput(Input):
    """
    Input wrapper for textual chord input. This is the simplest type of
    input, usually taken from the command line.

    You must provide a list of chord symbols and either a list of durations
    or a list of times when constructing this. To process pure text (which
    includes computing durations/times and splitting up chords), use
    L{ChordInput.from_string}.

    """
    FILE_INPUT_OPTIONS = [
        ModuleOption('roman', filter=str_to_bool,
                     help_text="read chord symbols as roman numberals. "\
                        "Default is to assume note names",
                     usage="roman=B, where B is a boolean",
                     default=False),
    ]

    def __init__(self, inputs, durations=None, times=None, roman=False,
                    *args, **kwargs):
        """
        @type inputs: list of strs
        @param inputs: chord symbols
        @param durations: duration of each chord. If omitted, computed from
            C{times}, with the last chord given a duration of 1
        @param times: onset time of each chord. If omitted, computed from
            C{durations}, with the first chord at time 0
        @type roman: bool
        @param roman: passed to C{Chord.from_name} when parsing the symbols
        @raise ValueError: if neither C{durations} nor C{times} is given

        """
        super(ChordInput, self).__init__(*args, **kwargs)
        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.roman = roman

        # Compute the durations from times or vice versa
        if durations is None and times is None:
            raise ValueError("cannot create a ChordInput with neither "\
                    "times nor durations given")
        elif times is None:
            # Each onset is the sum of all preceding durations. A running
            # total gives the same values as summing every prefix, in
            # linear time
            elapsed = Fraction(0)
            onsets = []
            for dur in durations:
                onsets.append(elapsed)
                elapsed = elapsed + dur
            self.times = onsets
        elif durations is None:
            from jazzparser.utils.base import group_pairs
            # NOTE(review): this subtracts the second element of each pair
            # from the first. Confirm the order in which group_pairs yields
            # its pairs, so that durations come out positive.
            self.durations = [
                time1 - time0 for (time1, time0) in group_pairs(times)
            ] + [Fraction(1)]

        # Convert all strings to internal chord representation
        # Done now so we check the chords can all be understood before doing
        #  anything else
        self.chords = [
            Chord.from_name(name, roman=roman).to_db_mirror()
            for name in inputs
        ]
        for chord, dur in zip(self.chords, self.durations):
            chord.duration = dur

    @staticmethod
    def from_string(input, name="<string input>", roman=False):
        """
        Produce a wrapped-up version of the input directly from an input
        string, which may come, for example, from the command line.

        @type input: str
        @param input: whitespace-separated chord symbols
        @param name: name to give the resulting input
        @type roman: bool
        @param roman: whether the symbols should be read as roman numerals
        @rtype: L{ChordInput}

        """
        from jazzparser.utils.input import assign_durations, strip_input
        # Get durations from the original string before doing anything else
        durations = assign_durations(input)
        # Remove unwanted characters from the string
        input = strip_input(input)
        # Tokenise the string
        chords = input.split()
        return ChordInput(chords, durations=durations, name=name, roman=roman)

    def __str__(self):
        return " ".join(["%s" % i for i in self.inputs])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

    def slice(self, start=None, end=None):
        """
        Return a new L{ChordInput} covering C{inputs[start:end]}, with the
        matching durations and times.

        """
        # The constructor re-parses the chord symbols, so it must be told
        # whether they are roman numerals, just as the original was
        return ChordInput(self.inputs[start:end],
                          self.durations[start:end],
                          self.times[start:end],
                          roman=self.roman,
                          name=self.name)

    @staticmethod
    def from_file(filename, options={}):
        """
        Read the whole file as a single chord sequence.

        @type filename: str
        @param filename: path of the text file to read
        @type options: dict
        @param options: processed file options. Only C{roman} is used; it
            is taken to be False if absent
        @rtype: L{ChordInput}

        """
        # Read the whole contents of the file
        with open(filename, 'r') as f:
            data = f.read()
        # Just treat the whole file as one sequence
        return ChordInput.from_string(data,
                                      name="File: %s" % filename,
                                      roman=options.get('roman', False))

    def to_db_input(self):
        """
        This data type is useful for reading textual input. For internal
        processing, however, it can be converted to a L{DbInput}, which
        is generally more convenient to handle.

        """
        return DbInput(self.inputs,
                       durations=self.durations,
                       chords=self.chords)
def process_option_list(self, options):
    """
    Parse C{options} with C{ModuleOption.process_option_string}, process the
    result against C{self.tool_options} and store it as C{self.options}.

    """
    self.options = ModuleOption.process_option_dict(
        ModuleOption.process_option_string(options),
        self.tool_options)
def main():
    """
    Command-line entry point for evaluating a tagger against the gold
    standard tags of the sequences in an input file.

    The command line is handled by C{prepare_evaluation_options}, which
    supplies the partitioned data as well as the options and arguments.
    The first of C{--tag-stats}, C{--agreement} and C{--entropy} that is
    set selects the evaluation; if none is set, a message is printed and
    the process exits with status 1.

    """
    def _check_args(args):
        # Exactly three positional arguments are required. The last two
        # (the model name and input file, going by the usage string) are
        # handed back to prepare_evaluation_options.
        if len(args) != 3:
            print >>sys.stderr, "Specify a tagger, model name and input file"
            sys.exit(1)
        return args[1],args[2]

    # Each optparse group is (group args, [(option names, option kwargs)])
    partitions,part_ids,options,arguments = prepare_evaluation_options(
        usage = "%prog [options] <tagger> <model-name> <input-file>",
        description = "Evaluate a tagging model by "\
            "tagging sequences from an input file. If the tagger doesn't "\
            "need a model name, use '-' as the model name.",
        check_args = _check_args,
        optparse_groups = [
            (("Tagging",),
                [(("--topt", "--tagger-options"),
                    {'dest':"topts", 'action':"append", 'help':"options to pass to the tagger."}),
                ]),
            (("Output",),
                [(("--no-model-info",),
                    {'dest':"no_model_info", 'action':"store_true", 'help':"turns of outputing of information about the model being used before using it (useful for identifying output piped to a file later, but may be too verbose sometimes)"}),
                ]),
            (("Evaluation", "Type of evaluation and options"),
                [(("-a", "--agreement"),
                    {'dest':"agreement", 'action':"store_true", 'help':"instead of doing any parses, just report the agreement of the tops tags with the gold standard tags."}),
                 (("--confusion",),
                    {'dest':"confusion", 'action':"store_true", 'help':"print out confusion matrix after agreement calculation. Applies only in combination with --agreement"}),
                 (("-e", "--entropy"),
                    {'dest':"entropy", 'action':"store_true", 'help':"instead of doing any parses, just report the entropy of the returned tag distribution with respect to the gold standard tags."}),
                 (("--tag-stats",),
                    {'dest':"tag_stats", 'action':"store_true", 'help':"just output stats about the tags that the model assigns to this sequence (or these sequences)"}),
                 (("--topn",),
                    {'dest':"topn", 'type':"int", 'action':"store", 'help':"when evaluating agreement consider the top N tags the tagger returns. By default, allows only the top one to count as a hit.", 'default':1}),
                ]),
        ],
    )

    grammar = Grammar()

    tagger_name = arguments[0]
    model_name = arguments[1]
    # Tagger shouldn't use a model in some cases
    # (signalled by giving '-' as the model name)
    no_tagger_model = model_name == "-"

    # Load the requested tagger class
    tagger_cls = get_tagger(tagger_name)
    topts = ModuleOption.process_option_string(options.topts)

    def _model_info(mname):
        """
        Outputs info about the named model to stderr: just its name if
        --no-model-info was given, otherwise the model's description where
        the tagger class is a ModelTagger.

        """
        if options.no_model_info:
            print >>sys.stderr, "Model %s" % mname
        else:
            # Can only output the nice model info if it's a ModelTagger
            if issubclass(tagger_cls, ModelTagger):
                print >>sys.stderr, "======== Model info ========"
                print >>sys.stderr, tagger_cls.MODEL_CLASS.load_model(mname).description
                print >>sys.stderr, "============================"
            else:
                print >>sys.stderr, "Tagger %s using model %s" % (tagger_cls.__name__, mname)

    # Each partition is unpacked below as (sequences, model, part_num)
    num_parts = len(partitions)
    # Total number of sequences over all partitions (not used below)
    num_seqs = sum([len(p[0]) for p in partitions])

    ################# Evaluation ########################
    if options.tag_stats:
        # This evaluation is disabled: everything after the raise in this
        # branch is unreachable
        raise NotImplementedError, "fix this if you want it"
        # Print out statistics for each partition, with its model
        if no_tagger_model:
            # There could be some circumstance in which we want to do this,
            #  but I can't think what it is, so I'm not implementing it for now
            print >>sys.stderr, "Cannot run tag_stats with no tagger model"
            sys.exit(1)
        all_stats = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            # Output the model training info if requested
            _model_info(model)
            ######## This doesn't exist any more
            stats = sequences_top_tags_dict(tagger_cls, model, sequences, topn=options.topn)
            # Accumulate the per-tag counts over all partitions
            for tag,num in stats.items():
                if tag in all_stats:
                    all_stats[tag] += stats[tag]
                else:
                    all_stats[tag] = stats[tag]
        # Tags in descending order of count
        pprint_table(sys.stdout, list(reversed(sorted(all_stats.items(), key=lambda r:r[1]))), separator="|")
    elif options.agreement:
        # Print out agreement stats for each partition
        if no_tagger_model:
            # Same a tag_stats: probably no need for this ever
            print >>sys.stderr, "Cannot run agreement with no tagger model"
            sys.exit(1)

        # Totals over all partitions; conf_mat is shared by every call to
        #  tagger_agreement
        correct = 0
        total = 0
        conf_mat = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            # Tell the tagger which model to use for this partition
            topts['model'] = model
            # Output the model training info if requested
            _model_info(model)
            # Totals for this partition only
            pcorrect = 0
            ptotal = 0
            # Go through each sequence
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                input = DbInput.from_sequence(seq)
                # The gold standard tag of each chord in the sequence
                correct_tags = [chord.category for chord in seq.iterator()]
                cor,tot = tagger_agreement(input, grammar, tagger_cls, correct_tags, options=topts, confusion_matrix=conf_mat, topn=options.topn)
                pcorrect += cor
                ptotal += tot
                # NOTE(review): these divisions fail if tot (or ptotal) is 0
                #  - confirm tagger_agreement never reports a zero total
                print " Sequence: %.1f%%" % (float(cor)/tot*100)
                print " So far: %.1f%%" % (float(pcorrect)/ptotal*100)
            print "Partition %d: %d / %d (%.2f%%)" % (part_num, pcorrect, ptotal, (float(pcorrect)/ptotal*100))
            correct += pcorrect
            total += ptotal
        if num_parts > 1:
            # Print out the overall stats
            print "%d / %d (%f%%)" % (correct,total,(float(correct)/total*100))
        if options.confusion:
            confusion_matrix(conf_mat)
    elif options.entropy:
        print "Calculating cross-entropy of tagger with gold standard tags"
        # Totals over all partitions
        entropy = 0.0
        num_chords = 0
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            if not no_tagger_model:
                topts['model'] = model
                # Output the model training info if requested
                _model_info(model)
            # Totals for this partition only
            pentropy = 0.0
            pnum_chords = 0
            # Compute the entropy for the partition model
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                # Unlike the agreement evaluation, the tagger is given the
                #  sequence as a string of chord labels
                input = " ".join([str(chord) for chord in seq.iterator()])
                correct_tags = [chord.category for chord in seq.iterator()]
                ent,crds = tagger_entropy(input, grammar, tagger_cls, correct_tags, options=topts)
                pentropy += ent
                pnum_chords += crds
                print " %f bits per chord" % (ent/crds)
            print "Partition %d: %f bits per chord (%d chords)" % (part_num, (pentropy/pnum_chords), pnum_chords)
            entropy += pentropy
            num_chords += pnum_chords
        # Print out the stats for all partitions together
        if num_parts > 1:
            print "%f bits per chord (%d chords)" % ((entropy/num_chords), num_chords)
    else:
        print >>sys.stderr, "Select an evaluation operation with one of the options"
        sys.exit(1)
class DbInput(Input):
    """
    Wrapper for input from the database, rather than the command line.
    There is no point in reducing db input to a string only to reinterpret
    it.

    At least one of C{times} and C{durations} must be given. If only one
    is given, the other is computed from it: C{times} are computed on the
    assumption that the first chord occurs at time 0, and C{durations} on
    the assumption that the last chord has a length of 1.

    The id of the chord sequence that this came from (C{id}) and the
    sequence representation itself (C{sequence}) are stored too. These may
    be C{None} in some cases.

    Confusingly (for historical reasons!), C{inputs} contains string chord
    labels. C{chords} contains the db_mirrors representation of the chords.

    """
    FILE_INPUT_OPTIONS = [
        ModuleOption('index',
                     filter=int,
                     help_text="read the sequence with index (not id) X",
                     usage="index=X, where X is an int",
                     required=True),
    ]

    def __init__(self, inputs, durations=None, times=None, id=None, \
                    chords=None, sequence=None, *args, **kwargs):
        super(DbInput, self).__init__(*args, **kwargs)
        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.id = id
        self.chords = chords
        self.sequence = sequence

        if times is None and durations is None:
            raise ValueError("cannot create a DbInput with neither "
                             "times nor durations given")
        if times is None:
            # Onset of each chord: the total of the durations before it
            onsets = []
            for position in range(len(durations)):
                onsets.append(sum(durations[:position]))
            self.times = onsets
        elif durations is None:
            from jazzparser.utils.base import group_pairs
            lengths = []
            for (time1, time0) in group_pairs(times):
                lengths.append(time1 - time0)
            # The final chord is given a length of 1
            lengths.append(Fraction(1))
            self.durations = lengths

    def get_gold_analysis(self):
        """
        Parses the annotations, if present, to get a gold analysis. Unlike
        L{AnnotatedDbInput}, this input type cannot be assumed to have
        annotations, so a C{ParseError} while parsing them is not treated
        as an error: None is returned instead.

        @return: the semantics of the first parse, or None if the
            annotations could not be parsed

        """
        from jazzparser.evaluation.parsing import parse_sequence_with_annotations
        from jazzparser.grammar import get_grammar
        from jazzparser.parsers import ParseError

        try:
            parses = parse_sequence_with_annotations(
                self, get_grammar(), allow_subparses=False)
        except ParseError:
            return None
        return parses[0].semantics

    @staticmethod
    def from_sequence(seq):
        """
        Builds a DbInput out of a database representation of a sequence.

        """
        chords = list(seq)
        labels = [str(chord) for chord in chords]
        lengths = [chord.duration for chord in seq]
        return DbInput(labels,
                       durations=lengths,
                       name=seq.string_name,
                       id=seq.id,
                       chords=chords,
                       sequence=seq)

    def __str__(self):
        return " ".join("%s" % label for label in self.inputs)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

    def slice(self, start=None, end=None):
        """
        Returns a new DbInput covering C{[start:end]} of this one. The id,
        name and sequence are carried over unchanged.

        """
        # Only slice the chords if we have some
        chords = self.chords[start:end] if self.chords else None
        return DbInput(self.inputs[start:end],
                       self.durations[start:end],
                       self.times[start:end],
                       id=self.id,
                       name=self.name,
                       chords=chords,
                       sequence=self.sequence)

    @staticmethod
    def from_file(filename, options={}):
        """
        Loads a sequence index file and wraps the sequence selected by the
        C{index} option.

        @raise InputReadError: if there is no sequence with that index

        """
        # Load up a sequence index file according to the filename
        index_file = SequenceIndex.from_file(filename)
        # Get a sequence by index from the file
        sequence = index_file.sequence_by_index(options['index'])
        if sequence is None:
            message = "%d is not a valid sequence index in %s" % \
                (options['index'], filename)
            raise InputReadError(message)
        # Get the data from the sequence
        return DbInput.from_sequence(sequence)
def process_option_dict(cls, optdict):
    """
    Process a dictionary of options against this input type's
    C{FILE_INPUT_OPTIONS}, using C{ModuleOption.process_option_dict}.

    @param optdict: the options to process
    @return: the result of C{ModuleOption.process_option_dict}

    """
    return ModuleOption.process_option_dict(
        optdict,
        cls.FILE_INPUT_OPTIONS,
    )
class SegmentedMidiInput(Input):
    """
    Input wrapper for MIDI files with extra information about segmentation,
    in the form it's needed for the Raphael and Stoddard model and midi
    supertagging models: that is, offset (start of first bar) and bar length.

    Each segment is a midi L{midi.EventStream}. It also has the additional
    attribute C{segment_start}, giving the tick time at which the segment
    begins in the original midi stream.

    Optionally also stores a gold standard analysis in the form of a
    db annotated chord sequence: see L{AnnotatedDbInput}.

    """
    FILE_INPUT_OPTIONS = [
        ModuleOption('time_unit', filter=float,
                     help_text="number of beats (by the MIDI file resolution) "\
                        "to take to be one time unit",
                     usage="time_unit=X, where X is an int or float",
                     required=False,
                     default=4),
        ModuleOption('tick_offset', filter=int,
                     help_text="time in MIDI ticks at which the first time "\
                        "unit begins",
                     usage="tick_offset=X, where X is an int",
                     required=False,
                     default=0),
        ModuleOption('truncate', filter=int,
                     help_text="truncate the input to this length.",
                     usage="truncate=L, where L is an integer"),
    ]
    SHELL_TOOLS = Input.SHELL_TOOLS + [
        tools.PlayMidiChunksTool(),
        tools.PrintMidiChunksTool()
    ]

    def __init__(self, inputs, time_unit=4, tick_offset=0, stream=None,
                        gold=None, sequence_index=None, *args, **kwargs):
        """
        @type inputs: list of L{midi.EventStream}s
        @param inputs: the midi data segments
        @type time_unit: int or float
        @param time_unit: number of beats to take as the basic unit of time
            for observations
        @type tick_offset: int
        @param tick_offset: number of ticks after which the first bar begins
        @param stream: the midi stream the segments were taken from. Its
            C{resolution} is read here, so it must be given despite the
            None default
        @param gold: gold standard analysis, if there is one
        @param sequence_index: index of the associated gold sequence, if
            there is one

        """
        super(SegmentedMidiInput, self).__init__(*args, **kwargs)
        self.stream = stream
        self.time_unit = time_unit
        self.tick_offset = tick_offset
        self.inputs = inputs
        self.gold = gold
        self.sequence_index = sequence_index
        # Length of one segment in MIDI ticks
        self.tick_unit = int(stream.resolution * time_unit)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

    def __str__(self):
        if self.name is not None:
            return "<MIDI: %s (%d)>" % (self.name, len(self))
        else:
            return "<MIDI: %d chunks>" % len(self)

    def slice(self, start=None, end=None):
        """
        Return a new L{SegmentedMidiInput} covering segments
        C{[start:end]} of this one.

        """
        # NOTE(review): self.durations and self.times are not set anywhere
        #  in this class, and time_unit, tick_offset and gold are not passed
        #  on to the slice - confirm what Input provides and whether this
        #  is intended
        return SegmentedMidiInput(self.inputs[start:end],
                                  durations=self.durations[start:end],
                                  times=self.times[start:end],
                                  name=self.name,
                                  stream=self.stream,
                                  sequence_index=self.sequence_index)

    def get_gold_analysis(self):
        """
        @return: the stored gold analysis. This may be None if no analysis
            was in the input

        """
        return self.gold

    @staticmethod
    def from_file(filename, options={}, gold=None, sequence_index=None):
        """
        Read a midi file and segment it according to the file options.

        @type filename: str
        @param filename: path of the midi file. Its base name is used as
            the name of the input
        @type options: dict
        @param options: processed file options (C{time_unit},
            C{tick_offset}, C{truncate}). Any that are absent fall back to
            the defaults declared in C{FILE_INPUT_OPTIONS}
        @param gold: gold standard analysis to store with the input
        @param sequence_index: index of the associated gold sequence
        @rtype: L{SegmentedMidiInput}

        """
        from midi import read_midifile
        from os.path import basename

        # Read and parse the midi file
        stream = read_midifile(filename)
        # Get the required segmentation parameters from the options
        time_unit = options.get('time_unit', 4)
        tick_offset = options.get('tick_offset', 0)
        truncate = options.get('truncate')
        # Use the filename as an identifier
        name = basename(filename)

        return SegmentedMidiInput.from_stream(stream,
                                              time_unit=time_unit,
                                              tick_offset=tick_offset,
                                              name=name,
                                              truncate=truncate,
                                              gold=gold,
                                              only_notes=True,
                                              sequence_index=sequence_index)

    @staticmethod
    def from_stream(stream, time_unit=4, tick_offset=0, name=None,
                        only_notes=True, truncate=None, gold=None,
                        sequence_index=None):
        """
        Creates a L{SegmentedMidiInput} from a midi event stream.

        Segments with no material are removed from the start and the end
        of the result. If that leaves nothing (for example, the stream
        contains no notes), the result has no segments.

        @type only_notes: bool
        @param only_notes: if True, only includes note-on/note-off events in
            the segments. If False, the stream will be sliced so that each
            segment repeats things like program change events at the
            beginning. Including only notes, however, makes the
            preprocessing very much faster
        @type truncate: int
        @param truncate: if given, keep at most this many segments
        @rtype: L{SegmentedMidiInput}

        """
        # Divide the stream up into slices of the right size
        # Number of ticks in each slice
        tick_unit = int(stream.resolution * time_unit)
        if len(stream.trackpool) == 0:
            end_time = 0
        else:
            end_time = max(stream.trackpool).tick

        if only_notes:
            from midi import EventStream, NoteOnEvent, NoteOffEvent, EndOfTrackEvent
            # Only include notes in the stream
            # This is much simpler and faster than the alternative
            events = [ev for ev in sorted(stream.trackpool) if \
                        type(ev) in [NoteOnEvent, NoteOffEvent]]
            events = iter(events)
            try:
                current_event = events.next()
                # Get up to the start point in the stream
                while current_event.tick < tick_offset:
                    current_event = events.next()
            except StopIteration:
                # Got to the end of the stream before we even started
                inputs = []
            else:
                inputs = []
                for chunk_start in range(tick_offset, end_time, tick_unit):
                    chunk_end = chunk_start + tick_unit
                    slc = EventStream()
                    slc.add_track()
                    slc.format = stream.format
                    slc.resolution = stream.resolution
                    slc.segment_start = chunk_start

                    # Add all the note events in this time period
                    try:
                        while current_event.tick < chunk_end:
                            slc.add_event(current_event)
                            current_event = events.next()
                        # Add the end of track event
                        eot = EndOfTrackEvent()
                        eot.tick = chunk_end
                        slc.add_event(eot)
                    except StopIteration:
                        # Reached the end of the stream
                        # NOTE(review): this last segment gets no end of
                        #  track event, so if it holds a single note event
                        #  it is dropped as empty below - confirm intended
                        inputs.append(slc)
                        break

                    inputs.append(slc)
        else:
            # Use slices to do all the necessary repetition of ongoing events
            from midi.slice import EventStreamSlice
            start_times = range(tick_offset, end_time, tick_unit)
            # First slice starts at the offset value
            slices = [
                EventStreamSlice(stream, chunk_start, chunk_start + tick_unit)
                for chunk_start in start_times
            ]
            inputs = [slc.to_event_stream(repeat_playing=False, cancel_playing=False) \
                                for slc in slices]
            # Associate the start time with each segment
            for slc, start_time in zip(inputs, start_times):
                slc.segment_start = start_time

        # Remove empty segments from the start and end
        # A segment with fewer than two events is taken to be empty: there's
        #  always one event - the end of track
        # Both scans are bounded by the list, so a result with no non-empty
        #  segment comes out as an empty list instead of raising IndexError
        first = 0
        while first < len(inputs) and len(inputs[first].trackpool) < 2:
            first += 1
        inputs = inputs[first:]
        # And the end
        last = len(inputs) - 1
        while last >= 0 and len(inputs[last].trackpool) < 2:
            last -= 1
        inputs = inputs[:last + 1]

        if truncate is not None:
            inputs = inputs[:truncate]

        return SegmentedMidiInput(inputs,
                                  time_unit=time_unit,
                                  tick_offset=tick_offset,
                                  name=name,
                                  stream=stream,
                                  gold=gold,
                                  sequence_index=sequence_index)
class SegmentedMidiBulkInput(BulkInput):
    """
    A CSV file containing midi file paths and the parameters for segmenting
    each one.

    May store an index of a gold analysis with each input. This should
    appear in column 4. If these are given, the first line of the file
    should specify the path to the sequence input file as follows::
      GOLD: <relative path>

    Columns: filename, time unit, tick offset, ignore (bool, optional),
    gold id (int, optional)

    """
    INPUT_TYPE = SegmentedMidiInput
    FILE_INPUT_OPTIONS = [
        ModuleOption('truncate', filter=int,
                     help_text="truncate each input to this length.",
                     usage="truncate=L, where L is an integer")
    ]
    SHELL_TOOLS = BulkInput.SHELL_TOOLS + [tools.PlayBulkMidiChunksTool()]

    def __init__(self, inputs):
        self.inputs = inputs

    def __str__(self):
        return "<bulk midi: %s>" % (" ".join([str(mid) for mid in self.inputs]))

    @staticmethod
    def writeln(csv, filename, time_unit=None, tick_offset=0, ignore=False,
                    seq_index=None):
        """
        Writes a line to a segmidi bulk input file, opened as a CSV writer.

        @param csv: CSV writer to write the row to
        @param filename: path of the midi file
        @param time_unit: time unit for the file. "2" is written if this is
            not given (or is 0)
        @type tick_offset: int
        @param tick_offset: tick offset for the file
        @type ignore: bool
        @param ignore: whether the row should be marked to be ignored
        @param seq_index: index of the gold sequence, if there is one

        """
        row = [
            "%s" % filename,
            "%f" % time_unit if time_unit else "2",
            "%d" % tick_offset,
            "TRUE" if ignore else "",
            "%d" % seq_index if seq_index is not None else ""
        ]
        csv.writerow(row)

    @staticmethod
    def from_file(filename, options={}):
        """
        Read a bulk input file and load every midi file it lists, except
        rows marked to be ignored. Paths in the file are taken relative to
        the directory containing it.

        @type filename: str
        @param filename: path of the CSV file
        @type options: dict
        @param options: processed file options. Only C{truncate} is used
        @rtype: L{SegmentedMidiBulkInput}

        """
        import csv
        import os

        # Read in the CSV file
        with open(filename, 'r') as infile:
            data = list(csv.reader(infile))

        base_path = os.path.abspath(os.path.dirname(filename))

        # Check the first line of the file for GOLD input
        gold_prefix = "GOLD:"
        if data and data[0] and data[0][0].startswith(gold_prefix):
            # Cut off exactly the prefix. lstrip() would treat it as a set
            #  of characters and eat into a path starting with G, O, L or D
            gold_path = data[0][0][len(gold_prefix):].strip()
            gold_path = os.path.join(base_path, gold_path)
            # Load the annotated data
            gold_data = AnnotatedDbBulkInput.from_file(gold_path)
            # Ignore this first line now
            data = data[1:]
        else:
            gold_data = None

        truncate = options.get('truncate')

        # Read the file's data and process it
        inputs = []
        for row in data:
            if not row:
                # The CSV reader gives an empty row for a blank line
                continue

            # Optional col 4 allows us to ignore rows for training while
            #  keeping their parameters in the file
            if len(row) > 3:
                ignore = str_to_bool(row[3])
            else:
                ignore = False
            if ignore:
                continue

            # Path of the midi file to read in
            midi_path = os.path.join(base_path, row[0])

            # Prepare the parameters, using the defaults for any column
            #  that is empty or missing
            if len(row) > 1 and row[1]:
                time_unit = float(row[1])
            else:
                time_unit = 2.0
            if len(row) > 2 and row[2]:
                tick_offset = int(row[2])
            else:
                tick_offset = 0

            if len(row) > 4 and gold_data is not None and row[4].strip():
                # A gold sequence analysis was given: load it up
                seq_index = int(row[4])
                gold = gold_data[seq_index].get_gold_analysis()
            else:
                seq_index = None
                gold = None

            # Kept under its own name so that the options we were given
            #  are not replaced by the first row's
            midi_options = SegmentedMidiInput.process_option_dict({
                'time_unit': time_unit,
                'tick_offset': tick_offset,
                'truncate': truncate,
            })
            inputs.append(
                SegmentedMidiInput.from_file(midi_path,
                                             options=midi_options,
                                             gold=gold,
                                             sequence_index=seq_index))
        return SegmentedMidiBulkInput(inputs)
class ChordBulkInput(BulkInput):
    """
    A file containing a list of textual chord sequences.

    This used to be provided fully in the top-level parser script as
    input processing.

    """
    INPUT_TYPE = ChordInput
    FILE_INPUT_OPTIONS = [
        ModuleOption('start', filter=int,
                     help_text="line number to start reading from",
                     usage="start=X, where X is an int"),
        ModuleOption('end', filter=int,
                     help_text="line number at which to stop reading",
                     usage="end=X, where X is an int"),
        ModuleOption('roman', filter=str_to_bool,
                     help_text="read chord symbols as roman numberals. "\
                        "Default is to assume note names",
                     usage="roman=B, where B is a boolean",
                     default=False),
    ]

    def __init__(self, inputs, output_lines=None):
        self.inputs = inputs
        self.output_lines = output_lines

    @staticmethod
    def from_file(filename, options={}):
        """
        Read chord sequences from a text file, one sequence per line.

        Lines beginning ">>" or "//" are comments and blank lines are
        skipped. The text of ">>" comments is stored in C{output_lines},
        keyed by the number of sequences read before the comment. A
        comment beginning ">>=" also names the next sequence.

        @type filename: str
        @param filename: path of the file to read
        @type options: dict
        @param options: processed file options. C{start} and C{end} are
            both indices into the lines of the whole file, and restrict
            which lines are read. C{roman} is taken to be False if absent
        @rtype: L{ChordBulkInput}

        """
        with open(filename, 'r') as f:
            lines = [l.rstrip("\n") for l in f.readlines()]

        # Use the start and end line numbers if they were given
        # Both are applied in a single slice, so that the end is a line
        #  number in the file and not a count from the start line
        lines = lines[options.get('start'):options.get('end')]

        roman = options.get('roman', False)

        # Do all the preprocessing
        output_lines = {}
        inputs = []
        sequence_name = None
        for line in lines:
            if line.startswith(">>"):
                # This is an output comment: store it and move on
                # If this is also a name definition, use it for the next
                #  sequence
                if line[2:].startswith("="):
                    # NOTE(review): the newline has already been stripped,
                    #  so [3:-1] drops the last character of the name -
                    #  confirm against the file format
                    sequence_name = line[3:-1]
                    output_lines[len(inputs)] = line[3:]
                else:
                    output_lines[len(inputs)] = line[2:]
                continue
            elif line.startswith("//"):
                # Non-printing comment
                # This could also be a name definition
                # NOTE(review): this stores the text in output_lines and
                #  does not set sequence_name, which seems at odds with
                #  the two comments above - confirm intended
                if line[2:].startswith("="):
                    output_lines[len(inputs)] = line[3:-1]
                continue
            elif len(line.strip()) == 0:
                # Ignore blank lines
                continue
            else:
                # Otherwise it's an actual chord sequence
                inputs.append(
                    ChordInput.from_string(line,
                                           name=sequence_name,
                                           roman=roman))
                # Reset the sequence name
                sequence_name = None
        return ChordBulkInput(inputs, output_lines=output_lines)

    def to_db_inputs(self):
        """
        Convert every sequence with L{ChordInput.to_db_input} and wrap the
        results in a L{DbBulkInput}.

        @see: L{ChordInput.to_db_input}

        """
        return DbBulkInput([chords.to_db_input() for chords in self.inputs])
def command_line_input(filename=None, filetype=None, options="", allowed_types=None, default_type=None): """ Utility function for processing file input options from the command line. Pass in as args the values straight from the command line options to select a filename, filetype and list of options. Typical command-line options for this purpose (for an optparse option parser C{op}):: op.add_option("--file", "-f", dest="file", action="store", help="use a file to get input from") op.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Use '--filetype help' for a list of available types") op.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Use '--fopt help', with '--ft <type>', for a list of available options") Then you can call this function as:: command_line_input(filename=options.file, filetype=options.filetype, options=options.file_options) @type allowed_types: list of strs @param allowed_types: types of input you want the user to be able to give. If not given, all types are allowed @type default_type: str @param default_type: filetype to assume if no other filetype is given @rtype: L{InputReader} subclass @return: the input wrapper of appropriate type, or None if no input file was given """ if allowed_types is None: allowed_types = get_input_type_names() if filetype is None and default_type is not None: filetype = default_type # Catch a request for filetype help if filetype is not None and filetype.lower() == "help": # Output possible file types print "Allowed input types: %s" % ", ".join(allowed_types) sys.exit(0) # Check that the filetype is valid and get the input type class if it is input_type = get_input_type(filetype) type_name = input_type_name(input_type) if input_type is None: raise InputTypeError, "Unknown filetype '%s'. 
Allowed types are: %s" % (filetype, ", ".join(allowed_types)) if type_name not in allowed_types: raise InputTypeError, "Cannot accept input of type '%s'. Allowed " "types are: %s" % ( filetype, ", ".join(allowed_types), ) if options is not None and options.lower() == "help": # Output help text from jazzparser.utils.options import options_help_text print options_help_text(input_type.FILE_INPUT_OPTIONS, intro="Available options for input type %s" % type_name) sys.exit(0) if filename is None: return None # First get a dict of the options file_options = ModuleOption.process_option_string(options) # Process the options as appropriate for this type file_options = input_type.process_option_dict(file_options) # Instantiate the input from the file as appropriate for the input type input_data = input_type.from_file(filename, file_options) return input_data
def main(): usage = "%prog [options] <model_name> <in-file>" description = "Trains a chord labeling model using the given "\ "input data. The data file may be a stored SequenceIndex file, or "\ "any other type of bulk data file." parser = OptionParser(usage=usage, description=description) parser.add_option( '-p', '--partitions', dest="partitions", action="store", type="int", help= "train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number." ) parser.add_option( '--opts', dest="training_opts", action="append", help= "options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type." ) # File input options parser.add_option( "--filetype", "--ft", dest="filetype", action="store", help= "select the file type for the input file. Same filetypes as jazzparser", default='bulk-db') parser.add_option( "--file-options", "--fopt", dest="file_options", action="store", help= "options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options." ) # Logging output parser.add_option( '--log', dest="log", action="store", help= "file to output training logs to. 
Specify a base filename; <modelname>.log will be added to the end" ) options, arguments = parse_args_with_config(parser) grammar = Grammar() # Handle any training options that were given on the command line if options.training_opts is None: training_opts = {} elif "help" in [opt.lower() for opt in options.training_opts]: print options_help_text(HPChordLabeler.TRAINING_OPTIONS, intro="Training options:") sys.exit(0) else: training_opts = ModuleOption.process_option_string( options.training_opts) if len(arguments) < 2: print >> sys.stderr, "You must specify a model name and an input data file as arguments" sys.exit(1) filename = os.path.abspath(arguments[1]) model_name = arguments[0] # Load the sequence data # Only allow bulk types input_data = command_line_input(filename=filename, filetype=options.filetype, options=options.file_options, allowed_types=get_input_type_names( single=False, bulk=True)) # Only partition the chord data, not the MIDI data if options.partitions is not None and not \ (isinstance(input_data, MidiTaggerTrainingBulkInput) and \ input_data.chords is not None): print >>sys.stderr, "Can only partition chord data and no chord data "\ "was supplied" sys.exit(1) if options.partitions: # The input includes chord training data parts = input_data.chords.get_partitions(options.partitions)[1] models = [("%s%d" % (model_name,num),chord_data) \ for num,chord_data in enumerate(parts)] else: models = [(model_name, None)] for part_name, chord_data in models: if options.log is not None: # Prepare a logger logfile = "%s%s.log" % (options.log, part_name) print "Logging output to file %s" % logfile logger = create_logger(filename=logfile) else: logger = None # Create a fresh model with this name model = HPChordLabeler.train(input_data, part_name, logger=logger, options=training_opts, chord_data=chord_data) print "Trained model %s" % (part_name)
def main():
    """
    Entry point of the main Jazz Parser command-line interface.

    Defines the full set of command-line options (grouped as input,
    parser, multiprocessing, output, output processing, logging and
    shell options), reads them with ``parse_args_with_config`` and then
    validates the basic configuration: logging, grammar name, parser
    name and parser options.

    Returns 0 after printing a requested help listing and 1 if the
    grammar, parser or parser options are invalid.
    """
    set_proc_title("jazzparser")

    usage = "jazzparser [<options>]"
    description = "The main parser interface for the Jazz Parser"

    ## Process the input options
    optparser = OptionParser(usage=usage, description=description)

    # File input options
    group = OptionGroup(optparser, "Input", "Input type and location")
    optparser.add_option_group(group)
    group.add_option(
        "--file", "-f", dest="file", action="store",
        help="use a file to get parser input from. Use --filetype to "
             "specify the type of the file.")
    group.add_option(
        "--filetype", "--ft", dest="filetype", action="store",
        help="select the file type for the input file (--file). Use "
             "'--filetype help' for a list of available types. Default: "
             "chords",
        default='chords')
    group.add_option(
        "--file-options", "--fopt", dest="file_options", action="store",
        help="options for the input file (--file). Type '--fopt help', "
             "using '--ft <type>' to select file type, for a list of "
             "available options.")
    group.add_option(
        "--index", "--indices", dest="input_index", action="store",
        help="select individual inputs to process. Specify as a "
             "comma-separated list of indices. All inputs are loaded as "
             "usual, but only the ith input is processed, for each i in "
             "the list")
    group.add_option(
        "--only-load", dest="only_load", action="store_true",
        help="don't do anything with the inputs, just load and list "
             "them. Handy for checking the inputs load and getting their "
             "indices")
    group.add_option(
        "--partitions", dest="partitions", action="store", type="int",
        help="divide the input data into this number of partitions and "
             "use a different set of models for each. For any parser, "
             "tagger and backoff that takes a 'model' argument, the "
             "partition number will be appended to the given value")
    group.add_option(
        "--seq-parts", "--sequence-partitions", dest="sequence_partitions",
        action="store",
        help="use a chord sequence index to partition the inputs. Input "
             "type (bulk) must support association of the inputs with "
             "chord sequences by id. Sequences in the given sequence "
             "index file are partitioned n ways (--partitions) and the "
             "inputs are processed according to their associated "
             "sequence.")
    group.add_option(
        "--continue", "--skip-done", dest="skip_done", action="store_true",
        help="skip any inputs for which a readable results file already "
             "exists. This is useful for continuing a bulk job that was "
             "stopped in the middle")

    # Parser, supertagger and backoff options
    group = OptionGroup(optparser, "Parser",
                        "Parser, supertagger and backoff parser")
    optparser.add_option_group(group)
    group.add_option(
        "-d", "--derivations", dest="derivations", action="store_true",
        help="keep derivation logs during parse.")
    group.add_option(
        "-g", "--grammar", dest="grammar", action="store",
        help="use the named grammar instead of the default.")
    # Parser options
    group.add_option(
        "-p", "--parser", dest="parser", action="store",
        help="use the named parser algorithm instead of the default. Use "
             "'-p help' to see the list of available parsers. Default: "
             "%s" % settings.DEFAULT_PARSER,
        default=settings.DEFAULT_PARSER)
    group.add_option(
        "--popt", "--parser-options", dest="popts", action="append",
        help="specify options for the parser. Type '--popt help', using "
             "'--parser <name>' to select a parser module, to get a list "
             "of options.")
    # Tagger options
    group.add_option(
        "-t", "--tagger", "--supertagger", dest="supertagger",
        action="store",
        help="run the parser using the named supertagger. Use '-t help' "
             "to see the list of available taggers. Default: "
             "%s" % settings.DEFAULT_SUPERTAGGER,
        default=settings.DEFAULT_SUPERTAGGER)
    # The tagger is selected with -t: the help text must point at a
    # switch that actually exists
    group.add_option(
        "--topt", "--tagger-options", dest="topts", action="append",
        help="specify options for the tagger. Type '--topt help', using "
             "'-t <name>' to select a tagger module, to get a list of "
             "options.")
    # Backoff options
    group.add_option(
        "-b", "--backoff", "--noparse", dest="backoff", action="store",
        help="use the named backoff model as a backoff if the parser "
             "produces no results")
    group.add_option(
        "--bopt", "--backoff-options", "--npo", dest="backoff_opts",
        action="append",
        help="specify options for the backoff model. Type '--npo help', "
             "using '--backoff <name>' to select a backoff modules, to "
             "get a list of options.")

    # Multiprocessing options
    group = OptionGroup(optparser, "Multiprocessing")
    optparser.add_option_group(group)
    group.add_option(
        "--processes", dest="processes", action="store", type="int",
        help="number of processes to create to perform parses in "
             "parallel. Default: 1, i.e. no process pool. Use -1 to "
             "create a process for every input",
        default=1)

    # Output options
    group = OptionGroup(optparser, "Output")
    optparser.add_option_group(group)
    group.add_option(
        "--output", dest="output", action="store",
        help="directory name to output parse results to. A filename "
             "specific to the individual input will be appended to this")
    group.add_option(
        "--topn", dest="topn", action="store", type="int",
        help="limit the number of final results to store in the output "
             "file to the top n by probability. By default, stores all")
    group.add_option(
        "--output-opts", "--oopts", dest="output_opts", action="store",
        help="options that affect the output formatting. Use "
             "'--output-opts help' for a list of options.")
    group.add_option(
        "-a", "--atomic-results", dest="atoms_only", action="store_true",
        help="only include atomic categories in the results.")
    group.add_option(
        "-l", "--latex", dest="latex", action="store_true",
        help="output all results as Latex source. Used to produce a "
             "whole Latex document, but doesn't any more")
    group.add_option(
        "--all-times", dest="all_times", action="store_true",
        help="display all timing information on semantics in output.")
    group.add_option(
        "-v", "--debug", dest="debug", action="store_true",
        help="output verbose debugging information.")
    group.add_option(
        "--time", dest="time", action="store_true",
        help="time how long the parse takes and output with the results.")
    group.add_option(
        "--no-results", dest="no_results", action="store_true",
        help="don't print out the parse results at the end. Obviously "
             "you'll want to make sure they're going to a file "
             "(--output). This is useful for bulk parse jobs, where the "
             "results produce a lot of unnecessary output")
    group.add_option(
        "--no-progress", dest="no_progress", action="store_true",
        help="don't output the summary of completed sequences after each "
             "one finishes")

    # Output analysis and harmonical
    group = OptionGroup(optparser, "Output processing",
                        "Output analysis and harmonical")
    optparser.add_option_group(group)
    group.add_option(
        "--harmonical", dest="harmonical", action="store",
        help="use the harmonical to play the chords justly intoned "
             "according to the top result and output to a wave file.")
    group.add_option(
        "--enharmonical", dest="enharmonical", action="store",
        help="use the harmonical to play the chords in equal temperament "
             "and output to a wave file.")
    group.add_option(
        "--midi", dest="midi", action="store_true",
        help="generate MIDI files from the harmonical, instead of wave "
             "files.")
    group.add_option(
        "--tempo", dest="tempo", action="store", type=int,
        help="tempo to use for the generated music (see "
             "--harmonical/--enharmonical). Default: 120",
        default=120)
    group.add_option(
        "--lh-analysis", dest="lh_analysis", action="store_true",
        help="output the Longuet-Higgins space interpretation of the "
             "semantics for each result.")
    group.add_option(
        "--lh-coordinates", dest="lh_coord", action="store_true",
        help="like lh-analysis, but displays the coordinates of the "
             "points instead of their names.")

    # Logging options
    group = OptionGroup(optparser, "Logging")
    optparser.add_option_group(group)
    group.add_option(
        "--long-progress", dest="long_progress", action="store_true",
        help="print a summary of the chart so far after each chord/word "
             "has been processed.")
    group.add_option(
        "--progress", "--short-progress", dest="short_progress",
        action="store_true",
        help="print a small amount of information out during parsing to "
             "indicate progress.")
    group.add_option(
        "--logger", dest="logger", action="store",
        help="directory to put parser logging in. A filename based on an "
             "identifier for each individual input will be appended.")

    # Shell options
    group = OptionGroup(
        optparser, "Shell",
        "Interactive shell for inspecting results and parser state")
    optparser.add_option_group(group)
    group.add_option(
        "-i", "--interactive", dest="interactive", action="store_true",
        help="enter interactive mode after parsing.")
    group.add_option(
        "--error", dest="error_shell", action="store_true",
        help="catch any errors, report them and then enter the "
             "interactive shell. This also catches keyboard interrupts, "
             "so you can use it to halt parsing and enter the shell.")

    # Read in command line options and args
    options, clinput = parse_args_with_config(optparser)

    ########################### Option processing ####################
    # Get the log level option first, so we can start using the logger
    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    # Set up a logger
    init_logging(log_level)

    if options.latex:
        settings.OPTIONS.OUTPUT_LATEX = True

    if options.logger:
        # A directory that per-input parse logs are written into
        parse_logger_dir = options.logger
        check_directory(parse_logger_dir)
    else:
        parse_logger_dir = None

    ######## Grammar ########
    # Check the grammar actually exists before trying to load it
    grammar_names = get_grammar_names()
    if options.grammar is not None and options.grammar not in grammar_names:
        # This is not a valid grammar name
        # NOTE(review): `logger` is not defined in this function;
        # presumably a module-level logger -- confirm
        logger.error("The grammar '%s' does not exist. Possible "
                     "grammars are: %s." %
                     (options.grammar, ", ".join(grammar_names)))
        return 1
    grammar = get_grammar(options.grammar)

    ######## Parser ########
    # Load the requested parser
    from jazzparser.parsers import PARSERS
    if options.parser.lower() == "help":
        print("Available parsers are: %s" % ", ".join(PARSERS))
        return 0
    try:
        parser_cls = get_parser(options.parser)
    except ParserLoadError:
        logger.error("The parser '%s' could not be loaded. Possible "
                     "parsers are: %s" %
                     (options.parser, ", ".join(PARSERS)))
        return 1

    # Get parser options: --popt may be repeated, so join the pieces
    # into a single option string
    if options.popts is not None:
        poptstr = options.popts
        if "help" in [s.strip().lower() for s in poptstr]:
            # Output this parser's option help
            from jazzparser.utils.options import options_help_text
            print(options_help_text(
                parser_cls.PARSER_OPTIONS,
                intro="Available options for selected parser"))
            return 0
        poptstr = ":".join(poptstr)
    else:
        poptstr = ""
    popts = ModuleOption.process_option_string(poptstr)
    # Check that the options are valid
    try:
        parser_cls.check_options(popts)
    except ModuleOptionError as err:
        logger.error("Problem with parser options (--popt): %s" % err)
        return 1
def main():
    """
    Command-line entry point for training Raphael & Stoddard HMM models.

    Takes a model name and an input file listing midi files. Each midi
    handler's emission stream is read, optionally truncated
    (``--max-length``), split into chunks (``--split-length``) and
    filtered by length (``--min-length``). One model is then initialized
    and trained per partition (``--partitions``), or a single model if no
    partitioning is requested.

    Exits with status 0 after printing a help listing and with status 1
    on missing arguments, an unknown model type or an unknown chord set.
    """
    def _stderr(message):
        # Write one line to stderr, as a redirected print would
        sys.stderr.write("%s\n" % message)

    partitions_help = (
        "train a number of partitions of the given data. Trains a model "
        "on the complement of each partition, so it can be tested on the "
        "partition. The models will be named <NAME>n, where <NAME> is "
        "the model name and n the partition number."
    )
    split_help = (
        "limits the length of the training midi sequences in chunks, but "
        "instead of throwing away everything after the first N chunks, "
        "splits it off as if it were starting a new sequence. This is "
        "good for multiprocessing, since many short sequences can be "
        "multitasked, whilst few long ones cannot"
    )

    cli = OptionParser(
        usage="%prog [options] <model_name> <input-file>",
        description=(
            "Trains a model for the RaphSto chord labelling "
            "algorithm on a file that contains a list of midi files with "
            "training options"
        ),
    )
    cli.add_option("-p", "--partitions", dest="partitions",
                   action="store", type="int", help=partitions_help)
    cli.add_option("--opts", dest="opts", action="store",
                   help="options to pass to the model trainer. Type "
                        "'--opts help' for a list of options for a "
                        "particular model type.")
    cli.add_option("--proc", "--processes", dest="processes",
                   action="store", type="int",
                   help="number of parallel processes to spawn for the "
                        "training. Use -1 to spawn one per training "
                        "sequence (after splitting: see split_length)",
                   default=1)
    cli.add_option("--max-length", dest="max_length", action="store",
                   type="int",
                   help="limits the length of the training midi "
                        "sequences in chunks")
    cli.add_option("--split-length", dest="split_length", action="store",
                   type="int", help=split_help)
    cli.add_option("--min-length", dest="min_length", action="store",
                   type="int",
                   help="ignores any sequences under this number of "
                        "chunks. This is useful with --split-length, "
                        "which can leave very short sequences from the "
                        "end of a split sequence")
    cli.add_option("--progress-out", dest="progress_out", action="store",
                   help="output logging info to a file instead of the "
                        "command line")
    cli.add_option("--init-model", dest="init_model", action="store",
                   help="initialize the model using parameters from an "
                        "already trained model")
    cli.add_option("--init-ctrans", dest="init_ctrans", action="store",
                   help="initialize the chord transition distribution "
                        "using these parameters. Comma-separated list of "
                        "params given as C0->C1-P, where C0 and C1 are "
                        "chords (I, II, etc) and P is a float probability")
    cli.add_option("--chord-set", dest="chord_set", action="store",
                   help="use a chord set other than the default. Use "
                        "value 'help' to see a list. Has no effect in "
                        "combination with --init-model, since the old "
                        "model's chord set will be used")
    cli.add_option("-m", "--model-type", dest="model_type",
                   action="store",
                   help="select a model type: one of %s (default: "
                        "standard)" % ", ".join(MODEL_TYPES.keys()),
                   default="standard")
    options, arguments = parse_args_with_config(cli)

    # '--opts help' just lists the trainer's options
    if options.opts == "help":
        print(options_help_text(
            RaphstoBaumWelchTrainer.OPTIONS,
            intro="Training options for Raphael and Stoddard HMMs"))
        sys.exit(0)
    opts = ModuleOption.process_option_string(options.opts)

    if len(arguments) < 2:
        _stderr("You must specify a model name and an input data file as arguments")
        sys.exit(1)
    model_name = arguments[0]
    filename = os.path.abspath(arguments[1])

    _stderr("Raphsto training beginning at %s" % datetime.now().isoformat(" "))

    # Progress goes to a file if one was named, otherwise to stdout
    logfile = options.progress_out
    to_stdout = logfile is None
    if to_stdout:
        _stderr("Outputing logging to stdout")
    else:
        _stderr("Outputing logging info to %s" % logfile)
    logger = create_logger(name="raphsto_train", filename=logfile,
                           stdout=to_stdout)
    logger.info("Raphael and Stoddard HMM model training")

    if options.model_type not in MODEL_TYPES:
        _stderr("Model type must be one of: %s" % ", ".join(MODEL_TYPES))
        sys.exit(1)
    model_cls = MODEL_TYPES[options.model_type]

    chord_set = options.chord_set
    if chord_set == "help":
        print("Available chord sets: %s" % ", ".join(constants.CHORD_SETS.keys()))
        sys.exit(0)
    elif chord_set is not None:
        # Make sure the named chord set exists
        if chord_set in constants.CHORD_SETS:
            logger.info("Using chord set '%s'" % chord_set)
        else:
            _stderr("Chord set '%s' does not exist" % chord_set)
            sys.exit(1)

    # Read in the training data
    midis = InputSourceFile(filename)
    handlers = midis.get_handlers()
    logger.info("Reading in %d midi files..." % len(midis.inputs))
    max_len = options.max_length
    split_len = options.split_length
    training_data = []
    for index, handler in enumerate(handlers):
        logger.info("%s: %s" % (index, midis.inputs[index][0]))
        emissions = handler.get_emission_stream()[0]
        if max_len is not None and len(emissions) > max_len:
            logger.info("Truncating file %d to %d chunks (was %d)" %
                        (index, max_len, len(emissions)))
            emissions = emissions[:max_len]
        if split_len is not None:
            logger.info("Splitting sequence %d into sequence no longer "
                        "than %d chunks" % (index, split_len))
            # Peel off full-length chunks until what's left fits
            while len(emissions) > split_len:
                head, emissions = emissions[:split_len], emissions[split_len:]
                training_data.append(head)
        training_data.append(emissions)

    min_len = options.min_length
    if min_len is not None:
        # Discard every sequence shorter than the minimum length
        count_before = len(training_data)
        training_data = [seq for seq in training_data if len(seq) >= min_len]
        discarded = count_before - len(training_data)
        if discarded != 0:
            logger.info("Threw away %d short sequences (below %d chunks)" %
                        (discarded, min_len))

    lengths = ", ".join(str(len(seq)) for seq in training_data)
    logger.info("Training on %d sequences. Lengths: %s" %
                (len(training_data), lengths))

    if options.partitions is None:
        models = [(model_name, training_data)]
    else:
        parts = holdout_partition(training_data, options.partitions)
        models = [("%s%d" % (model_name, num), data)
                  for num, data in enumerate(parts)]

    # -1 is a special value: one process per training sequence
    processes = len(training_data) if options.processes == -1 \
                    else options.processes

    for part_name, data in models:
        logger.info("Training model '%s' on %d midis" % (part_name, len(data)))
        if options.init_model is None:
            # TODO: make these probs an option
            ctype_params = (0.5, 0.3, 0.2)
            logger.info("Initializing to naive chord types using parameters: "
                        "%s, %s, %s" % ctype_params)
            init_kwargs = {"model_name": part_name}
            if options.chord_set is not None:
                # Specify a chord set for the model
                init_kwargs["chord_set"] = options.chord_set
            model = model_cls.initialize_chord_types(ctype_params, **init_kwargs)
        else:
            # Start from the parameters of an already trained model
            logger.info("Initializing using parameters from model '%s'" %
                        options.init_model)
            model = model_cls.initialize_existing_model(options.init_model,
                                                        model_name=part_name)

        # Initialize the chord transition probabilities if given
        # NOTE(review): nesting level reconstructed from flattened source;
        # assumed to apply whichever way the model was initialized -- confirm
        if options.init_ctrans is not None:
            logger.info("Initializing chord transition distribution to %s" %
                        options.init_ctrans)
            model.set_chord_transition_probabilities(options.init_ctrans)

        # Retrain the model on the loaded data
        trainer_cls = model_cls.get_trainer()
        trainer = trainer_cls(model, options=opts)
        trainer.train(data, logger=logger, processes=processes,
                      save_intermediate=True)

    _stderr("Training terminating at %s" % datetime.now().isoformat(" "))
def _derivations_flag(value):
    """
    Interpret a value given for the 'derivations' option as a boolean.

    Plain ``bool`` is not usable as the option filter: every non-empty
    string is truthy, so ``derivations=False`` on the command line would
    switch derivations *on*. String values are compared, ignoring case
    and surrounding whitespace, against a small set of "false" spellings;
    any other kind of value is converted with ``bool``.
    """
    if hasattr(value, "strip") and hasattr(value, "lower"):
        return value.strip().lower() not in ("", "false", "0", "no", "off")
    return bool(value)


class DirectedCkyParser(Parser):
    """
    DirectedCkyParser is a special version of the CKY parser that tries
    to produce a parse according to a pre-built derivation tree.

    Why? Canonical trees are stored implicitly in the Jazz corpus.
    We can build the explicit structure of the trees, in accordance with
    the implicit manual annotations, but this will not contain any signs
    on internal nodes. The structure does not produce a parse in itself
    or even verify that the sequence can be parsed with that structure.

    The purpose of the DirectedCkyParser is to take a description of
    this annotated structure and actually perform the parse, packing
    the chart with only those signs that the derivation structure
    produces.

    The parser should be used with a tagger that assigns only those
    signs that were annotated. Use the PretaggedTagger to do this.

    """
    PARSER_OPTIONS = Parser.PARSER_OPTIONS + [
        ModuleOption(
            'derivations',
            filter=_derivations_flag,
            help_text="Store derivation traces along with the results",
            usage="derivations=X, where X is 'True' or 'False'.",
            default=None,
        ),
    ]

    def __init__(self, grammar, tagger, derivation_tree=None, *args, **kwargs):
        """
        @param derivation_tree: the tree that directs the parse. Although
            it is a keyword argument, it is required.
        @raise ValueError: if no derivation tree is given.
        """
        if derivation_tree is None:
            raise ValueError("DirectedCkyParser must be instantiated "
                "with a derivation tree in kwarg 'derivation_tree'.")
        self.derivation_tree = derivation_tree
        super(DirectedCkyParser, self).__init__(grammar, tagger, *args, **kwargs)

    def _create_chart(self, *args, **kwargs):
        """Create the chart for this parse, store it on C{self.chart} and return it."""
        self.chart = Chart(self.grammar, *args, **kwargs)
        return self.chart

    def parse(self, derivations=False, summaries=False):
        """
        Fill a chart by following the derivation tree and return the
        parses found in it.

        The lexical signs for every word are taken from the tagger once
        and added to the chart. The derivation tree is then walked
        depth-first, left to right, applying each internal node's rule to
        the spans covered by its children.

        @param derivations: whether to keep derivation traces. Overridden
            by the 'derivations' parser option when that option is set.
        @param summaries: not used by this parser.
        @return: the value of the chart's C{parses} attribute.
        @raise DirectedParseError: if the tree is malformed, names an
            unknown rule, or one of its rules cannot be applied.

        """
        if 'derivations' in self.options and \
                self.options['derivations'] is not None:
            derivations = self.options['derivations']

        # Find out from the tagger how long the input it read in was
        input_length = self.tagger.input_length
        # Create a chart with no signs in it: they're added gradually.
        # Give every position its own list: [[]] * n would repeat one
        # shared list object n times
        empty_signs = [[] for _ in range(input_length)]
        chart = self._create_chart(signs=empty_signs,
                                   derivations=derivations)

        # Only get signs from the tagger once: we expect to get them all
        # first time. Add all the lexical signs to the chart
        for word in range(input_length):
            new_cat_pairs = self.tagger.get_signs_for_word(word)
            new_cats = [cat for (cat, tag, prob) in new_cat_pairs]
            chart.add_word_signs(new_cats, word, self.tagger.get_word(word))

        # Get a mapping from the tree's short rule names to the rule
        # instances
        rule_mapping = self.grammar.formalism.PcfgParser.rule_short_names

        def _signs_string(from_node, to_node):
            # Comma-separated rendering of the signs on one chart span
            return ", ".join(
                ["%s" % s for s in chart.get_signs(from_node, to_node)])

        # Perform the parse bottom up by a depth-first left-to-right
        # recursion on the derivation tree. Recursively parse children
        # of each node, before applying rules for the node itself.
        def _fill_chart(start, tree_node):
            """
            Recursively fills the chart using the subtree rooted by
            tree_node, using start as the leftmost node of the chart.
            Returns the resulting rightmost node covered by this span.

            """
            if hasattr(tree_node, 'children') and len(tree_node.children) > 0:
                ### An internal node
                if len(tree_node.children) > 2:
                    raise DirectedParseError("invalid derivation tree. "
                        "Nodes may have up to 2 children. This node has "
                        "%d: %s" % (len(tree_node.children), tree_node))
                # First recurse to the sub-parses
                sub_end = start
                middle = None
                for child in tree_node.children:
                    sub_end = _fill_chart(sub_end, child)
                    if middle is None:
                        # The end of the first child is the split point
                        middle = sub_end
                # We now know where this span ends
                end = sub_end

                # Look up and instantiate the rule associated with the node
                try:
                    rule_details = rule_mapping[tree_node.rule]
                except KeyError:
                    raise DirectedParseError("tree node %s specifies a "
                        "rule '%s' which is not defined for this "
                        "formalism. Are you using the right formalism "
                        "for your data?" % (tree_node, tree_node.rule))
                rule_cls = self.grammar.formalism.rules[rule_details[0]]
                rule_kwargs = {
                    'grammar': self.grammar,
                    'modalities': self.grammar.modality_tree,
                }
                rule_kwargs.update(rule_details[1])
                rule = rule_cls(**rule_kwargs)

                # Check we have the right number of children
                if len(tree_node.children) != rule.arity:
                    raise DirectedParseError("a node was encountered "
                        "that does not have the right number of children "
                        "for its rule. %s must have %d children." %
                        (tree_node.rule, rule.arity))

                # Apply the rule to its one or two arguments
                if rule.arity == 1:
                    added = chart.apply_unary_rule(rule, start, end)
                elif rule.arity == 2:
                    added = chart.apply_binary_rule(rule, start, middle, end)

                # If nothing was added to the chart, the rule must have
                # failed
                if not added:
                    # The description of the inputs is only needed for
                    # this error, so it's only built on failure
                    if rule.arity == 1:
                        debug_inputs = "%s, [%s]" % \
                            (rule, _signs_string(start, end))
                    else:
                        debug_inputs = "%s, [%s] and [%s]" % \
                            (rule, _signs_string(start, middle),
                             _signs_string(middle, end))
                    # No point in continuing, since stuff further up the
                    # tree will inevitably fail
                    raise DirectedParseError("failed to apply rule %s. "
                        "Giving up on parse. "
                        "Tree: %s. Inputs: %s." %
                        (tree_node.rule, tree_node, debug_inputs))
            elif hasattr(tree_node, 'chord'):
                ### Leaf node
                # We assume this lines up with the correct position in
                # the tags that the tagger has given us.
                # This arc is a leaf, so only has a span of 1.
                end = start + 1
            else:
                # Tree does not conform to correct interface
                raise DirectedParseError("derivation tree for directed "
                    "parse should be made up of internal trees with "
                    "children and leaves with a chord attribute. This "
                    "node is neither: %s" % tree_node)
            return end

        _fill_chart(0, self.derivation_tree)

        return chart.parses
class CandcTagger(ModelTagger):
    """
    Superclass of both kinds of C&C tagger. Don't use this: use one
    of the subclasses below.

    The constructor does all the tagging work: it checks the C&C
    installation and model exist, runs the C&C command (named by the
    subclass's C{command} attribute) on observations built from the
    input, and stores the tags the subclass reads from the output in
    C{self.tags}.

    """
    MODEL_CLASS = CandcTaggerModel
    COMPATIBLE_FORMALISMS = [
        'music_roman',
        'music_keyspan',
        'music_halfspan',
    ]
    INPUT_TYPES = ['db', 'chords']
    # Probability ratio between one tag and the next that allows the
    # second to be returned in the same batch as the first
    TAG_BATCH_RATIO = 0.8
    DEFAULT_UNSEEN_TAG_PROB = 0.001

    TAGGER_OPTIONS = [
        ModuleOption('batch', filter=float,
            help_text="Probability ratio between one tag and the next "
                "that allows the second to be returned in the same batch.",
            usage="batch=X, where X is a floating point value between 0 and 1",
            default=TAG_BATCH_RATIO),
        ModuleOption('model',
            help_text="Name of the C&C trained model to use. Use the C&C "
                "training scripts to produce this.",
            usage="model=X, where X is the model name. Split up multi-level models with dots.",
            required=True),
        ModuleOption('unseen_tag_prob', filter=float,
            help_text="Probability mass reserved on each word so that some "
                "probability is assigned to tags never seen in the training "
                "set. This is a form of plus-n smoothing. "
                "Substracted from the total probability of tags for "
                "each word and distributed evenly across all tags.",
            usage="unseen_tag_prob=X, where X is a floating point value between 0 and 1",
            default=DEFAULT_UNSEEN_TAG_PROB),
        ModuleOption('last_batch', filter=str_to_bool,
            help_text="Use all possible tags, including the last, lowest "
                "probability batch, which typically acts as a bin for "
                "all remaining tags",
            usage="last_batch=X, where X is 'true' or 'false'",
            default=True),
    ] + ModelTagger.TAGGER_OPTIONS

    def __init__(self, grammar, input, options=None, dict_cutoff=5, *args, **kwargs):
        """
        @param options: tagger options; an empty dictionary is used if
            none are given.
        @param dict_cutoff: passed to the C&C command as --dict_cutoff.
        @raise NotImplementedError: if CandcTagger itself is instantiated.
        @raise CandcConfigurationError: if the C&C directories, command or
            model cannot be found.
        @raise CandcTaggingError: if the C&C command can't be run or is
            terminated with a negative return code.
        """
        # Never share a default dictionary between instances
        if options is None:
            options = {}
        super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs)
        process_chord_input(self)

        if type(self) is CandcTagger:
            raise NotImplementedError("Tried to instantiate CandcTagger "
                "directly. You should use one of its subclasses.")
        self.tag_batch_ratio = self.options['batch']
        # Multi-level model names are given separated by dots
        model = self.options['model'].split('.')

        # Check that candc is available for supertagging
        if not os.path.exists(settings.CANDC.BASE_PATH):
            raise CandcConfigurationError("The C&C parser base "
                "directory %s does not exist" % settings.CANDC.BASE_PATH)
        if not os.path.exists(settings.CANDC.MODELS_PATH):
            raise CandcConfigurationError("The C&C parser models "
                "directory %s does not exist" % settings.CANDC.MODELS_PATH)
        candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
        if not os.path.exists(candc_cmd):
            raise CandcConfigurationError("The C&C supertagger command "
                "%s does not exist. Have you built it?" % candc_cmd)
        # Check the model exists
        candc_model = os.path.join(settings.CANDC.MODELS_PATH, *(model))
        if not os.path.exists(candc_model):
            raise CandcConfigurationError("The C&C model given (%s) "
                "doesn't exist." % candc_model)

        # Create a logger to dump the output to
        logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
        candc_logger = create_logger(filename=logfile)
        self.logger.info("Logging C&C output to %s" % logfile)
        # Note in the log what we're trying to tag
        candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input]))

        # Read in the list of tags to smooth over
        self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))

        # Read in extra options: one "name:value" pair per line.
        # Blank lines are skipped so that they can't break the parsing
        opts_filename = os.path.join(candc_model, "jpopts")
        if not os.path.exists(opts_filename):
            self.extra_opts = {}
        else:
            with open(opts_filename, 'r') as opts_file:
                self.extra_opts = dict(
                    [line.strip("\n").split(":", 1)
                        for line in opts_file if line.strip()])
        # Pull the chord mapping out of the options
        self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))

        # Spawn a process to do the tagging
        candc_command = [candc_cmd, "--model", candc_model,
                         "--dict_cutoff", "%d" % dict_cutoff] + self.extra_args
        self.tagger = Popen(candc_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        candc_logger.info("C&C command: %s" % " ".join(candc_command))

        self.tokens = self.input
        # Build one observation from each pair of consecutive tokens;
        # the last token is paired with None
        observations = [
            interval_observation_from_chord_string_pair(
                ch1, ch2, type_mapping=self.chordmap)
            for ch1, ch2 in group_pairs(self.tokens + [None])
        ]
        # Add a dummy POS tag to each input item
        self.observations = ["%s|C" % t for t in observations]
        candc_logger.info("Input: %s" % " ".join(self.observations))

        # Run the tagger on this input
        # NOTE(review): `logger` below is not defined in this method;
        # presumably a module-level logger distinct from self.logger -- confirm
        try:
            tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations))
        except OSError as err:
            logger.error("Could not run the C&C supertagger (%s)" % err)
            candc_logger.error("Error: %s" % err)
            # Output the actual error that the command returned
            error = self.tagger.stderr.read()
            logger.error("C&C returned the error: %s" % error)
            candc_logger.error("C&C error: %s" % error)
            raise CandcTaggingError("error running the C&C supertagger: %s" % error)
        # C&C uses ANSI color commands in the output: remove them
        tagger_out = remove_ansi_colors(tagger_out)
        tagger_err = remove_ansi_colors(tagger_err)
        # The tagger process should now be terminated. Check it didn't
        # fall over
        return_code = self.tagger.returncode
        if return_code < 0:
            raise CandcTaggingError("The C&C tagger terminated with return code %s. "
                "Error output for the tagging: %s" % (return_code, tagger_err))

        # Format the string for slightly easier reading in the logfile
        log_output = tagger_out.replace("\t", ", ")
        output_lines = [line for line in log_output.split("\n") if line.strip()]
        log_output = "\n".join(
            ["%d-%d: %s" % (i, i + 1, outline)
                for (i, outline) in enumerate(output_lines)])
        candc_logger.info("Output: %s" % log_output)
        candc_logger.info("Stderr output: %s" % tagger_err)

        # Get the tags out of the tagger output.
        # The output format for the different taggers varies, so the
        # subclass does the reading
        self.tags = self._tags_from_output(tagger_out)

        # Check for bogus tags
        # The tagger may return tags that can't actually be
        # instantiated with the word, since it doesn't know about
        # the lexicon: ignore them
        self.tags = [
            [(sign, tag, prob) for (sign, tag, prob) in taglist
                if sign is not None]
            for taglist in self.tags
        ]
def __init__(self, options=None):
    """
    Store the options processed against this class's ``OPTIONS``.

    @param options: dictionary of option values; an empty dictionary is
        used if none is given.
    """
    # A fresh dictionary per call: a mutable default would be one object
    # shared by every instance created without options
    if options is None:
        options = {}
    self.options = ModuleOption.process_option_dict(options, self.OPTIONS)
def main(): usage = "%prog [<options>]" description = "Runs a supertagger from the Jazz Parser to tag some input "\ "but just outputs the results, rather than continuing to parse." optparser = OptionParser(usage=usage, description=description) # Tagger options optparser.add_option("-t", "--tagger", "--supertagger", dest="supertagger", action="store", help="run the parser using the named supertagger. Use '-t help' to see the list of available taggers. Default: %s" % settings.DEFAULT_SUPERTAGGER, default=settings.DEFAULT_SUPERTAGGER) optparser.add_option("--topt", "--tagger-options", dest="topts", action="append", help="specify options for the tagger. Type '--topt help', using '-u <name>' to select a tagger module, to get a list of options.") # Commonly-used misc optparser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.") # File input options optparser.add_option("--file", "-f", dest="file", action="store", help="use a file to get parser input from. Use --filetype to specify the type of the file.") optparser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file (--file). Use '--filetype help' for a list of available types. Default: chords", default='chords') optparser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file (--file). Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.") # Misc options optparser.add_option("-v", "--debug", dest="debug", action="store_true", help="output verbose debugging information.") optparser.add_option("-i", "--interactive", dest="interactive", action="store_true", help="instead of just outputing all tags in one go, wait for user input between each iteration of adaptive supertagging") # Logging options optparser.add_option("--logger", dest="logger", action="store", help="directory to put parser logging in. 
A filename based on an identifier for each individual input will be appended.") # Read in command line options and args options, clinput = parse_args_with_config(optparser) ########################### Option processing #################### if options.logger: # Directory parse_logger_dir = options.logger check_directory(parse_logger_dir) else: parse_logger_dir = None ######## Grammar ######## # Read in the grammar grammar = get_grammar(options.grammar) ######## Supertagger ######## # Load the supertagger requested if options.supertagger.lower() == "help": print "Available taggers are: %s" % ", ".join(TAGGERS) return 0 try: tagger_cls = get_tagger(options.supertagger) except TaggerLoadError: logger.error("The tagger '%s' could not be loaded. Possible "\ "taggers are: %s" % (options.supertagger, ", ".join(TAGGERS))) return 1 # Get supertagger options before initializing the tagger if options.topts is not None: toptstr = options.topts if "help" in [s.strip().lower() for s in toptstr]: # Output this tagger's option help from jazzparser.utils.options import options_help_text print options_help_text(tagger_cls.TAGGER_OPTIONS, intro="Available options for selected tagger") return 0 toptstr = ":".join(toptstr) else: toptstr = "" topts = ModuleOption.process_option_string(toptstr) # Check that the options are valid try: tagger_cls.check_options(topts) except ModuleOptionError, err: print "Problem with tagger options (--topt): %s" % err return 1