def test_detect_input_type(self):
        """
        Checks that the type reported by detect_input_type agrees with
        input_type_name for several input types, that bulk input is accepted
        when allow_bulk is given, and that type restriction rejects
        non-matching input.

        """
        # Load some input: DbInput
        dbi = DbInput.from_file(DB_SEQUENCES_FILE, {"index": 0})
        # Run it through the preprocessor
        datatype, obj = detect_input_type(dbi)
        # Get the datatype from the type name lists
        datatype2 = input_type_name(type(obj))
        self.assertEqual(datatype, datatype2)

        # Do the same with ChordInput
        ci = ChordInput.from_file(CHORDS_FILE, options={"roman": True})
        datatype, obj = detect_input_type(ci)
        datatype2 = input_type_name(type(obj))
        self.assertEqual(datatype, datatype2)

        # Try some bulk input
        bulk = DbBulkInput.from_file(DB_SEQUENCES_FILE)
        datatype, obj = detect_input_type(bulk, allow_bulk=True)
        datatype2 = input_type_name(type(obj))
        self.assertEqual(datatype, datatype2)

        # Try restricting the allowed type
        datatype, obj = detect_input_type(ci, allowed=["chords"])
        # And this one should get rejected.
        # Note: assertRaises takes the callable's args/kwargs directly, not
        # packed into a tuple and dict; the old form called detect_input_type
        # with a tuple and a dict as two positional arguments.
        self.assertRaises(InputTypeError, detect_input_type, ci, allowed="db")
Example #2
0
    def test_detect_input_type(self):
        """
        Round-trips several input types through detect_input_type and checks
        the reported type against input_type_name. Also covers bulk input
        (allow_bulk) and the 'allowed' type restriction, including rejection.

        """
        # Load some input: DbInput
        dbi = DbInput.from_file(DB_SEQUENCES_FILE, {'index': 0})
        # Run it through the preprocessor
        datatype, obj = detect_input_type(dbi)
        # Get the datatype from the type name lists
        datatype2 = input_type_name(type(obj))
        self.assertEqual(datatype, datatype2)

        # Do the same with ChordInput
        ci = ChordInput.from_file(CHORDS_FILE, options={'roman': True})
        datatype, obj = detect_input_type(ci)
        datatype2 = input_type_name(type(obj))
        self.assertEqual(datatype, datatype2)

        # Try some bulk input
        bulk = DbBulkInput.from_file(DB_SEQUENCES_FILE)
        datatype, obj = detect_input_type(bulk, allow_bulk=True)
        datatype2 = input_type_name(type(obj))
        self.assertEqual(datatype, datatype2)

        # Try restricting the allowed type
        datatype, obj = detect_input_type(ci, allowed=['chords'])
        # And this one should get rejected.
        # assertRaises spreads extra args/kwargs into the call itself; the
        # previous form passed a tuple and a dict as plain positional args,
        # so the intended keyword 'allowed' never reached detect_input_type.
        self.assertRaises(InputTypeError, detect_input_type, ci, allowed='db')
Example #3
0
    def __init__(self,
                 grammar,
                 input,
                 options=None,
                 original_input=None,
                 logger=None):
        """
        The tagger must have reference to the grammar being used to parse
        the input. It must also be given the full input when instantiated.
        The format of this input will depend on the tagger: for example,
        it might be a string or a MIDI file.
        
        @param grammar: the grammar the tagger will be used with; its
            formalism must be listed in C{COMPATIBLE_FORMALISMS}.
        @param input: the input to tag, in whatever form this tagger accepts
            (checked against C{INPUT_TYPES} and preprocessed).
        @param options: tagger-specific options dict. Defaults to an empty
            dict (a fresh one per call, to avoid the shared mutable default).
        @param original_input: the input in its original, unprocessed form. This 
            will usually be a string. This is optional, but in some 
            circumstances things might fall apart if it hasn't been given.
            E.g. using a backoff model as backoff from a tagging model requires 
            the original input to be passed to the backoff model.
        @param logger: optional progress logger. Logging will be sent to this 
            during initialization of the tagger and tagging. If not given, the 
            logging will be lost. Subclasses may access the logger (or a dummy 
            logger if none was given) in C{self.logger}.
        
        @raise TaggerLoadError: if the grammar's formalism is not supported
            by this tagger.
        
        """
        # Don't use a mutable default argument: it would be shared between calls
        if options is None:
            options = {}
        self.grammar = grammar
        # Check the formalism is one that's allowed by this tagger
        formalism = self.grammar.formalism.get_name()
        if formalism not in self.COMPATIBLE_FORMALISMS:
            # Call-form raise works in both Python 2 and 3, unlike the old
            # "raise Exc, msg" statement form
            raise TaggerLoadError("Formalism '%s' cannot be used with "
                "tagger '%s'" % (formalism, self.name))

        # Check what input type we've received and preprocess it
        datatype, input = detect_input_type(input, allowed=self.INPUT_TYPES, \
                                errmess=" for use with tagger '%s'" % self.name)
        # Store this for the subclass to use as appropriate
        self.input = input
        if original_input is None:
            self.original_input = input
        else:
            self.original_input = original_input
        # Subclasses may redefine self.input to taste
        # We keep the original wrapped input somewhere where it's sure to remain
        self.wrapped_input = input
        # Initialize using tagger-specific options
        self.options = type(self).check_options(options)

        if logger is not None:
            self.logger = logger
        else:
            self.logger = create_dummy_logger()
Example #4
0
 def __init__(self, input, options=None, logger=None):
     """
     Stores the (preprocessed) input and tagger options.

     @param input: the input to tag; its type is checked against
         C{INPUT_TYPES} and it is preprocessed by C{detect_input_type}.
     @param options: tagger-specific options dict. Defaults to an empty
         dict (created per call, avoiding a shared mutable default).
     @param logger: optional progress logger; if not given, a plain
         stderr logger is used.

     """
     # A fresh dict per call: a {} default would be shared across calls
     if options is None:
         options = {}
     # Initialize using tagger-specific options
     self.options = type(self).check_options(options)
     # Check what input type we've received and preprocess it
     datatype, input = detect_input_type(input, allowed=self.INPUT_TYPES)
     # Store this for the subclass to use as appropriate
     self.input = input
     self.original_input = input
     # Subclasses may redefine self.input to taste
     # We keep the original wrapped input somewhere where it's sure to remain
     self.wrapped_input = input
     # Make sure we have some logger
     if logger is None:
         # Output to stderr instead
         self.logger = create_plain_stderr_logger()
     else:
         self.logger = logger
Example #5
0
 def __init__(self, input, options=None, logger=None):
     """
     Preprocesses and stores the input, checks the tagger options and sets
     up a logger.

     @param input: input to tag, validated against C{INPUT_TYPES}.
     @param options: tagger-specific options dict; C{None} (the default)
         means an empty options dict. Using C{None} instead of C{{}}
         avoids the shared-mutable-default pitfall.
     @param logger: optional progress logger; falls back to a plain
         stderr logger.

     """
     if options is None:
         # Build a new dict each call rather than sharing a default one
         options = {}
     # Initialize using tagger-specific options
     self.options = type(self).check_options(options)
     # Check what input type we've received and preprocess it
     datatype, input = detect_input_type(input, allowed=self.INPUT_TYPES)
     # Store this for the subclass to use as appropriate
     self.input = input
     self.original_input = input
     # Subclasses may redefine self.input to taste
     # We keep the original wrapped input somewhere where it's sure to remain
     self.wrapped_input = input
     # Make sure we have some logger
     if logger is None:
         # Output to stderr instead
         self.logger = create_plain_stderr_logger()
     else:
         self.logger = logger
Example #6
0
 def __init__(self, grammar, input, options=None, original_input=None, logger=None):
     """
     The tagger must have reference to the grammar being used to parse
     the input. It must also be given the full input when instantiated.
     The format of this input will depend on the tagger: for example,
     it might be a string or a MIDI file.
     
     @param grammar: grammar the tagger is used with; its formalism must
         appear in C{COMPATIBLE_FORMALISMS}.
     @param input: input to tag, checked against C{INPUT_TYPES}.
     @param options: tagger-specific options dict. Defaults to an empty
         dict (fresh per call, avoiding a shared mutable default).
     @param original_input: the input in its original, unprocessed form. This 
         will usually be a string. This is optional, but in some 
         circumstances things might fall apart if it hasn't been given.
         E.g. using a backoff model as backoff from a tagging model requires 
         the original input to be passed to the backoff model.
     @param logger: optional progress logger. Logging will be sent to this 
         during initialization of the tagger and tagging. If not given, the 
         logging will be lost. Subclasses may access the logger (or a dummy 
         logger if none was given) in C{self.logger}.
     
     @raise TaggerLoadError: if the grammar's formalism is incompatible.
     
     """
     # Avoid the shared mutable default argument
     if options is None:
         options = {}
     self.grammar = grammar
     # Check the formalism is one that's allowed by this tagger
     formalism = self.grammar.formalism.get_name()
     if formalism not in self.COMPATIBLE_FORMALISMS:
         # Call-form raise: valid in Python 2 and 3, unlike "raise Exc, msg"
         raise TaggerLoadError("Formalism '%s' cannot be used with "
             "tagger '%s'" % (formalism, self.name))
     
     # Check what input type we've received and preprocess it
     datatype, input = detect_input_type(input, allowed=self.INPUT_TYPES, \
                             errmess=" for use with tagger '%s'" % self.name)
     # Store this for the subclass to use as appropriate
     self.input = input
     if original_input is None:
         self.original_input = input
     else:
         self.original_input = original_input
     # Subclasses may redefine self.input to taste
     # We keep the original wrapped input somewhere where it's sure to remain
     self.wrapped_input = input
     # Initialize using tagger-specific options
     self.options = type(self).check_options(options)
     
     if logger is not None:
         self.logger = logger
     else:
         self.logger = create_dummy_logger()
Example #7
0
    def train(self, inputs, grammar=None, logger=None):
        """
        @type inputs: L{jazzparser.data.input.MidiTaggerTrainingBulkInput} or 
            list of L{jazzparser.data.input.Input}s
        @param inputs: training MIDI data. Annotated chord sequences should 
            also be given (though this is optional) by loading a 
            bulk db input file in the MidiTaggerTrainingBulkInput.
        
        """
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
            
        if len(inputs) == 0:
            # No data - nothing to do
            return
        
        # Check the type of one of the inputs - no guarantee they're all the 
        #  same, but there's something seriously weird going on if they're not
        input_type = detect_input_type(inputs[0], allowed=['segmidi'])
        # Get the chord training data too if it's been given
        if isinstance(inputs, MidiTaggerTrainingBulkInput) and \
                inputs.chords is not None:
            chord_inputs = inputs.chords
        else:
            chord_inputs = None
        
        # Initialize the emission distribution for chord classes
        self.hmm = ChordClassHmm.initialize_chord_classes(
                    self.options['ccprob'],
                    self.options['maxnotes'],
                    grammar,
                    metric=self.options['metric'],
                    illegal_transitions=self.options['illegal_transitions'],
                    fixed_root_transitions=self.options['fixed_roots'])
        
        if chord_inputs:
            # If chord training data was given, initially train transition 
            #  distribution from this
            self.hmm.add_history("Training initial transition distribution "\
                                    "from annotated chord data")
            self.hmm.train_transition_distribution(chord_inputs, grammar, \
                                        contprob=self.options['contprob'])
        else:
            # Otherwise it gets left as a uniform distribution
            self.hmm.add_history("No annotated chord training data given. "\
                    "Transition distribution initialized to uniform.")
        
        # Get a Baum-Welch trainer to do the EM retraining
        # Pull out the options to pass to the trainer
        bw_opt_names = [opt.name for opt in ChordClassBaumWelchTrainer.OPTIONS]
        bw_opts = dict([(name,val) for (name,val) in self.options.items() \
                        if name in bw_opt_names])
        retrainer = ChordClassBaumWelchTrainer(self.hmm, options=bw_opts)
        # Prepare a callback to save
        def _get_save_callback():
            def _save_callback():
                self.save()
            return _save_callback
        save_callback = _get_save_callback()
        # Do the Baum-Welch training
        retrainer.train(inputs, logger=logger, save_callback=save_callback)
        
        self.model_description = """\
Initial chord class emission prob: %(ccprob)f
Initial self-transition prob: %(contprob)s
Metrical model: %(metric)s
""" % \
            {
                'ccprob' : self.options['ccprob'],
                'metric' : self.options['metric'],
                'contprob' : self.options['contprob'],
            }
Example #8
0
 def train(data, name, logger=None, options=None, chord_data=None):
     """
     Initializes and trains an HMM in a supervised fashion using the given 
     training data.
     
     @param data: bulk training input; may be midi training data (possibly
         with embedded chord data), or chord corpus data only.
     @param name: name to give the trained model.
     @param logger: optional progress logger; a dummy logger is used if
         none is given.
     @param options: training options dict; C{None} (the default) means an
         empty dict. Using C{None} avoids a shared mutable default.
     @param chord_data: explicit chord training data, overriding any chord
         data embedded in C{data}.
     @raise ModelTrainError: if the training data set is empty.
     
     """
     if len(data) == 0:
         # Call-form raise works under both Python 2 and 3
         raise ModelTrainError("empty training data set")
         
     # Prepare a dummy logger if none was given
     if logger is None:
         logger = create_dummy_logger()
     
     # Don't share a default dict between calls
     if options is None:
         options = {}
     # Process the options dict
     options = HPChordLabeler.process_training_options(options)
     
     # Work out what kind of input data we've got
     # It should be a bulk input type: check what type the first input is
     input_type = detect_input_type(data[0], allowed=['segmidi', 'db-annotated'])
     
     logger.info(">>> Beginning training of HP chord labeler model '%s'" % name)
     # If we got midi tagger training data, it may include chord data as well
     if isinstance(data, MidiTaggerTrainingBulkInput) and \
                                             data.chords is not None:
         if chord_data is None:
             # Use the chord data in the input data
             logger.info("Midi training data; chord corpus data available")
             chord_inputs = data.chords
         else:
             # Use the chord data that was given explicitly
             chord_inputs = chord_data
         midi_inputs = data
     elif isinstance(data, DbBulkInput):
         logger.info("Only chord corpus training data")
         # This was only chord input, no midi data
         chord_inputs = data
         midi_inputs = None
     else:
         chord_inputs = chord_data
         # Presumably this is another form of midi training data
         midi_inputs = data
         logger.info("Midi training data; no chord data was included")
     
     # Get the chord vocab from the options
     logger.info("Model chord vocabulary: %s" % options['vocab'])
     vocab, vocab_mapping = CHORD_VOCABS[options['vocab']]
     
     # Initialize a model according to the chord types
     logger.info("Initializing emission distributions to favour chord "\
                 "notes with chord probability %s" % (options['chordprob']))
     model = HPChordLabeler.initialize_chords(options['chordprob'], \
                                         options['maxnotes'], vocab, \
                                         vocab_mapping, name=name)
     
     # If we have chord training data, use this to train the transition dist
     if chord_inputs is not None:
         logger.info("Training using chord data")
         
         # Construct the trees implicit in the annotations to get the 
         #  key of every chord
         logger.info("Preparing key data for annotated chord sequences")
         input_keys = [keys_for_sequence(dbinput) for dbinput in chord_inputs]
         
         # Run the supervised training of the transition distribution
         logger.info("Training transition distribution on chord sequences")
         model.train_transition_distribution(chord_inputs, input_keys)
         
     if midi_inputs is not None:
         logger.info("Training using midi data")
         
         # Preprocess the midi inputs so they're ready for the model training
         emissions = [midi_to_emission_stream(seq, 
                                              remove_empty=False)[0] \
                         for seq in midi_inputs]
         
         # Use the midi data to train emission number dist
         logger.info("Training emission number distribution")
         model.train_emission_number_distribution(emissions)
         
         ####### EM unsupervised training on the midi data
         # Pull out the options to pass to the trainer
         # These are a subset of the model training options
         bw_opt_names = [opt.name for opt in HPBaumWelchTrainer.OPTIONS]
         bw_opts = dict([(name,val) for (name,val) in options.items() \
                                         if name in bw_opt_names])
         # Create a Baum-Welch trainer
         trainer = HPBaumWelchTrainer(model, bw_opts)
         # Do the Baum-Welch training
         model = trainer.train(emissions, logger=logger)
     logger.info("Training complete")
     
     return model