Example #1
0
    def train(name, training_data, options, grammar=None, logger=None):
        if grammar is None:
            grammar = get_grammar()
        if logger is None:
            logger = create_dummy_logger()

        # If cat_bins wasn't given, read it from the grammar
        if options["cat_bins"]:
            cat_bins = options["cat_bins"]
        elif grammar.max_categories:
            cat_bins = grammar.max_categories
        else:
            # Nothing given in the grammar either: error
            raise ValueError, "no value was given for cat_bins and the "\
                "grammar doesn't supply one"

        # Create a new model with empty distributions
        model = HalfspanPcfgModel(name,
                                  cutoff=options['cutoff'],
                                  cat_bins=cat_bins,
                                  estimator=options['estimator'],
                                  lexical=options['lexical'],
                                  chordmap=options['chord_mapping'],
                                  grammar=grammar)

        # Add counts to this model for each sequence
        for sequence in training_data:
            try:
                model._sequence_train(sequence)
            except ModelTrainingError, err:
                logger.warn("Error training on %s: %s" %
                            (sequence.string_name, err))
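A minimal usage sketch for the train() function in Example #1 (called here as a plain function; in the original code it may well be a static or class method of the model class). Only the option keys the function itself reads are included; the model name, option values and the training_sequences variable are hypothetical placeholders, not taken from the original source.

    # Hypothetical options dict: only the keys read by train() above are set
    options = {
        'cutoff': 2,              # passed straight through to HalfspanPcfgModel
        'cat_bins': 0,            # falsy value: fall back on grammar.max_categories
        'estimator': 'laplace',   # placeholder estimator name
        'lexical': True,
        'chord_mapping': None,
    }
    # training_sequences is assumed to be an iterable of annotated sequences,
    # each exposing a string_name attribute (used in the warning message above)
    train("my_model", training_sequences, options)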
Example #2
0
 def train(name, training_data, options, grammar=None, logger=None):
     if grammar is None:
         grammar = get_grammar()
     if logger is None:
         logger = create_dummy_logger()
     
     # If cat_bins wasn't given, read it from the grammar
     if options["cat_bins"]:
         cat_bins = options["cat_bins"]
     elif grammar.max_categories:
         cat_bins = grammar.max_categories
     else:
         # Nothing given in the grammar either: error
         raise ValueError, "no value was given for cat_bins and the "\
             "grammar doesn't supply one"
     
     # Create a new model with empty distributions
     model = HalfspanPcfgModel(
                 name,
                 cutoff = options['cutoff'], 
                 cat_bins = cat_bins, 
                 estimator = options['estimator'], 
                 lexical = options['lexical'], 
                 chordmap = options['chord_mapping'],
                 grammar = grammar)
     
     # Add counts to this model for each sequence
     for sequence in training_data:
         try:
             model._sequence_train(sequence)
         except ModelTrainingError, err:
             logger.warn("Error training on %s: %s" % (sequence.string_name, 
                                                       err))
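Both versions above (and most of the later examples) fall back on create_dummy_logger() from jazzparser.utils.loggers when no logger is supplied. That helper is not shown here; the standalone sketch below shows one way to get the same "silently discard everything" behaviour with the standard library alone. It illustrates the pattern and is not the original implementation.

    import logging

    def make_null_logger(name="dummy"):
        """Return a logger that quietly discards every record sent to it."""
        logger = logging.getLogger(name)
        # NullHandler swallows records; propagate=False keeps them away from
        # any handlers attached to the root logger
        logger.addHandler(logging.NullHandler())
        logger.propagate = False
        return logger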
Example #3
0
 def generate(self, logger=None, max_depth=None):
     """
     Generate a chord sequence from the model.
     
     """
     if logger is None:
         logger = create_dummy_logger()
     
     def _generate(parent, depth=0, pitch=0):
         # Transform the parent category so it's relative to itself
         # All generated categories will be relative to this, 
         #  so we need to make the parent self-relative at the 
         #  start of each recursion
         parent_rep = model_category_repr(parent)
         parent_pitch = (pitch + base_pitch(parent)) % 12
         logger.debug("%sGenerating from parent: %s" % (" "*depth,parent_rep))
         
         if max_depth is not None and depth >= max_depth and \
                     len(self._lexical_dist[parent_rep].samples()) != 0:
             # Don't go any deeper than this if we can stop here
             # Only possible if the parent has generated a leaf before
             exp = 'leaf'
             logger.debug("%sForcing leaf" % (" "*depth))
         else:
             # Otherwise freely generate an expansion type
             exp = generate_from_prob_dist(self._expansion_type_dist[parent_rep])
             logger.debug("%sExpansion: %s" % (" "*depth, exp))
             exp_parent = (exp,parent_rep)
         
         if exp == 'leaf':
             # Generate a leaf node (word)
             word = generate_from_prob_dist(self._lexical_dist[parent_rep])
             logger.debug("%sWord: %s, pitch: %d" % (" "*depth, word, parent_pitch))
             chord = Chord.from_name(word)
             chord.root = (chord.root + parent_pitch) % 12
             return [chord]
         else:
             # First generate a head node
             head = generate_from_prob_dist(self._head_expansion_dist[exp_parent])
             logger.debug("%sHead: %s" % (" "*depth, head))
             # Continue to expand this recursively to a word sequence
             head_generated = _generate(head, depth=depth+1, \
                                                         pitch=parent_pitch)
             
             head_exp_parent = (head,exp,parent_rep)
             # Now generate a non-head node
             non_head = generate_from_prob_dist(
                         self._non_head_expansion_dist[head_exp_parent])
             logger.debug("%sNon-head: %s" % (" "*depth, non_head))
             # Continue to expand this too
             non_head_generated = _generate(non_head, depth=depth+1, \
                                                         pitch=parent_pitch)
             
             return non_head_generated + head_generated
 
     # Choose a start node
     # Build a I^T-I^T as the root
     root = syntax_from_string("I^T-I^T")
     logger.debug("Root: %s" % root)
     return _generate(root)
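A brief usage sketch for generate() in Example #3. The model variable is a hypothetical trained instance of the class that defines the method; the only behaviour assumed is what the code itself shows: an optional logger, and a max_depth cap that forces a leaf expansion once the depth is reached (provided the parent category has produced a leaf before).

    import logging
    logging.basicConfig(level=logging.DEBUG)

    # model is assumed to be a trained instance of the generating model class
    chords = model.generate(logger=logging.getLogger("generate"), max_depth=10)
    # The result is a list of Chord objects, expanded recursively from the
    # "I^T-I^T" root category chosen at the bottom of the method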
Example #4
0
    def __init__(self,
                 grammar,
                 input,
                 options={},
                 original_input=None,
                 logger=None):
        """
        The tagger must have a reference to the grammar being used to parse
        the input. It must also be given the full input when instantiated.
        The format of this input will depend on the tagger: for example,
        it might be a string or a MIDI file.
        
        @param original_input: the input in its original, unprocessed form. This 
            will usually be a string. This is optional, but in some 
            circumstances things might fall apart if it hasn't been given.
            E.g. using a backoff model as backoff from a tagging model requires 
            the original input to be passed to the backoff model.
        @param logger: optional progress logger. Logging will be sent to this 
            during initialization of the tagger and tagging. If not given, the 
            logging will be lost. Subclasses may access the logger (or a dummy 
            logger if none was given) in C{self.logger}.
        
        """
        self.grammar = grammar
        # Check the formalism is one that's allowed by this tagger
        formalism = self.grammar.formalism.get_name()
        if formalism not in self.COMPATIBLE_FORMALISMS:
            raise TaggerLoadError, "Formalism '%s' cannot be used with "\
                "tagger '%s'" % (formalism,self.name)

        # Check what input type we've received and preprocess it
        datatype, input = detect_input_type(input, allowed=self.INPUT_TYPES, \
                                errmess=" for use with tagger '%s'" % self.name)
        # Store this for the subclass to use as appropriate
        self.input = input
        if original_input is None:
            self.original_input = input
        else:
            self.original_input = original_input
        # Subclasses may redefine self.input to taste
        # We keep the original wrapped input somewhere where it's sure to remain
        self.wrapped_input = input
        # Initialize using tagger-specific options
        self.options = type(self).check_options(options)

        if logger is not None:
            self.logger = logger
        else:
            self.logger = create_dummy_logger()
Example #5
0
 def __init__(self, grammar, input, options={}, original_input=None, logger=None):
     """
     The tagger must have a reference to the grammar being used to parse
     the input. It must also be given the full input when instantiated.
     The format of this input will depend on the tagger: for example,
     it might be a string or a MIDI file.
     
     @param original_input: the input in its original, unprocessed form. This 
         will usually be a string. This is optional, but in some 
         circumstances things might fall apart if it hasn't been given.
         E.g. using a backoff model as backoff from a tagging model requires 
         the original input to be passed to the backoff model.
     @param logger: optional progress logger. Logging will be sent to this 
         during initialization of the tagger and tagging. If not given, the 
         logging will be lost. Subclasses may access the logger (or a dummy 
         logger if none was given) in C{self.logger}.
     
     """
     self.grammar = grammar
     # Check the formalism is one that's allowed by this tagger
     formalism = self.grammar.formalism.get_name()
     if formalism not in self.COMPATIBLE_FORMALISMS:
         raise TaggerLoadError, "Formalism '%s' cannot be used with "\
             "tagger '%s'" % (formalism,self.name)
     
     # Check what input type we've received and preprocess it
     datatype, input = detect_input_type(input, allowed=self.INPUT_TYPES, \
                             errmess=" for use with tagger '%s'" % self.name)
     # Store this for the subclass to use as appropriate
     self.input = input
     if original_input is None:
         self.original_input = input
     else:
         self.original_input = original_input
     # Subclasses may redefine self.input to taste
     # We keep the original wrapped input somewhere where it's sure to remain
     self.wrapped_input = input
     # Initialize using tagger-specific options
     self.options = type(self).check_options(options)
     
     if logger is not None:
         self.logger = logger
     else:
         self.logger = create_dummy_logger()
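The constructors in Examples #4 and #5 take the same arguments, so a single hedged instantiation sketch covers both. MyTagger stands in for a concrete tagger subclass (one that defines COMPATIBLE_FORMALISMS, INPUT_TYPES and check_options); the grammar, input, logger and option key are placeholders.

    # Hypothetical subclass and inputs; only the constructor parameters
    # documented above are used
    tagger = MyTagger(grammar,
                      input_data,
                      options={'batch_size': 10},   # placeholder tagger-specific option
                      original_input=raw_string,    # optional, see the docstring
                      logger=my_logger)             # omit to get a dummy logger
    # After construction the subclass can use self.input, self.original_input,
    # self.wrapped_input, self.options and self.logger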
Example #6
0
 def train(self, emissions, max_iterations=None, \
                 convergence_logprob=None, logger=None, processes=1,
                 save=True, save_intermediate=False):
     """
     Performs unsupervised training using Baum-Welch EM.
     
     This is an instance method, because it is performed on a model 
     that has already been initialized. You might, for example, 
     create such a model using C{initialize_chord_types}.
     
     This is based on the training procedure in NLTK for HMMs:
     C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.
     
     @type emissions: list of lists of emissions
     @param emissions: training data. Each element is a list of 
         emissions representing a sequence in the training data.
         Each emission is of the same form as those used by 
         L{jazzparser.misc.raphsto.RaphstoHmm.emission_log_probability}, 
         i.e. a list of note observations
     @type max_iterations: int
     @param max_iterations: maximum number of iterations to allow 
         for EM (default 100). Overrides the corresponding 
         module option
     @type convergence_logprob: float
     @param convergence_logprob: maximum change in log probability 
         to consider convergence to have been reached (default 1e-3). 
         Overrides the corresponding module option
     @type logger: logging.Logger
     @param logger: a logger to send progress logging to
     @type processes: int
     @param processes: number of processes to spawn. A pool of this 
         many processes will be used to compute distribution updates 
         for sequences in parallel during each iteration.
     @type save: bool
     @param save: save the model at the end of training
     @type save_intermediate: bool
     @param save_intermediate: save the model after each iteration. Implies 
         C{save}
     
     """
     from . import raphsto_d
     if logger is None:
         from jazzparser.utils.loggers import create_dummy_logger
         logger = create_dummy_logger()
     
     if save_intermediate:
         save = True
         
     # No point in creating more processes than there are sequences
     if processes > len(emissions):
         processes = len(emissions)
     
     self.model.add_history("Beginning Baum-Welch unigram training on %s" % get_host_info_string())
     self.model.add_history("Training on %d sequences (with %s chords)" % \
         (len(emissions), ", ".join("%d" % len(seq) for seq in emissions)))
     
     # Use kwargs if given, otherwise module options
     if max_iterations is None:
         max_iterations = self.options['max_iterations']
     if convergence_logprob is None:
         convergence_logprob = self.options['convergence_logprob']
     
     # Enumerate the states
     state_ids = dict((state,num) for (num,state) in \
                                 enumerate(self.model.label_dom))
     
     # Enumerate the beat values (they're probably consecutive ints, but 
     #  let's not rely on it)
     beat_ids = dict((beat,num) for (num,beat) in \
                                 enumerate(self.model.beat_dom))
     num_beats = len(beat_ids)
     # Enumerate the d-values (d-function's domain)
     d_ids = dict((d,num) for (num,d) in \
                                 enumerate(self.model.emission_dist_dom))
     num_ds = len(d_ids)
     
     # Make a mutable distribution for the emission distribution we'll 
     #  be updating
     emission_mdist = DictionaryConditionalProbDist(
                 dict((s, MutableProbDist(self.model.emission_dist[s], 
                                          self.model.emission_dist_dom))
                     for s in self.model.emission_dist.conditions()))
     # Create dummy distributions to fill the places of the transition 
     #  distribution components
     key_mdist = DictionaryConditionalProbDist({})
     chord_mdist = DictionaryConditionalProbDist({})
     chord_uni_mdist = MutableProbDist({}, [])
     
     # Construct a model using these mutable distributions so we can 
     #  evaluate using them
     model = self.model_cls(key_mdist, 
                            chord_mdist,
                            emission_mdist, 
                            chord_uni_mdist,
                            chord_set=self.model.chord_set)
     
     iteration = 0
     last_logprob = None
     while iteration < max_iterations:
         logger.info("Beginning iteration %d" % iteration)
         current_logprob = 0.0
         
         # ems contains the new emission numerator probabilities
         # ems[r][d] = Sum_{d(y_n^k, x_n)=d, r_n^k=r}
         #                  alpha(x_n).beta(x_n) / 
         #                    Sum_{x'_n} (alpha(x'_n).beta(x'_n))
         ems = zeros((num_beats,num_ds), float64)
         # And these are the denominators
         ems_denom = zeros(num_beats, float64)
         
         def _training_callback(result):
             """
             Callback for the _sequence_updates processes that takes 
             the updates from a single sequence and adds them onto 
             the global update accumulators.
             
             """
             # _sequence_updates() returns all of this as a tuple
             (ems_local, ems_denom_local, seq_logprob) = result
             
             # Add these probabilities from this sequence to the 
             #  global matrices
             # Emission numerator
             array_add(ems, ems_local, ems)
             # Denominators
             array_add(ems_denom, ems_denom_local, ems_denom)
         ## End of _training_callback
         
         
         # Only use a process pool if there's more than one sequence
         if processes > 1:
             # Create a process pool to use for training
             logger.info("Creating a pool of %d processes" % processes)
             pool = Pool(processes=processes)
             
             async_results = []
             for seq_i,sequence in enumerate(emissions):
                 logger.info("Iteration %d, sequence %d" % (iteration, seq_i))
                 T = len(sequence)
                 if T == 0:
                     continue
                 
                 # Fire off a new call to the process pool for every sequence
                 async_results.append(
                         pool.apply_async(_sequence_updates_uni, 
                                             (sequence, model, 
                                                 self.model.label_dom, 
                                                 state_ids, 
                                                 beat_ids, d_ids, raphsto_d), 
                                             callback=_training_callback) )
             pool.close()
             # Wait for all the workers to complete
             pool.join()
             
             # Call get() on every AsyncResult so that any exceptions in 
             #  workers get raised
             for res in async_results:
                 # If there was an exception in _sequence_update, it 
                 #  will get raised here
                 res_tuple = res.get()
                 # Add this sequence's logprob into the total for all sequences
                 current_logprob += res_tuple[2]
         else:
             logger.info("One sequence: not using a process pool")
             sequence = emissions[0]
             
             if len(sequence) > 0:
                 updates = _sequence_updates_uni(
                                     sequence, model,
                                     self.model.label_dom,
                                     state_ids, 
                                     beat_ids, d_ids, raphsto_d)
                 _training_callback(updates)
                 # Update the overall logprob
                 current_logprob = updates[2]
         
         # Update the model's probabilities from the accumulated values
         for beat in self.model.beat_dom:
             denom = ems_denom[beat_ids[beat]]
             for d in self.model.emission_dist_dom:
                 if denom == 0.0:
                     # Zero denominator
                     prob = - logprob(len(d_ids))
                 else:
                     prob = logprob(ems[beat_ids[beat]][d_ids[d]] + ADD_SMALL) - logprob(denom + len(d_ids)*ADD_SMALL)
                 model.emission_dist[beat].update(d, prob)
         
         # Clear the model's cache so we get the new probabilities
         model.clear_cache()
         
         logger.info("Training data log prob: %s" % current_logprob)
         if last_logprob is not None and current_logprob < last_logprob:
             logger.error("Log probability dropped by %s" % \
                             (last_logprob - current_logprob))
         if last_logprob is not None:
             logger.info("Log prob change: %s" % \
                             (current_logprob - last_logprob))
         # Check whether the log probability has converged
         if iteration > 0 and \
                 abs(current_logprob - last_logprob) < convergence_logprob:
             # Don't iterate any more
             logger.info("Distribution has converged: ceasing training")
             break
         
         iteration += 1
         last_logprob = current_logprob
         
         # Update the main model
         # Only save if we've been asked to save between iterations
         self.update_model(model, save=save_intermediate)
     
     self.model.add_history("Completed Baum-Welch unigram training")
     # Update the distribution's parameters with those we've trained
     self.update_model(model, save=save)
     return
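A sketch of how the train() method in Example #6 might be driven, based only on its signature and docstring. The trainer variable is a hypothetical, already-initialized instance of the class that defines the method (for example one built with initialize_chord_types, as the docstring suggests); the emission data and parameter values are placeholders.

    # emissions: one list of note observations per training sequence
    emissions = load_training_emissions()       # hypothetical helper

    trainer.train(emissions,
                  max_iterations=50,            # overrides the module option
                  convergence_logprob=1e-3,     # stop once the change is smaller
                  processes=4,                  # per-sequence updates in parallel
                  save=True,                    # save the model when training ends
                  save_intermediate=False)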
Example #7
0
 def train(self, emissions, logger=None, save_callback=None):
     """
     Performs unsupervised training using Baum-Welch EM.
     
     This is performed on a model that has already been initialized. 
     You might, for example, create such a model using 
     L{jazzparser.taggers.segmidi.chordclass.hmm.ChordClassHmm.initialize_chord_classes}.
     
     This is based on the training procedure in NLTK for HMMs:
     C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.
     
     @type emissions: L{jazzparser.data.input.MidiTaggerTrainingBulkInput} or 
         list of L{jazzparser.data.input.Input}s
     @param emissions: training MIDI data
     @type logger: logging.Logger
     @param logger: a logger to send progress logging to
     
     """
     if logger is None:
         from jazzparser.utils.loggers import create_dummy_logger
         logger = create_dummy_logger()
         
     self.model.add_history("Beginning Baum-Welch training on %s" % get_host_info_string())
     self.model.add_history("Training on %d MIDI sequences (with %s segments)" % \
         (len(emissions), ", ".join("%d" % len(seq) for seq in emissions)))
     logger.info("Beginning Baum-Welch training on %s" % get_host_info_string())
     
     # Get some options out of the module options
     max_iterations = self.options['max_iterations']
     convergence_logprob = self.options['convergence_logprob']
     split_length = self.options['split']
     truncate_length = self.options['truncate']
     save_intermediate = self.options['save_intermediate']
     processes = self.options['trainprocs']
     
     # Make a mutable distribution for each of the distributions 
     #  we'll be updating
     emission_mdist = cond_prob_dist_to_dictionary_cond_prob_dist(
                                 self.model.emission_dist, mutable=True)
     schema_trans_mdist = cond_prob_dist_to_dictionary_cond_prob_dist(
                                 self.model.schema_transition_dist, mutable=True)
     root_trans_mdist = cond_prob_dist_to_dictionary_cond_prob_dist(
                                 self.model.root_transition_dist, mutable=True)
     init_state_mdist = prob_dist_to_dictionary_prob_dist(
                                 self.model.initial_state_dist, mutable=True)
     
     # Get the sizes we'll need for the matrices
     num_schemata = len(self.model.schemata)
     num_root_changes = 12
     num_chord_classes = len(self.model.chord_classes)
     if self.model.metric:
         num_emission_conds = num_chord_classes * 4
     else:
         num_emission_conds = num_chord_classes
     num_emissions = 12
     
     # Enumerations to use for the matrices, so we know what they mean
     schema_ids = dict([(sch,i) for (i,sch) in enumerate(self.model.schemata+[None])])
     if self.model.metric:
         rs = range(4)
     else:
         rs = [0]
     emission_cond_ids = dict([(cc,i) for (i,cc) in enumerate(\
             sum([[
                 (str(cclass.name),r) for r in rs] for cclass in self.model.chord_classes], 
             []))])
     
     # Construct a model using these mutable distributions so we can 
     #  evaluate using them
     model = ChordClassHmm(schema_trans_mdist, 
                        root_trans_mdist, 
                        emission_mdist, 
                        self.model.emission_number_dist, 
                        init_state_mdist, 
                        self.model.schemata, 
                        self.model.chord_class_mapping,
                        self.model.chord_classes, 
                        metric=self.model.metric,
                        illegal_transitions=self.model.illegal_transitions,
                        fixed_root_transitions=self.model.fixed_root_transitions)
     
     def _save():
         if save_callback is None:
             logger.error("Could not save model, as no callback was given")
         else:
             # If the writing fails, wait till I've had a chance to sort it 
             #  out and then try again. This happens when my AFS token runs 
             #  out
             while True:
                 try:
                     save_callback()
                 except (IOError, OSError), err:
                     print "Error writing model to disk: %s. " % err
                     raw_input("Press <enter> to try again... ")
                 else:
                     break
Example #8
0
    def train(self, emissions, max_iterations=None, \
                    convergence_logprob=None, logger=None, processes=1,
                    save=True, save_intermediate=False):
        """
        Performs unsupervised training using Baum-Welch EM.
        
        This is an instance method, because it is performed on a model 
        that has already been initialized. You might, for example, 
        create such a model using C{initialize_chord_types}.
        
        This is based on the training procedure in NLTK for HMMs:
        C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.
        
        @type emissions: list of lists of emissions
        @param emissions: training data. Each element is a list of 
            emissions representing a sequence in the training data.
            Each emission is of the same form as those used by 
            L{jazzparser.misc.raphsto.RaphstoHmm.emission_log_probability}, 
            i.e. a list of note observations
        @type max_iterations: int
        @param max_iterations: maximum number of iterations to allow 
            for EM (default 100). Overrides the corresponding 
            module option
        @type convergence_logprob: float
        @param convergence_logprob: maximum change in log probability 
            to consider convergence to have been reached (default 1e-3). 
            Overrides the corresponding module option
        @type logger: logging.Logger
        @param logger: a logger to send progress logging to
        @type processes: int
        @param processes: number of processes to spawn. A pool of this 
            many processes will be used to compute distribution updates 
            for sequences in parallel during each iteration.
        @type save: bool
        @param save: save the model at the end of training
        @type save_intermediate: bool
        @param save_intermediate: save the model after each iteration. Implies 
            C{save}
        
        """
        from . import raphsto_d
        if logger is None:
            from jazzparser.utils.loggers import create_dummy_logger
            logger = create_dummy_logger()

        if save_intermediate:
            save = True

        # No point in creating more processes than there are sequences
        if processes > len(emissions):
            processes = len(emissions)

        self.model.add_history("Beginning Baum-Welch unigram training on %s" %
                               get_host_info_string())
        self.model.add_history("Training on %d sequences (with %s chords)" % \
            (len(emissions), ", ".join("%d" % len(seq) for seq in emissions)))

        # Use kwargs if given, otherwise module options
        if max_iterations is None:
            max_iterations = self.options['max_iterations']
        if convergence_logprob is None:
            convergence_logprob = self.options['convergence_logprob']

        # Enumerate the states
        state_ids = dict((state,num) for (num,state) in \
                                    enumerate(self.model.label_dom))

        # Enumerate the beat values (they're probably consecutive ints, but
        #  let's not rely on it)
        beat_ids = dict((beat,num) for (num,beat) in \
                                    enumerate(self.model.beat_dom))
        num_beats = len(beat_ids)
        # Enumerate the d-values (d-function's domain)
        d_ids = dict((d,num) for (num,d) in \
                                    enumerate(self.model.emission_dist_dom))
        num_ds = len(d_ids)

        # Make a mutable distribution for the emission distribution we'll
        #  be updating
        emission_mdist = DictionaryConditionalProbDist(
            dict((s,
                  MutableProbDist(self.model.emission_dist[s],
                                  self.model.emission_dist_dom))
                 for s in self.model.emission_dist.conditions()))
        # Create dummy distributions to fill the places of the transition
        #  distribution components
        key_mdist = DictionaryConditionalProbDist({})
        chord_mdist = DictionaryConditionalProbDist({})
        chord_uni_mdist = MutableProbDist({}, [])

        # Construct a model using these mutable distributions so we can
        #  evaluate using them
        model = self.model_cls(key_mdist,
                               chord_mdist,
                               emission_mdist,
                               chord_uni_mdist,
                               chord_set=self.model.chord_set)

        iteration = 0
        last_logprob = None
        while iteration < max_iterations:
            logger.info("Beginning iteration %d" % iteration)
            current_logprob = 0.0

            # ems contains the new emission numerator probabilities
            # ems[r][d] = Sum_{d(y_n^k, x_n)=d, r_n^k=r}
            #                  alpha(x_n).beta(x_n) /
            #                    Sum_{x'_n} (alpha(x'_n).beta(x'_n))
            ems = zeros((num_beats, num_ds), float64)
            # And these are the denominators
            ems_denom = zeros(num_beats, float64)

            def _training_callback(result):
                """
                Callback for the _sequence_updates processes that takes 
                the updates from a single sequence and adds them onto 
                the global update accumulators.
                
                """
                # _sequence_updates() returns all of this as a tuple
                (ems_local, ems_denom_local, seq_logprob) = result

                # Add these probabilities from this sequence to the
                #  global matrices
                # Emission numerator
                array_add(ems, ems_local, ems)
                # Denominators
                array_add(ems_denom, ems_denom_local, ems_denom)

            ## End of _training_callback

            # Only use a process pool if there's more than one sequence
            if processes > 1:
                # Create a process pool to use for training
                logger.info("Creating a pool of %d processes" % processes)
                pool = Pool(processes=processes)

                async_results = []
                for seq_i, sequence in enumerate(emissions):
                    logger.info("Iteration %d, sequence %d" %
                                (iteration, seq_i))
                    T = len(sequence)
                    if T == 0:
                        continue

                    # Fire off a new call to the process pool for every sequence
                    async_results.append(
                        pool.apply_async(
                            _sequence_updates_uni,
                            (sequence, model, self.model.label_dom, state_ids,
                             beat_ids, d_ids, raphsto_d),
                            callback=_training_callback))
                pool.close()
                # Wait for all the workers to complete
                pool.join()

                # Call get() on every AsyncResult so that any exceptions in
                #  workers get raised
                for res in async_results:
                    # If there was an exception in _sequence_update, it
                    #  will get raised here
                    res_tuple = res.get()
                    # Add this sequence's logprob into the total for all sequences
                    current_logprob += res_tuple[2]
            else:
                logger.info("One sequence: not using a process pool")
                sequence = emissions[0]

                if len(sequence) > 0:
                    updates = _sequence_updates_uni(sequence, model,
                                                    self.model.label_dom,
                                                    state_ids, beat_ids, d_ids,
                                                    raphsto_d)
                    _training_callback(updates)
                    # Update the overall logprob
                    current_logprob = updates[2]

            # Update the model's probabilities from the accumulated values
            for beat in self.model.beat_dom:
                denom = ems_denom[beat_ids[beat]]
                for d in self.model.emission_dist_dom:
                    if denom == 0.0:
                        # Zero denominator
                        prob = -logprob(len(d_ids))
                    else:
                        prob = logprob(ems[beat_ids[beat]][d_ids[d]] +
                                       ADD_SMALL) - logprob(
                                           denom + len(d_ids) * ADD_SMALL)
                    model.emission_dist[beat].update(d, prob)

            # Clear the model's cache so we get the new probabilities
            model.clear_cache()

            logger.info("Training data log prob: %s" % current_logprob)
            if last_logprob is not None and current_logprob < last_logprob:
                logger.error("Log probability dropped by %s" % \
                                (last_logprob - current_logprob))
            if last_logprob is not None:
                logger.info("Log prob change: %s" % \
                                (current_logprob - last_logprob))
            # Check whether the log probability has converged
            if iteration > 0 and \
                    abs(current_logprob - last_logprob) < convergence_logprob:
                # Don't iterate any more
                logger.info("Distribution has converged: ceasing training")
                break

            iteration += 1
            last_logprob = current_logprob

            # Update the main model
            # Only save if we've been asked to save between iterations
            self.update_model(model, save=save_intermediate)

        self.model.add_history("Completed Baum-Welch unigram training")
        # Update the distribution's parameters with those we've trained
        self.update_model(model, save=save)
        return
Example #9
0
 def train(data, name, logger=None, options={}, chord_data=None):
     """
     Initializes and trains an HMM in a supervised fashion using the given 
     training data.
     
     """
     if len(data) == 0:
         raise ModelTrainError, "empty training data set"
         
     # Prepare a dummy logger if none was given
     if logger is None:
         logger = create_dummy_logger()
     
     # Process the options dict
     options = HPChordLabeler.process_training_options(options)
     
     # Work out what kind of input data we've got
     # It should be a bulk input type: check what type the first input is
     input_type = detect_input_type(data[0], allowed=['segmidi', 'db-annotated'])
     
     logger.info(">>> Beginning training of HP chord labeler model '%s'" % name)
     # If we got midi tagger training data, it may include chord data as well
     if isinstance(data, MidiTaggerTrainingBulkInput) and \
                                             data.chords is not None:
         if chord_data is None:
             # Use the chord data in the input data
             logger.info("Midi training data; chord corpus data available")
             chord_inputs = data.chords
         else:
             # Use the chord data that was given explicitly
             chord_inputs = chord_data
         midi_inputs = data
     elif isinstance(data, DbBulkInput):
         logger.info("Only chord corpus training data")
         # This was only chord input, no midi data
         chord_inputs = data
         midi_inputs = None
     else:
         chord_inputs = chord_data
         # Presumably this is another form of midi training data
         midi_inputs = data
         logger.info("Midi training data; no chord data was included")
     
     # Get the chord vocab from the options
     logger.info("Model chord vocabulary: %s" % options['vocab'])
     vocab, vocab_mapping = CHORD_VOCABS[options['vocab']]
     
     # Initialize a model according to the chord types
     logger.info("Initializing emission distributions to favour chord "\
                 "notes with chord probability %s" % (options['chordprob']))
     model = HPChordLabeler.initialize_chords(options['chordprob'], \
                                         options['maxnotes'], vocab, \
                                         vocab_mapping, name=name)
     
     # If we have chord training data, use this to train the transition dist
     if chord_inputs is not None:
         logger.info("Training using chord data")
         
         # Construct the trees implicit in the annotations to get the 
         #  key of every chord
         logger.info("Preparing key data for annotated chord sequences")
         input_keys = [keys_for_sequence(dbinput) for dbinput in chord_inputs]
         
         # Run the supervised training of the transition distribution
         logger.info("Training transition distribution on chord sequences")
         model.train_transition_distribution(chord_inputs, input_keys)
         
     if midi_inputs is not None:
         logger.info("Training using midi data")
         
         # Preprocess the midi inputs so they're ready for the model training
         emissions = [midi_to_emission_stream(seq, 
                                              remove_empty=False)[0] \
                         for seq in midi_inputs]
         
         # Use the midi data to train emission number dist
         logger.info("Training emission number distribution")
         model.train_emission_number_distribution(emissions)
         
         ####### EM unsupervised training on the midi data
         # Pull out the options to pass to the trainer
         # These are a subset of the model training options
         bw_opt_names = [opt.name for opt in HPBaumWelchTrainer.OPTIONS]
         bw_opts = dict([(name,val) for (name,val) in options.items() \
                                         if name in bw_opt_names])
         # Create a Baum-Welch trainer
         trainer = HPBaumWelchTrainer(model, bw_opts)
         # Do the Baum-Welch training
         model = trainer.train(emissions, logger=logger)
     logger.info("Training complete")
     
     return model
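A usage sketch for the supervised train() function in Example #9. The bulk input object, model name and option values are hypothetical; 'vocab', 'chordprob' and 'maxnotes' are the option keys the function reads directly, and any Baum-Welch trainer options included in the same dict are filtered through to HPBaumWelchTrainer, as the code above shows.

    # data is assumed to be a MidiTaggerTrainingBulkInput (possibly carrying
    # chord corpus data in data.chords) or a DbBulkInput
    options = {
        'vocab': 'some_vocab',    # placeholder: must be a key of CHORD_VOCABS
        'chordprob': 0.8,         # emission probability mass given to chord notes
        'maxnotes': 10,           # placeholder value
        # ...plus any HPBaumWelchTrainer options, passed through automatically
    }
    labeler = train(data, "my_labeler", logger=my_logger, options=options)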
Example #10
0
 def train(self, emissions, logger=None):
     """
     Performs unsupervised training using Baum-Welch EM.
     
     This is performed as a retraining step on a model that has already 
     been initialized. 
     
     This is based on the training procedure in NLTK for HMMs:
     C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.
     
     @type emissions: list of lists of emissions
     @param emissions: training data. Each element is a list of 
         emissions representing a sequence in the training data.
         Each emission is of the same form as those used by the model's 
         C{emission_log_probability}
     @type logger: logging.Logger
     @param logger: a logger to send progress logging to
     
     """
     if logger is None:
         from jazzparser.utils.loggers import create_dummy_logger
         logger = create_dummy_logger()
         
     self.record_history("Beginning Baum-Welch training on %s" % get_host_info_string())
     self.record_history("Training on %d inputs (with %s segments)" % \
         (len(emissions), ", ".join("%d" % len(seq) for seq in emissions)))
     logger.info("Beginning Baum-Welch training on %s" % get_host_info_string())
     
     # Get some options out of the module options
     max_iterations = self.options['max_iterations']
     convergence_logprob = self.options['convergence_logprob']
     split_length = self.options['split']
     truncate_length = self.options['truncate']
     save_intermediate = self.options['save_intermediate']
     processes = self.options['trainprocs']
     
     # Make a mutable version of the model that we can update each iteration
     self.model = self.create_mutable_model(self.model)
     # Getting the array id mappings
     array_ids = self.get_array_indices()
     
     ########## Data preprocessing
     logger.info("%d input sequences" % len(emissions))
     # Truncate long streams
     if truncate_length is not None:
         logger.info("Truncating sequences to max %d timesteps" % \
                                                         truncate_length)
         emissions = [stream[:truncate_length] for stream in emissions]
     # Split up long streams if requested
     # After this, each stream is a tuple (first,stream), where first 
     #  indicates whether the stream segment begins a song
     if split_length is not None:
         logger.info("Splitting sequences into max %d-sized chunks" % \
                                                             split_length)
         split_emissions = []
         # Split each stream
         for emstream in emissions:
             input_ems = list(emstream)
             splits = []
             first = True
             # Take bits of length split_length until we're under the max
             while len(input_ems) >= split_length:
                 # Overlap the splits by one so we get all transitions
                 splits.append((first, input_ems[:split_length]))
                 input_ems = input_ems[split_length-1:]
                 first = False
             # Get the last short one
             if len(input_ems):
                 # Try to avoid having a small bit that's split off at the end
                 if len(splits) and len(input_ems) <= split_length / 5:
                     # Add these to the end of the last split
                     # This will make it slightly longer than requested
                     splits[-1][1].extend(input_ems)
                 else:
                     splits.append((first, input_ems))
             split_emissions.extend(splits)
     else:
         # All streams begin a song
         split_emissions = [(True,stream) for stream in emissions]
     logger.info("Sequence lengths after preprocessing: %s" % 
             " ".join([str(len(em[1])) for em in split_emissions]))
     ##########
     
     # Special case of -1 for number of sequences
     # No point in creating more processes than there are sequences
     if processes == -1 or processes > len(split_emissions):
         processes = len(split_emissions)
     
     iteration = 0
     last_logprob = None
     while iteration < max_iterations:
         logger.info("Beginning iteration %d" % iteration)
         current_logprob = 0.0
         
         # Build a tuple of the arrays that will be updated by each sequence
         self.global_arrays = self.get_empty_arrays()
         
         # Only use a process pool if there's more than one sequence
         if processes > 1:
             # Create a process pool to use for training
             logger.info("Creating a pool of %d processes" % processes)
             # Exceptions raised in the workers get caught at this level, 
             #  when we call get() on the results below
             pool = Pool(processes=processes)
             
             async_results = []
             try:
                 for seq_i,(first,sequence) in enumerate(split_emissions):
                     logger.info("Iteration %d, sequence %d" % (iteration, seq_i))
                     T = len(sequence)
                     if T == 0:
                         continue
                     
                     def _notifier_closure(seq_index):
                         def _notifier(res):
                             logger.info("Sequence %d finished" % seq_index)
                         return _notifier
                     # Create some empty arrays for the updates to go into
                     empty_arrays = self.get_empty_arrays()
                     # Fire off a new call to the process pool for every sequence
                     async_results.append(
                             pool.apply_async(self.sequence_updates, 
                                              (sequence, self.model, empty_arrays, array_ids), 
                                              { 'update_initial' : first },
                                              _notifier_closure(seq_i)) )
                 pool.close()
                 # Wait for all the workers to complete
                 pool.join()
             except KeyboardInterrupt:
                 # If Ctrl+C is fired during the processing, we exit here
                 logger.info("Keyboard interrupt was received during EM "\
                     "updates")
                 raise
             
             # Call get() on every AsyncResult so that any exceptions in 
             #  workers get raised
             for res in async_results:
                 # If there was an exception in sequence_updates, it 
                 #  will get raised here
                 res_tuple = res.get()
                 # Run the callback on the results from this process
                 # It might seem sensible to do this using the callback 
                 #  arg to apply_async, but then the callback must be 
                 #  picklable and it doesn't buy us anything really
                 self.sequence_updates_callback(res_tuple)
                 # Add this sequence's logprob into the total for all sequences
                 current_logprob += res_tuple[-1]
         else:
             if len(split_emissions) == 1:
                 logger.info("One sequence: not using a process pool")
             else:
                 logger.info("Not using a process pool: training %d "\
                     "emission sequences sequentially" % \
                     len(split_emissions))
             
             for seq_i,(first,sequence) in enumerate(split_emissions):
                 if len(sequence) > 0:
                     logger.info("Iteration %d, sequence %d" % (iteration, seq_i))
                     # Create some empty arrays for the updates to go into
                     empty_arrays = self.get_empty_arrays()
                     updates = self.sequence_updates(
                                         sequence, self.model,
                                         empty_arrays, array_ids,
                                         update_initial=first)
                     self.sequence_updates_callback(updates)
                     # Update the overall logprob
                     current_logprob += updates[-1]
         
         ######## Model updates
         # Update the main model
         self.update_model(self.global_arrays, array_ids)
         
         # Clear the model's cache so we get the new probabilities
         self.model.clear_cache()
         
         logger.info("Training data log prob: %s" % current_logprob)
         if last_logprob is not None and current_logprob < last_logprob:
             logger.error("Log probability dropped by %s" % \
                             (last_logprob - current_logprob))
         if last_logprob is not None:
             logger.info("Log prob change: %s" % \
                             (current_logprob - last_logprob))
         # Check whether the log probability has converged
         if iteration > 0 and \
                 abs(current_logprob - last_logprob) < convergence_logprob:
             # Don't iterate any more
             logger.info("Distribution has converged: ceasing training")
             break
         
         iteration += 1
         last_logprob = current_logprob
         
         # Only save if we've been asked to save between iterations
         if save_intermediate:
             self.save()
     
     self.record_history("Completed Baum-Welch training")
     # Always save the model now that we're done
     self.save()
     return self.model
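The preprocessing step in the middle of Example #10 (truncate long streams, then split them into chunks that overlap by one element, folding a very short tail into the previous chunk) is easy to miss inside the larger method. The standalone sketch below re-derives just that chunking logic on plain lists so it can be tried in isolation; it mirrors the code above but is not part of the original module.

    def split_stream(stream, split_length):
        """
        Split a sequence into (first, chunk) pairs of roughly split_length,
        overlapping consecutive chunks by one element so that no transition
        is lost at a boundary. A short leftover (at most split_length / 5
        items) is folded into the previous chunk instead of kept on its own.
        """
        items = list(stream)
        splits = []
        first = True
        while len(items) >= split_length:
            splits.append((first, items[:split_length]))
            # Overlap by one element so the boundary transition is kept
            items = items[split_length - 1:]
            first = False
        if items:
            if splits and len(items) <= split_length // 5:
                # Fold a tiny tail into the last chunk rather than keep it alone
                splits[-1][1].extend(items)
            else:
                splits.append((first, items))
        return splits

    print(split_stream(range(12), 5))
    # [(True, [0, 1, 2, 3, 4]), (False, [4, 5, 6, 7, 8]), (False, [8, 9, 10, 11])]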
Example #11
0
    def train(self, emissions, logger=None, save_callback=None):
        """
        Performs unsupervised training using Baum-Welch EM.
        
        This is performed on a model that has already been initialized. 
        You might, for example, create such a model using 
        L{jazzparser.taggers.segmidi.chordclass.hmm.ChordClassHmm.initialize_chord_classes}.
        
        This is based on the training procedure in NLTK for HMMs:
        C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.
        
        @type emissions: L{jazzparser.data.input.MidiTaggerTrainingBulkInput} or 
            list of L{jazzparser.data.input.Input}s
        @param emissions: training MIDI data
        @type logger: logging.Logger
        @param logger: a logger to send progress logging to
        
        """
        if logger is None:
            from jazzparser.utils.loggers import create_dummy_logger
            logger = create_dummy_logger()

        self.model.add_history("Beginning Baum-Welch training on %s" %
                               get_host_info_string())
        self.model.add_history("Training on %d MIDI sequences (with %s segments)" % \
            (len(emissions), ", ".join("%d" % len(seq) for seq in emissions)))
        logger.info("Beginning Baum-Welch training on %s" %
                    get_host_info_string())

        # Get some options out of the module options
        max_iterations = self.options['max_iterations']
        convergence_logprob = self.options['convergence_logprob']
        split_length = self.options['split']
        truncate_length = self.options['truncate']
        save_intermediate = self.options['save_intermediate']
        processes = self.options['trainprocs']

        # Make a mutable distribution for each of the distributions
        #  we'll be updating
        emission_mdist = cond_prob_dist_to_dictionary_cond_prob_dist(
            self.model.emission_dist, mutable=True)
        schema_trans_mdist = cond_prob_dist_to_dictionary_cond_prob_dist(
            self.model.schema_transition_dist, mutable=True)
        root_trans_mdist = cond_prob_dist_to_dictionary_cond_prob_dist(
            self.model.root_transition_dist, mutable=True)
        init_state_mdist = prob_dist_to_dictionary_prob_dist(
            self.model.initial_state_dist, mutable=True)

        # Get the sizes we'll need for the matrices
        num_schemata = len(self.model.schemata)
        num_root_changes = 12
        num_chord_classes = len(self.model.chord_classes)
        if self.model.metric:
            num_emission_conds = num_chord_classes * 4
        else:
            num_emission_conds = num_chord_classes
        num_emissions = 12

        # Enumerations to use for the matrices, so we know what they mean
        schema_ids = dict([
            (sch, i) for (i, sch) in enumerate(self.model.schemata + [None])
        ])
        if self.model.metric:
            rs = range(4)
        else:
            rs = [0]
        emission_cond_ids = dict([(cc,i) for (i,cc) in enumerate(\
                sum([[
                    (str(cclass.name),r) for r in rs] for cclass in self.model.chord_classes],
                []))])

        # Construct a model using these mutable distributions so we can
        #  evaluate using them
        model = ChordClassHmm(
            schema_trans_mdist,
            root_trans_mdist,
            emission_mdist,
            self.model.emission_number_dist,
            init_state_mdist,
            self.model.schemata,
            self.model.chord_class_mapping,
            self.model.chord_classes,
            metric=self.model.metric,
            illegal_transitions=self.model.illegal_transitions,
            fixed_root_transitions=self.model.fixed_root_transitions)

        def _save():
            if save_callback is None:
                logger.error("Could not save model, as no callback was given")
            else:
                # If the writing fails, wait till I've had a chance to sort it
                #  out and then try again. This happens when my AFS token runs
                #  out
                while True:
                    try:
                        save_callback()
                    except (IOError, OSError), err:
                        print "Error writing model to disk: %s. " % err
                        raw_input("Press <enter> to try again... ")
                    else:
                        break
Example #12
0
 def train(data, schemata, chord_types, estimator, cutoff=0, logger=None, 
             chord_map=None, order=2, backoff_orders=0, backoff_kwargs={}):
     """
     Initializes and trains an HMM in a supervised fashion using the given 
     training data. Training data should be chord sequence data (input 
     type C{bulk-db} or C{bulk-db-annotated}).
     
     """
     # Remove any sequences that aren't fully labeled
     sequences = [
         sequence for sequence in data if \
                 all([c.category is not None and len(c.category) \
                         for c in sequence.chords])
     ]
     
     if len(sequences) == 0:
         raise TaggerTrainingError, "empty training data set"
     
     # Prepare a dummy logger if none was given
     if logger is None:
         logger = create_dummy_logger()
     logger.info(">>> Beginning training of multi-chord ngram tagging model")
     
     # Prepare training data from these sequences
     # Training set for emission dist
     if chord_map is None:
         chord_trans = lambda x:x
     else:
         chord_trans = lambda x: chord_map[x]
     emission_data = sum([
         [(chord.category, chord_trans(chord.type)) 
                                             for chord in sequence.chords] 
                                             for sequence in sequences], [])
     
     # Train the emission distribution
     emission_counts = CutoffConditionalFreqDist(cutoff)
     for schema,ctype in emission_data:
         emission_counts[schema].inc(ctype)
     
     # Train the transition distribution
     schema_transition_counts = CutoffConditionalFreqDist(cutoff)
     root_transition_counts = CutoffConditionalFreqDist(cutoff)
     
     for sequence in sequences:
         # Add a count for the transition to the final state
         final_ngram = tuple([c.category for c in sequence.chords[-order:-1]])
         schema_transition_counts[sequence.chords[-1].category].inc(None)
         # Make n-gram counts
         transition_data = [None]*(order-1) + sequence.chords
         
         for i in range(len(transition_data)-order):
             ngram = list(reversed(transition_data[i:i+order]))
             
             # Count the schema transition
             schema_ngram = [c.category if c is not None else None for c in ngram]
             schema_transition_counts[tuple(schema_ngram[1:])].inc(schema_ngram[0])
             
             # Now count the relative root, conditioned on the schema
             if order > 1 and ngram[1] is not None:
                 root_change = (ngram[0].root - ngram[1].root) % 12
                 root_transition_counts[ngram[1].category].inc(root_change)
     
     if backoff_orders > 0:
         # Train a lower-order model
         kwargs = {
             'cutoff' : cutoff,
             'logger' : logger, 
             'chord_map' : chord_map,
         }
         kwargs.update(backoff_kwargs)
         # These kwargs can't be overridden
         kwargs['order'] = order-1
         kwargs['backoff_orders'] = backoff_orders-1
         # Run the model training
         backoff_model = MultiChordNgramModel.train(
                                                 data, 
                                                 schemata, 
                                                 chord_types,
                                                 estimator,
                                                 **kwargs)
     else:
         backoff_model = None
     
     # Instantiate a model with these distributions
     model = MultiChordNgramModel(order,
                                   root_transition_counts, 
                                   schema_transition_counts, 
                                   emission_counts, 
                                   estimator, 
                                   backoff_model,
                                   schemata, 
                                   chord_types)
     return model
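The transition-counting loop in the middle of Example #12 builds reversed n-grams over a sequence padded with None, conditioning each schema on the tuple of the preceding order-1 schemata. The standalone sketch below shows the same counting scheme with plain dictionaries and strings; CutoffConditionalFreqDist and the chord objects are replaced by generic stand-ins, and the loop here also counts the final window, which the original handles slightly differently, so treat it as an illustration of the idea rather than the original implementation.

    from collections import defaultdict

    def count_transitions(sequence, order=2):
        """
        Count n-gram transitions the way the loop above does: pad the start
        with None, take windows of length order, and reverse each window so
        that element 0 is the predicted label and elements 1: are its history.
        """
        counts = defaultdict(lambda: defaultdict(int))
        padded = [None] * (order - 1) + list(sequence)
        for i in range(len(padded) - order + 1):
            ngram = list(reversed(padded[i:i + order]))
            history = tuple(ngram[1:])
            counts[history][ngram[0]] += 1
        return counts

    counts = count_transitions(["I", "IV", "V", "I"], order=2)
    # counts[(None,)]["I"] == 1, counts[("I",)]["IV"] == 1, counts[("V",)]["I"] == 1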
Example #13
0
 def train(data, estimator, grammar, cutoff=0, logger=None, 
             chord_map=None, order=2, backoff_orders=0, backoff_kwargs={}):
     """
     Initializes and trains an HMM in a supervised fashion using the given 
     training data. Training data should be chord sequence data (input 
     type C{bulk-db} or C{bulk-db-annotated}).
     
     """
     # Prepare a dummy logger if none was given
     if logger is None:
         logger = create_dummy_logger()
     logger.info(">>> Beginning training of ngram backoff model")
     
     training_data = []
     # Generate the gold standard data by parsing the annotations
     for dbinput in data:
         # Get a gold standard tonal space sequence
         try:
             parses = parse_sequence_with_annotations(dbinput, grammar, \
                                                     allow_subparses=False)
         except ParseError, err:
             # Just skip this sequence
             logger.error('Could not get a GS parse of %s: %s' % (dbinput,err))
             continue
         # There should only be one of these now
         parse = parses[0]
         if parse is None:
             logger.error('Could not get a GS parse of %s' % (dbinput))
             continue
         
         # Get the form of the analysis we need for the training
         if chord_map is None:
             chords = [(c.root, c.type) for c in dbinput.chords]
         else:
             chords = [(c.root, chord_map[c.type]) for c in dbinput.chords]
         
         points,times = zip(*grammar.formalism.semantics_to_coordinates(
                                                 parse.semantics))
         # Run through the sequence, transforming absolute points into 
         #  the condensed relative representation
         ec0 = EnharmonicCoordinate.from_harmonic_coord(points[0])
         # The first point is relative to the origin and always in the 
         #  (0,0) enharmonic space
         rel_points = [(0,0,ec0.x,ec0.y)]
         for point in points[1:]:
             ec1 = EnharmonicCoordinate.from_harmonic_coord(point)
             # Find the nearest enharmonic instance of this point to the last
             nearest = ec0.nearest((ec1.x, ec1.y))
             # Work out how much we have to shift this by to get the point
             dX = ec1.X - nearest.X
             dY = ec1.Y - nearest.Y
             rel_points.append((dX,dY,ec1.x,ec1.y))
             ec0 = ec1
         funs,times = zip(*grammar.formalism.semantics_to_functions(
                                                 parse.semantics))
         
         ### Synchronize the chords with the points and functions
         # We may need to repeat chords to match up with analysis 
         #  points that span multiple chords
         analysis = iter(zip(rel_points,funs,times))
         rel_point, fun, __ = analysis.next()
         next_rel_point,next_fun,next_anal_time = analysis.next()
         # Keep track of how much time has elapsed
         time = 0
         training_seq = []
         reached_end = False
         for crd_pair,chord in zip(chords, dbinput.chords):
             if time >= next_anal_time and not reached_end:
                 # Move on to the next analysis point
                 rel_point, fun = next_rel_point, next_fun
                 try:
                     next_rel_point,next_fun,next_anal_time = analysis.next()
                 except StopIteration:
                     # No more points: keep using the same to the end
                     reached_end = True
             training_seq.append((crd_pair, (rel_point,fun)))
             time += chord.duration
         training_data.append(training_seq)
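
The synchronization step above walks the chords in time order, advancing to the next analysis point once the elapsed duration reaches that point's start time, and keeps reusing the final point when the analysis is exhausted. Below is a self-contained sketch of that alignment pattern with plain tuples standing in for the parser's chord and semantics objects; like the original, it assumes at least two analysis points.

def align_chords_with_analysis(chords, analysis_points):
    """Sketch: pair each chord with the analysis point active when it starts.

    chords is a list of (chord, duration) pairs; analysis_points is a list
    of (label, start_time) pairs sorted by time (at least two of them).
    """
    points = iter(analysis_points)
    label, _ = next(points)
    next_label, next_time = next(points)
    reached_end = False
    time = 0
    aligned = []
    for chord, duration in chords:
        if time >= next_time and not reached_end:
            # Move on to the next analysis point
            label = next_label
            try:
                next_label, next_time = next(points)
            except StopIteration:
                # No more points: keep using the last one to the end
                reached_end = True
        aligned.append((chord, label))
        time += duration
    return aligned

# e.g. align_chords_with_analysis([("C", 2), ("F", 2), ("G", 2)],
#                                 [("ton", 0), ("subdom", 2), ("dom", 4)])
# pairs "C" with "ton", "F" with "subdom" and "G" with "dom".
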
Example #14
0
    def generate(self, logger=None, max_depth=None):
        """
        Generate a chord sequence from the model.
        
        """
        if logger is None:
            logger = create_dummy_logger()

        def _generate(parent, depth=0, pitch=0):
            # Transform the parent category so it's relative to itself
            # All generated categories will be relative to this,
            #  so we need to make the parent self-relative at the
            #  start of each recursion
            parent_rep = model_category_repr(parent)
            parent_pitch = (pitch + base_pitch(parent)) % 12
            logger.debug("%sGenerating from parent: %s" %
                         (" " * depth, parent_rep))

            if max_depth is not None and depth >= max_depth and \
                        len(self._lexical_dist[parent_rep].samples()) != 0:
                # Don't go any deeper than this if we can stop here
                # Only possible if the parent has generated a leaf before
                exp = 'leaf'
                logger.debug("%sForcing leaf" % (" " * depth))
            else:
                # Otherwise freely generate an expansion type
                exp = generate_from_prob_dist(
                    self._expansion_type_dist[parent_rep])
                logger.debug("%sExpansion: %s" % (" " * depth, exp))
                exp_parent = (exp, parent_rep)

            if exp == 'leaf':
                # Generate a leaf node (word)
                word = generate_from_prob_dist(self._lexical_dist[parent_rep])
                logger.debug("%sWord: %s, pitch: %d" %
                             (" " * depth, word, parent_pitch))
                chord = Chord.from_name(word)
                chord.root = (chord.root + parent_pitch) % 12
                return [chord]
            else:
                # First generate a head node
                head = generate_from_prob_dist(
                    self._head_expansion_dist[exp_parent])
                logger.debug("%sHead: %s" % (" " * depth, head))
                # Continue to expand this recursively to a word sequence
                head_generated = _generate(head, depth=depth+1, \
                                                            pitch=parent_pitch)

                head_exp_parent = (head, exp, parent_rep)
                # Now generate a non-head node
                non_head = generate_from_prob_dist(
                    self._non_head_expansion_dist[head_exp_parent])
                logger.debug("%sNon-head: %s" % (" " * depth, non_head))
                # Continue to expand this too
                non_head_generated = _generate(non_head, depth=depth+1, \
                                                            pitch=parent_pitch)

                return non_head_generated + head_generated

        # Choose a start node
        # Build a I^T-I^T as the root
        root = syntax_from_string("I^T-I^T")
        logger.debug("Root: %s" % root)
        return _generate(root)
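
generate_from_prob_dist and the conditional distributions used above belong to the surrounding model, so the sketch below reproduces only the control flow: sample an expansion type, force a leaf once the depth limit is reached, otherwise expand the children recursively and return non-head results before head results. The distributions here are toy dictionaries, and the head/non-head choice is collapsed into a single children distribution.

import random

def sample(dist):
    """Sketch: draw an outcome from a {outcome: probability} dictionary."""
    r = random.random()
    total = 0.0
    for outcome, prob in dist.items():
        total += prob
        if r <= total:
            return outcome
    return outcome  # guard against floating-point rounding

# Toy distributions: a category either branches into two children or emits a word
EXPANSIONS = {"X": {"branch": 0.4, "leaf": 0.6}}
CHILDREN = {"X": {("X", "X"): 1.0}}
WORDS = {"X": {"chord": 1.0}}

def generate(category, depth=0, max_depth=5):
    # Force a leaf once we are as deep as we are prepared to go
    if depth >= max_depth:
        exp = "leaf"
    else:
        exp = sample(EXPANSIONS[category])
    if exp == "leaf":
        return [sample(WORDS[category])]
    head, non_head = sample(CHILDREN[category])
    # Expand both children recursively, putting the non-head expansion first
    return (generate(non_head, depth + 1, max_depth) +
            generate(head, depth + 1, max_depth))

# e.g. generate("X") returns a list of one or more "chord" leaves.
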
Example #15
0
    def train(self, emissions, logger=None):
        """
        Performs unsupervised training using Baum-Welch EM.
        
        This is performed as a retraining step on a model that has already 
        been initialized. 
        
        This is based on the training procedure in NLTK for HMMs:
        C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.
        
        @type emissions: list of lists of emissions
        @param emissions: training data. Each element is a list of 
            emissions representing a sequence in the training data.
            Each emission takes the same form as those passed to
            C{emission_log_probability} on the model.
        @type logger: logging.Logger
        @param logger: a logger to send progress logging to
        
        """
        if logger is None:
            from jazzparser.utils.loggers import create_dummy_logger
            logger = create_dummy_logger()

        self.record_history("Beginning Baum-Welch training on %s" %
                            get_host_info_string())
        self.record_history("Training on %d inputs (with %s segments)" % \
            (len(emissions), ", ".join("%d" % len(seq) for seq in emissions)))
        logger.info("Beginning Baum-Welch training on %s" %
                    get_host_info_string())

        # Get some options out of the module options
        max_iterations = self.options['max_iterations']
        convergence_logprob = self.options['convergence_logprob']
        split_length = self.options['split']
        truncate_length = self.options['truncate']
        save_intermediate = self.options['save_intermediate']
        processes = self.options['trainprocs']

        # Make a mutable version of the model that we can update each iteration
        self.model = self.create_mutable_model(self.model)
        # Getting the array id mappings
        array_ids = self.get_array_indices()

        ########## Data preprocessing
        logger.info("%d input sequences" % len(emissions))
        # Truncate long streams
        if truncate_length is not None:
            logger.info("Truncating sequences to max %d timesteps" % \
                                                            truncate_length)
            emissions = [stream[:truncate_length] for stream in emissions]
        # Split up long streams if requested
        # After this, each element is a tuple (first, stream), where first
        #  indicates whether the stream segment begins a song
        if split_length is not None:
            logger.info("Splitting sequences into max %d-sized chunks" % \
                                                                split_length)
            split_emissions = []
            # Split each stream
            for emstream in emissions:
                input_ems = list(emstream)
                splits = []
                first = True
                # Take bits of length split_length until we're under the max
                while len(input_ems) >= split_length:
                    # Overlap the splits by one so we get all transitions
                    splits.append((first, input_ems[:split_length]))
                    input_ems = input_ems[split_length - 1:]
                    first = False
                # Get the last short one
                if len(input_ems):
                    # Try to avoid having a small bit that's split off at the end
                    if len(splits) and len(input_ems) <= split_length / 5:
                        # Add these to the end of the last split
                        # This will make it slightly longer than requested
                        splits[-1][1].extend(input_ems)
                    else:
                        splits.append((first, input_ems))
                split_emissions.extend(splits)
        else:
            # All streams begin a song
            split_emissions = [(True, stream) for stream in emissions]
        logger.info("Sequence lengths after preprocessing: %s" %
                    " ".join([str(len(em[1])) for em in split_emissions]))
        ##########

        # A process count of -1 means one process per sequence; in any case
        #  there's no point in creating more processes than there are sequences
        if processes == -1 or processes > len(split_emissions):
            processes = len(split_emissions)

        iteration = 0
        last_logprob = None
        while iteration < max_iterations:
            logger.info("Beginning iteration %d" % iteration)
            current_logprob = 0.0

            # Build a tuple of the arrays that will be updated by each sequence
            self.global_arrays = self.get_empty_arrays()

            # Only use a process pool if more than one worker process will be used
            if processes > 1:
                # Create a process pool to use for training
                logger.info("Creating a pool of %d processes" % processes)
                # Errors raised in the worker processes are re-raised when we
                #  call get() on their results below; keyboard interrupts we
                #  catch at this level
                pool = Pool(processes=processes)

                async_results = []
                try:
                    for seq_i, (first, sequence) in enumerate(split_emissions):
                        logger.info("Iteration %d, sequence %d" %
                                    (iteration, seq_i))
                        T = len(sequence)
                        if T == 0:
                            continue

                        def _notifier_closure(seq_index):
                            def _notifier(res):
                                logger.info("Sequence %d finished" % seq_index)

                            return _notifier

                        # Create some empty arrays for the updates to go into
                        empty_arrays = self.get_empty_arrays()
                        # Fire off a new call to the process pool for every sequence
                        async_results.append(
                            pool.apply_async(self.sequence_updates,
                                             (sequence, self.model,
                                              empty_arrays, array_ids),
                                             {'update_initial': first},
                                             _notifier_closure(seq_i)))
                    pool.close()
                    # Wait for all the workers to complete
                    pool.join()
                except KeyboardInterrupt:
                    # If Ctrl+C is pressed during the processing, we exit here
                    logger.info("Keyboard interrupt was received during EM "\
                        "updates")
                    raise

                # Call get() on every AsyncResult so that any exceptions in
                #  workers get raised
                for res in async_results:
                    # If there was an exception in sequence_updates, it
                    #  will get raised here
                    res_tuple = res.get()
                    # Run the callback on the results from this process
                    # It might seem sensible to do this using the callback
                    #  arg to apply_async, but then the callback must be
                    #  picklable and it doesn't buy us anything really
                    self.sequence_updates_callback(res_tuple)
                    # Add this sequence's logprob into the total for all sequences
                    current_logprob += res_tuple[-1]
            else:
                if len(split_emissions) == 1:
                    logger.info("One sequence: not using a process pool")
                else:
                    logger.info("Not using a process pool: training %d "\
                        "emission sequences sequentially" % \
                        len(split_emissions))

                for seq_i, (first, sequence) in enumerate(split_emissions):
                    if len(sequence) > 0:
                        logger.info("Iteration %d, sequence %d" %
                                    (iteration, seq_i))
                        # Create some empty arrays for the updates to go into
                        empty_arrays = self.get_empty_arrays()
                        updates = self.sequence_updates(sequence,
                                                        self.model,
                                                        empty_arrays,
                                                        array_ids,
                                                        update_initial=first)
                        self.sequence_updates_callback(updates)
                        # Update the overall logprob
                        current_logprob += updates[-1]

            ######## Model updates
            # Update the main model
            self.update_model(self.global_arrays, array_ids)

            # Clear the model's cache so we get the new probabilities
            self.model.clear_cache()

            logger.info("Training data log prob: %s" % current_logprob)
            if last_logprob is not None and current_logprob < last_logprob:
                logger.error("Log probability dropped by %s" % \
                                (last_logprob - current_logprob))
            if last_logprob is not None:
                logger.info("Log prob change: %s" % \
                                (current_logprob - last_logprob))
            # Check whether the log probability has converged
            if iteration > 0 and \
                    abs(current_logprob - last_logprob) < convergence_logprob:
                # Don't iterate any more
                logger.info("Distribution has converged: ceasing training")
                break

            iteration += 1
            last_logprob = current_logprob

            # Only save if we've been asked to save between iterations
            if save_intermediate:
                self.save()

        self.record_history("Completed Baum-Welch training")
        # Always save the model now that we're done
        self.save()
        return self.model
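
The preprocessing above cuts long emission streams into chunks of at most split_length, overlapping consecutive chunks by one element so that no transition falls across a boundary, and folds a very short final remnant into the previous chunk. A minimal sketch of that splitting logic on its own, independent of the model code:

def split_stream(stream, split_length):
    """Sketch: split a sequence into chunks of at most split_length,
    overlapping consecutive chunks by one element; the boolean in each
    (first, chunk) pair marks the chunk that starts the song."""
    items = list(stream)
    splits = []
    first = True
    while len(items) >= split_length:
        # Overlap the chunks by one element so every transition is kept
        splits.append((first, items[:split_length]))
        items = items[split_length - 1:]
        first = False
    if items:
        # Avoid leaving a tiny fragment on its own at the end
        if splits and len(items) <= split_length // 5:
            splits[-1][1].extend(items)
        else:
            splits.append((first, items))
    return splits

# e.g. split_stream(range(20), 10) produces two tagged chunks; the short
# leftover is folded into the second chunk rather than forming its own.
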
Example #16
0
    def train(data,
              estimator,
              grammar,
              cutoff=0,
              logger=None,
              chord_map=None,
              order=2,
              backoff_orders=0,
              backoff_kwargs={}):
        """
        Initializes and trains an HMM in a supervised fashion using the given 
        training data. Training data should be chord sequence data (input 
        type C{bulk-db} or C{bulk-db-annotated}).
        
        """
        # Prepare a dummy logger if none was given
        if logger is None:
            logger = create_dummy_logger()
        logger.info(">>> Beginning training of ngram backoff model")

        training_data = []
        # Generate the gold standard data by parsing the annotations
        for dbinput in data:
            # Get a gold standard tonal space sequence
            try:
                parses = parse_sequence_with_annotations(dbinput, grammar, \
                                                        allow_subparses=False)
            except ParseError, err:
                # Just skip this sequence
                logger.error('Could not get a GS parse of %s: %s' %
                             (dbinput, err))
                continue
            # There should only be one of these now
            parse = parses[0]
            if parse is None:
                logger.error('Could not get a GS parse of %s' % (dbinput))
                continue

            # Get the form of the analysis we need for the training
            if chord_map is None:
                chords = [(c.root, c.type) for c in dbinput.chords]
            else:
                chords = [(c.root, chord_map[c.type]) for c in dbinput.chords]

            points, times = zip(
                *grammar.formalism.semantics_to_coordinates(parse.semantics))
            # Run through the sequence, transforming absolute points into
            #  the condensed relative representation
            ec0 = EnharmonicCoordinate.from_harmonic_coord(points[0])
            # The first point is relative to the origin and always in the
            #  (0,0) enharmonic space
            rel_points = [(0, 0, ec0.x, ec0.y)]
            for point in points[1:]:
                ec1 = EnharmonicCoordinate.from_harmonic_coord(point)
                # Find the nearest enharmonic instance of this point to the last
                nearest = ec0.nearest((ec1.x, ec1.y))
                # Work out how much we have to shift this by to get the point
                dX = ec1.X - nearest.X
                dY = ec1.Y - nearest.Y
                rel_points.append((dX, dY, ec1.x, ec1.y))
                ec0 = ec1
            funs, times = zip(
                *grammar.formalism.semantics_to_functions(parse.semantics))

            ### Synchronize the chords with the points and functions
            # We may need to repeat chords to match up with analysis
            #  points that span multiple chords
            analysis = iter(zip(rel_points, funs, times))
            rel_point, fun, __ = analysis.next()
            next_rel_point, next_fun, next_anal_time = analysis.next()
            # Keep track of how much time has elapsed
            time = 0
            training_seq = []
            reached_end = False
            for crd_pair, chord in zip(chords, dbinput.chords):
                if time >= next_anal_time and not reached_end:
                    # Move on to the next analysis point
                    rel_point, fun = next_rel_point, next_fun
                    try:
                        next_rel_point, next_fun, next_anal_time = \
                            analysis.next()
                    except StopIteration:
                        # No more points: keep using the same to the end
                        reached_end = True
                training_seq.append((crd_pair, (rel_point, fun)))
                time += chord.duration
            training_data.append(training_seq)
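
EnharmonicCoordinate and its nearest() method belong to the project's tonal-space code, so the sketch below illustrates the same "nearest equivalent instance" idea in a simpler, one-dimensional setting: pitches are taken modulo 12 and each step records the octave shift from the instance of the new pitch class nearest the previous pitch. It is only an analogue of the two-dimensional (dX, dY) block shifts computed above, not the project's representation.

def relative_steps(pitches):
    """Sketch: express a pitch sequence as (octave_shift, pitch_class) pairs,
    measuring each shift from the instance of the new pitch class nearest
    the previous absolute pitch (a 1-D analogue of the enharmonic shifts)."""
    prev = pitches[0]
    steps = [(0, prev % 12)]
    for pitch in pitches[1:]:
        pc = pitch % 12
        # Instances of this pitch class in and around the previous octave
        base = prev - (prev % 12) + pc
        nearest = min((base - 12, base, base + 12),
                      key=lambda p: abs(p - prev))
        # Octave shift from that nearest instance to the pitch actually named
        octave_shift = (pitch - nearest) // 12
        steps.append((octave_shift, pc))
        prev = pitch
    return steps

# e.g. relative_steps([60, 64, 67]) -> [(0, 0), (0, 4), (0, 7)], while
# relative_steps([60, 71]) -> [(0, 0), (1, 11)]: the nearest B to middle C
# is the one just below it, and the named pitch lies an octave above that.
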
Example #17
0
    def train(data,
              schemata,
              chord_types,
              estimator,
              cutoff=0,
              logger=None,
              chord_map=None,
              order=2,
              backoff_orders=0,
              backoff_kwargs={}):
        """
        Initializes and trains an HMM in a supervised fashion using the given 
        training data. Training data should be chord sequence data (input 
        type C{bulk-db} or C{bulk-db-annotated}).
        
        """
        # Remove any sequences that aren't fully labeled
        sequences = [
            sequence for sequence in data if \
                    all([c.category is not None and len(c.category) \
                            for c in sequence.chords])
        ]

        if len(sequences) == 0:
            raise TaggerTrainingError, "empty training data set"

        # Prepare a dummy logger if none was given
        if logger is None:
            logger = create_dummy_logger()
        logger.info(
            ">>> Beginning training of multi-chord ngram tagging model")

        # Prepare training data from these sequences
        # Training set for emission dist
        if chord_map is None:
            chord_trans = lambda x: x
        else:
            chord_trans = lambda x: chord_map[x]
        emission_data = sum([[(chord.category, chord_trans(chord.type))
                              for chord in sequence.chords]
                             for sequence in sequences], [])

        # Train the emission distribution
        emission_counts = CutoffConditionalFreqDist(cutoff)
        for schema, ctype in emission_data:
            emission_counts[schema].inc(ctype)

        # Train the transition distribution
        schema_transition_counts = CutoffConditionalFreqDist(cutoff)
        root_transition_counts = CutoffConditionalFreqDist(cutoff)

        for sequence in sequences:
            # Add a count for the transition to the final state
            final_ngram = tuple(
                [c.category for c in sequence.chords[-order:-1]])
            schema_transition_counts[sequence.chords[-1].category].inc(None)
            # Make n-gram counts
            transition_data = [None] * (order - 1) + sequence.chords

            for i in range(len(transition_data) - order):
                ngram = list(reversed(transition_data[i:i + order]))

                # Count the schema transition
                schema_ngram = [
                    c.category if c is not None else None for c in ngram
                ]
                schema_transition_counts[tuple(schema_ngram[1:])].inc(
                    schema_ngram[0])

                # Now count the relative root, conditioned on the schema
                if order > 1 and ngram[1] is not None:
                    root_change = (ngram[0].root - ngram[1].root) % 12
                    root_transition_counts[ngram[1].category].inc(root_change)

        if backoff_orders > 0:
            # Train a lower-order model
            kwargs = {
                'cutoff': cutoff,
                'logger': logger,
                'chord_map': chord_map,
            }
            kwargs.update(backoff_kwargs)
            # These kwargs can't be overridden
            kwargs['order'] = order - 1
            kwargs['backoff_orders'] = backoff_orders - 1
            # Run the model training
            backoff_model = MultiChordNgramModel.train(data, schemata,
                                                       chord_types, estimator,
                                                       **kwargs)
        else:
            backoff_model = None

        # Instantiate a model with these distributions
        model = MultiChordNgramModel(order, root_transition_counts,
                                     schema_transition_counts, emission_counts,
                                     estimator, backoff_model, schemata,
                                     chord_types)
        return model
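
When backoff_orders > 0, the trainer above recursively trains a lower-order model with the same keyword arguments, forcing order and backoff_orders down by one at each level, so an order-3 model with backoff_orders=2 carries an order-2 model, which in turn carries an order-1 model. A minimal sketch of that recursion; NgramLevel is a hypothetical stand-in for MultiChordNgramModel, not part of the codebase:

class NgramLevel(object):
    """Hypothetical stand-in for one level of a backoff n-gram model."""
    def __init__(self, order, backoff_model):
        self.order = order
        self.backoff_model = backoff_model

def train_with_backoff(order, backoff_orders, **kwargs):
    # Sketch: train the lower-order model first, then wrap it
    if backoff_orders > 0:
        sub_kwargs = dict(kwargs)
        # These two settings can't be overridden by the caller
        sub_kwargs['order'] = order - 1
        sub_kwargs['backoff_orders'] = backoff_orders - 1
        backoff_model = train_with_backoff(**sub_kwargs)
    else:
        backoff_model = None
    return NgramLevel(order, backoff_model)

# e.g. train_with_backoff(order=3, backoff_orders=2) produces an order-3
# model whose backoff_model is order-2, which in turn backs off to order-1.
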