Example #1

Deserialise a saved tagger model: rebuild the PrecomputedNgramModel from its pickled dictionary form and wrap it in an NgramTaggerModel.
    def _load_model(data):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel

        # Rebuild the precomputed n-gram model from its pickled dict form
        model = PrecomputedNgramModel.from_picklable_dict(data['model'])
        name = data['name']
        chordmap = data.get("chordmap", None)
        return NgramTaggerModel(name, model=model, chordmap=chordmap)
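
For context, here is a minimal sketch of the saving side this loader implies. The method name _get_model_data and the to_picklable_dict call are assumptions inferred from from_picklable_dict above, not confirmed by this excerpt:

    def _get_model_data(self):
        # Assumed inverse of _load_model: to_picklable_dict is taken to be
        # the counterpart of PrecomputedNgramModel.from_picklable_dict
        return {
            'name': self.name,
            'model': self.model.to_picklable_dict(),
            'chordmap': self.chordmap,
        }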
Example #2

Train the tagger: build (observation, tag) pairs from chord sequences, construct the full emission domain, train a PrecomputedNgramModel with backoff, and record how the model was trained in its description.
    def train(self, sequences, grammar=None, logger=None):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
        
        N = self.options['n']
        backoff = self.options['backoff']
        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name
        
        # Get data in the form of lists of (observation,tag) pairs
        training_data = [
            [(observation_from_chord_pair(c1, c2, chordmap), c1cat)
             for ((c1, c2), c1cat) in zip(group_pairs(seq, none_final=True),
                                          seq.categories)]
            for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that 
        #  theoretically could occur, not just those that are seen - 
        #  we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = sum(
            [["%d-%s" % (interval, chord) for chord in chord_types]
             for interval in range(12)], [])
        
        # Ignore unlabelled data
        ignores = ['']
        
        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}
        
        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
            N,
            training_data,
            label_dom,
            emission_dom=emission_dom,
            cutoff=self.options['cutoff'],
            backoff_order=backoff,
            estimator=self.options['estimator'],
            ignore_list=ignores,
            backoff_kwargs=backoff_kwargs)
        
        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est': est_name,
                'seqs': len(training_data),
                'samples': len(sum(training_data, [])),
                'order': N,
                'backoff': backoff,
                'cutoff': self.options['cutoff'],
                'chordmap': self.chordmap_name,
            }
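
The emission domain above enumerates every "interval-chordtype" observation string the tagger could theoretically see, whether or not it occurs in the training data. A self-contained illustration of that construction; the chord type names are placeholders standing in for chordmap.values():

    chord_types = ['maj', 'min', 'dom7']    # placeholder chord types
    emission_dom = sum(
        [["%d-%s" % (interval, chord) for chord in chord_types]
         for interval in range(12)], [])
    # 12 intervals x 3 chord types = 36 observation strings:
    # '0-maj', '0-min', '0-dom7', '1-maj', ...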
Example #3

Compare n-gram orders, backoff orders and count cutoffs over a character alphabet, training a PrecomputedNgramModel for each parameter setting and evaluating it on test data.
PARAMS = [
    # (order, backoff, cutoff); earlier settings in this list are
    # omitted from the excerpt
    (3, 2, 0),
    # Trigram with cutoff
    # This probably ought to improve things (over those below)
    (3, 2, 2),
]

for order, backoff, cutoff in PARAMS:
    print "*******************************"
    print "Order:", order
    print "Backoff:", backoff
    print "Smoothing: witten-bell"
    print "Cutoff:", cutoff
    model = PrecomputedNgramModel.train(order,
                                        training_data,
                                        label_dom=list(CHARS),
                                        emission_dom=list(CHARS),
                                        backoff_order=backoff,
                                        estimator=witten_bell_estimator,
                                        cutoff=cutoff,
                                        backoff_kwargs={'cutoff': 0})
    
    #~ # Take a look at some of the distributions
    #~ print "Some emission distributions"
    #~ print "%d labels, showing 10\n" % len(model.emission_dist.conditions())
    #~ show_dist(model.emission_dist)
    #~ 
    #~ print "\nSome transition distributions"
    #~ print "%d conditions, showing 5\n" % len(model.label_dist.conditions())
    #~ show_dist(model.label_dist, limit=5)
    
    # Try decoding the test data
    correct = 0
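
The excerpt ends just as the evaluation loop begins. A minimal sketch of how the loop body might continue the accuracy tally; the shape of test_data and the viterbi_decode call are assumptions about the model's decoding API, not confirmed by this excerpt:

    total = 0
    for sequence, gold_tags in test_data:
        # Assumed decoder: substitute the model's actual decoding method
        predicted = model.viterbi_decode(sequence)
        for pred, gold in zip(predicted, gold_tags):
            if pred == gold:
                correct += 1
            total += 1
    print "Accuracy: %.2f%%" % (100.0 * correct / total)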