Example 1
 def __init__(self, grammar, input, options={}, *args, **kwargs):
     super(Baseline3Tagger, self).__init__(grammar, input, options, *args, **kwargs)
     process_chord_input(self)
     
     #### Tag the input sequence ####
     self._tagged_data = []
     self._batch_ranges = []
     # Group the input into pairs
     inpairs = group_pairs(self.input, none_final=True)
     # Get all the possible signs from the grammar
     for index,pair in enumerate(inpairs):
         features = {
             'duration' : self.durations[index],
             'time' : self.times[index],
         }
         word_signs = []
         # Now assign a probability to each tag, given the observation
         for tag in self.model.category_count.keys():
             sign = self.grammar.get_sign_for_word_by_tag(self.input[index], tag, extra_features=features)
             if sign is not None:
                 probability = self.model.get_prob_cat_given_chord_pair(tag, *pair)
                 word_signs.append((sign, tag, probability))
         word_signs = list(reversed(sorted(word_signs, key=lambda x: x[2])))
         self._tagged_data.append(word_signs)
         
         # Work out the sizes of the batches to return these in
         batches = batch_sizes([p for __,__,p in word_signs], self.batch_ratio)
         # Transform these into a form that's easier to use for getting the signs
         so_far = 0
         batch_ranges = []
         for batch in batches:
             batch_ranges.append((so_far,so_far+batch))
             so_far += batch
         self._batch_ranges.append(batch_ranges)
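Every example here leans on jazzparser.utils.base.group_pairs, whose source is not included. A minimal sketch consistent with the usage above (an assumption, not the library's actual implementation):

def group_pairs(seq, none_final=False):
    # Yield consecutive pairs (s0,s1), (s1,s2), ... from seq.
    # With none_final=True, a final (s_last, None) pair is added, so an
    #  n-item input produces n pairs, one per item, as the taggers expect.
    items = list(seq)
    if none_final:
        items.append(None)
    return zip(items[:-1], items[1:])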
Example 2
def _sequence_to_candc_format(formatter, sequence):
    """
    Produces a string representation of observations to be used as 
    training data for a C&C model from a chord sequence internal
    model.
    This is an inner function for the various different formats of 
    C&C data we use.
    
    """
    from jazzparser.utils.base import group_pairs
    # Produce observations from chord pairs
    pairs_list = group_pairs(list(sequence.iterator()) + [None])
    observation_list = [formatter(*chords) for chords in pairs_list]
    return "%s\n" % " ".join(observation_list)
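The formatter argument is any callable taking a pair of chords (the second being None at the end of the sequence) and returning one observation token. A hypothetical formatter, only to illustrate the calling convention:

def _example_formatter(chord, next_chord):
    # Hypothetical: emit the chord type, marking the sequence end
    if next_chord is None:
        return "%s-end" % chord.type
    return "%s-%s" % (chord.type, next_chord.type)

# candc_data = _sequence_to_candc_format(_example_formatter, sequence)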
Example 3
    def __init__(self, inputs, durations=None, times=None, id=None, chords=None, sequence=None, *args, **kwargs):
        super(DbInput, self).__init__(*args, **kwargs)

        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.id = id
        self.chords = chords
        self.sequence = sequence

        if durations is None and times is None:
            raise ValueError, "cannot create a DbInput with neither " "times nor durations given"
        elif times is None:
            self.times = [sum(durations[:i]) for i in range(len(durations))]
        elif durations is None:
            from jazzparser.utils.base import group_pairs

            self.durations = [time1 - time0 for (time0, time1) in group_pairs(times)] + [Fraction(1)]
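A worked example of the conversion above, assuming increasing Fraction times (the final duration cannot be recovered from the times alone, hence the Fraction(1) default):

from fractions import Fraction
durations = [2, 2, 4]
times = [sum(durations[:i]) for i in range(len(durations))]
# -> [0, 2, 4]
times = [Fraction(0), Fraction(2), Fraction(4)]
durations = [time1 - time0 for (time0, time1) in group_pairs(times)] + [Fraction(1)]
# -> [Fraction(2, 1), Fraction(2, 1), Fraction(1, 1)]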
Example 4
    def train(self, sequences, grammar=None, logger=None):
        seqs = 0
        chords = 0
        # Each sequence in the given corpus
        for seq in sequences:
            seqs += 1
            # Each chord in the sequence
            for c1,c2 in group_pairs(seq.iterator(), none_final=True):
                chords += 1
                self._add_category_chord_count(c1.category, observation_from_chord_pair(c1, c2))
        # Add a bit of training info to the descriptive text
        self.model_description = """\
Unigram probability model of combined observations of interval and chord type

Training sequences: %(seqs)d
Training samples: %(samples)d""" % {
                'seqs' : seqs,
                'samples' : chords
            }
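observation_from_chord_pair is not shown in these excerpts. A sketch consistent with the "%d-%s" interval/chord-type emission strings built in Example 11 (an assumption about the helper, not its real source):

def observation_from_chord_pair(c1, c2, chordmap=None):
    # Root interval to the next chord (0 at the end of the sequence),
    #  paired with this chord's (possibly mapped) type
    interval = 0 if c2 is None else (c2.root - c1.root) % 12
    ctype = chordmap[c1.type] if chordmap is not None else c1.type
    return "%d-%s" % (interval, ctype)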
Example 5
    def __init__(self, inputs, durations=None, times=None, id=None, \
                    chords=None, sequence=None, *args, **kwargs):
        super(DbInput, self).__init__(*args, **kwargs)

        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.id = id
        self.chords = chords
        self.sequence = sequence

        if durations is None and times is None:
            raise ValueError, "cannot create a DbInput with neither "\
                "times nor durations given"
        elif times is None:
            self.times = [sum(durations[:i]) for i in range(len(durations))]
        elif durations is None:
            from jazzparser.utils.base import group_pairs
            self.durations = [
                time1 - time0 for (time0, time1) in group_pairs(times)
            ] + [Fraction(1)]
Example 6
def _steps_list(seq):
    """
    Given a list of (coordinate,function) pairs, produces a similar list 
    that represents the steps between each point in the path and its previous 
    point, maintaining the original functions.
    
    The first point yields the step 
    from the origin, ignoring its enharmonic block (in other words, the 
    step from (0,0) within its enharmonic block).
    
    This means that effectively we don't care what enharmonic block the 
    path lies in, only the relative points along the path.
    
    """
    def _minus(c0, c1):
        return (c0[0] - c1[0], c0[1] - c1[1])

    # Get the functions out for later
    coords, funs = zip(*seq)
    steps = [coords[0]] + [(_minus(c1, c0)) for c0, c1 in group_pairs(coords)]
    # Put the functions back in for the result
    return zip(steps, funs)
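A quick check of the behaviour on made-up data: only the relative steps survive, and the functions are carried through unchanged.

path = [((0, 0), 'T'), ((1, 0), 'D'), ((1, 1), 'T')]
print _steps_list(path)
# -> [((0, 0), 'T'), ((1, 0), 'D'), ((0, 1), 'T')]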
Example 7
def _steps_list(seq):
    """
    Given a list of (coordinate,function) pairs, produces a similar list 
    that represents the steps between each point in the path and its previous 
    point, maintaining the original functions.
    
    The first point yields the step 
    from the origin, ignoring its enharmonic block (in other words, the 
    step from (0,0) within its enharmonic block).
    
    This means that effectively we don't care what enharmonic block the 
    path lies in, only the relative points along the path.
    
    """
    def _minus(c0, c1):
        return (c0[0]-c1[0], c0[1]-c1[1])
    
    # Get the functions out for later
    coords,funs = zip(*seq)
    steps = [coords[0]] + [(_minus(c1,c0)) for c0,c1 in group_pairs(coords)]
    # Put the functions back in for the result
    return zip(steps, funs)
Example 8
    def __init__(self,
                 inputs,
                 durations=None,
                 times=None,
                 roman=False,
                 *args,
                 **kwargs):
        super(ChordInput, self).__init__(*args, **kwargs)

        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.roman = roman

        # Compute the durations from times or vice versa
        if durations is None and times is None:
            raise ValueError, "cannot create a ChordInput with neither "\
                "times nor durations given"
        elif times is None:
            self.times = [
                sum(durations[:i], Fraction(0)) for i in range(len(durations))
            ]
        elif durations is None:
            from jazzparser.utils.base import group_pairs
            self.durations = [
                time1 - time0 for (time0, time1) in group_pairs(times)
            ] + [Fraction(1)]

        # Convert all strings to internal chord representation
        # Done now so we check the chords can all be understood before doing
        #  anything else
        self.chords = [
            Chord.from_name(name, roman=roman).to_db_mirror()
            for name in inputs
        ]
        for chord, dur in zip(self.chords, self.durations):
            chord.duration = dur
Example 9
    def __init__(self, inputs, durations=None, times=None, roman=False, *args, **kwargs):
        super(ChordInput, self).__init__(*args, **kwargs)

        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.roman = roman

        # Compute the durations from times or vice versa
        if durations is None and times is None:
            raise ValueError, "cannot create a ChordInput with neither " "times nor durations given"
        elif times is None:
            self.times = [sum(durations[:i], Fraction(0)) for i in range(len(durations))]
        elif durations is None:
            from jazzparser.utils.base import group_pairs

            self.durations = [time1 - time0 for (time0, time1) in group_pairs(times)] + [Fraction(1)]

        # Convert all strings to internal chord representation
        # Done now so we check the chords can all be understood before doing
        #  anything else
        self.chords = [Chord.from_name(name, roman=roman).to_db_mirror() for name in inputs]
        for chord, dur in zip(self.chords, self.durations):
            chord.duration = dur
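The only difference from DbInput's conversion is the Fraction(0) start value for sum(), which keeps the computed times as Fractions even when the durations are plain integers:

from fractions import Fraction
durations = [4, 2, 2]
times = [sum(durations[:i], Fraction(0)) for i in range(len(durations))]
# -> [Fraction(0, 1), Fraction(4, 1), Fraction(6, 1)]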
Example 10
    def __init__(self, grammar, input, options={}, *args, **kwargs):
        """
        Tags using an ngram model backed by NLTK.
        
        """
        super(NgramTagger, self).__init__(grammar, input, options, *args,
                                          **kwargs)
        process_chord_input(self)

        #### Tag the input sequence ####
        self._tagged_data = []
        self._batch_ranges = []
        # Group the input into pairs to get observations
        inpairs = group_pairs(self.input, none_final=True)
        # Convert the pairs into observations
        observations = [
            observation_from_chord_pair(pair[0], pair[1], self.model.chordmap)
            for pair in inpairs
        ]

        # Use the ngram model to get tag probabilities for each input by
        # computing the forward probability matrix
        if self.options['decode'] == "viterbi":
            probabilities = self.model.viterbi_probabilities(observations)
        elif self.options['decode'] == "forward":
            probabilities = self.model.forward_probabilities(observations)
        else:
            probabilities = self.model.forward_backward_probabilities(
                observations)

        word_tag_probs = []

        for index, probs in enumerate(probabilities):
            features = {
                'duration': self.durations[index],
                'time': self.times[index],
            }
            word_signs = []
            # Now assign a probability to each tag, given the observation
            for tag in self.model.tags:
                # Read a full sign out of the grammar
                sign = self.grammar.get_sign_for_word_by_tag(
                    self.input[index], tag, extra_features=features)
                if sign is not None:
                    # Read off the probability from the matrix
                    probability = probs[tag]
                    word_signs.append((sign, tag, probability))

            # Randomly sort the list first to make sure equal probabilities are randomly ordered
            word_signs = [(sign, tag, prob) for sign, tag, prob in word_signs]
            random.shuffle(word_signs)
            # Now sort by probability
            word_signs = list(reversed(sorted(word_signs, key=lambda x: x[2])))
            self._tagged_data.append(word_signs)

            # Store the list of probabilities for tags, which we'll use
            #  after we've tagged every word to work out the sizes
            #  of the tag batches
            word_tag_probs.append([p for __, __, p in word_signs])

        if self.options['best']:
            # Only return one for each word
            self._batch_ranges = [[(0, 1)] for i in range(len(self.input))]
        else:
            # Work out the number of tags to return in each batch
            batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
            # So far, this has assigned a probability to every possible
            #  tag. We don't want the tagger ever to return the least
            #  probable batch of tags, unless it's the only one.
            #batch_sizes = [batches[:-1] if len(batches) > 1 else batches for batches in batch_sizes]
            # Transform these into a form that's easier to use for getting the signs
            self._batch_ranges = [[(sum(batches[:i]),sum(batches[:i+1])) for i in range(len(batches))] \
                                    for batches in batch_sizes]
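The batch-range transformation at the end is easiest to see on concrete numbers: each word's batch sizes become cumulative (start, end) index pairs into its probability-sorted sign list.

batches = [3, 2, 1]
ranges = [(sum(batches[:i]), sum(batches[:i + 1])) for i in range(len(batches))]
# -> [(0, 3), (3, 5), (5, 6)]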
Example 11
    def train(self, sequences, grammar=None, logger=None):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()

        N = self.options['n']
        backoff = self.options['backoff']
        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name

        # Get data in the form of lists of (observation,tag) pairs
        training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat) \
                                for ((c1,c2),c1cat) in zip(group_pairs(seq, none_final=True),seq.categories)]
                                    for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that
        #  theoretically could occur, not just those that are seen -
        #  we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = sum(
            [["%d-%s" % (interval, chord) for chord in chord_types]
             for interval in range(12)], [])

        # Ignore unlabelled data
        ignores = ['']

        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff': self.options['backoff_cutoff']}

        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
            self.options['n'],
            training_data,
            label_dom,
            emission_dom=emission_dom,
            cutoff=self.options['cutoff'],
            backoff_order=self.options['backoff'],
            estimator=self.options['estimator'],
            ignore_list=ignores,
            backoff_kwargs=backoff_kwargs)

        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(training_data),
                'samples' : len(sum(training_data, [])),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.chordmap_name,
            }
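The sum(..., []) idiom above flattens the per-interval lists into a single emission domain. For a two-type chord mapping it expands like this:

chord_types = ['M', 'm']
emission_dom = sum([["%d-%s" % (interval, chord) for chord in chord_types]
                    for interval in range(12)], [])
# -> ['0-M', '0-m', '1-M', '1-m', ..., '11-M', '11-m'] (24 items)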
Example 12
def get_vanilla_book():
    """
    Downloads the whole of the Vanilla Book: 
    L{http://www.ralphpatt.com/Song.html}.
    
    """
    from BeautifulSoup import BeautifulSoup
    from urllib2 import urlopen
    from urllib import quote
    from urlparse import urljoin
    import re
    from jazzparser.utils.base import group_pairs

    # ~ raise NotImplementedError, "not finished writing this"

    INDEX_PAGE = "http://www.ralphpatt.com/Song.html"
    SONG_BASE = "http://www.ralphpatt.com/"
    # The overbar alternative ending marker
    alt_end_re = re.compile(r"(\d+).(_+)")

    # Fetch the referring page and parse it
    soup = BeautifulSoup(urlopen(INDEX_PAGE).read())
    # Pull out all the links
    links = soup.findAll("a")
    # Get just the links to songs: all in VB/
    song_links = [l["href"] for l in links if l.has_key("href") and l["href"].startswith("VB/")]

    for song_link in song_links:
        url = "%s%s" % (SONG_BASE, song_link)
        song_soup = BeautifulSoup(urlopen(url).read())
        # The song's name is in the title tag
        song_name = song_soup.title.string.strip()
        print song_name
        # The chords are in a pre tag
        chord_text = "".join(song_soup.body.pre.findAll(text=True))
        # Remove the key line
        lines = chord_text.split("\n")
        start_line = 0
        for i, line in enumerate(lines):
            if line.lower().startswith("key"):
                # Found the key line: ignore everything up to here
                start_line = i + 1
                break
        else:
            # No key line found!
            print "No key line for %s" % song_name
            continue
        lines = lines[start_line:]

        # Find the chord lines: they start with | or [
        song_lines = []
        for i, line in enumerate(lines):
            if line.startswith("[") or line.startswith("|"):
                song_lines.append((lines[i - 1], lines[i]))

        try:
            bars = []
            bar_ranges = []
            open_repeats = []
            for overline, line in song_lines:
                barlines = list(re.finditer(r"(\|\|)|(\|)|(\[:)|(:\])|(\[)", line))
                barline_ptns = []
                for i, (start_match, end_match) in enumerate(group_pairs(barlines)):
                    # If the bar has zero length, it's just two barlines
                    #  next to each other: ignore
                    if start_match.end() == end_match.start():
                        continue
                    barline_ptns.append(start_match.start())
                    # Get the upper and lower parts of this bar
                    if i == len(barlines) - 2:
                        # If this is the last bar on the line, go to the end
                        overbar = overline[start_match.start() - 2 :]
                    else:
                        overbar = overline[start_match.start() - 2 : end_match.start()]
                    overbar_cnt = overbar.strip()
                    if len(overbar_cnt) < 2:
                        overbar_cnt = ""
                    bar = line[start_match.end() : end_match.start()]

                    # We might lose some timing information at this point,
                    #  but it's not really worth trying to get
                    chords = [str(c) for c in bar.split() if c != "/"]
                    bars.append(chords)

                    # Check the starting barline for a repeat
                    barline = line[start_match.start() : start_match.end()]
                    end_barline = line[end_match.start() : end_match.end()]
                    # If we're starting a repeat, note that it starts here
                    if barline == "[:":
                        open_repeats.append(len(bars) - 1)
                    # If we're ending a repeat, copy in the repeated bars
                    if end_barline == ":]":
                        if len(open_repeats) == 0:
                            print "Unmatched open repeat in %s" % song_name
                            raise ChordSequenceParseError
                        repeat_start = open_repeats.pop()
                        bars.extend(bars[repeat_start:])

                    if overbar_cnt.startswith("__"):
                        overbar_cnt = overbar_cnt[2:].lstrip()
                    elif overbar_cnt.startswith("_"):
                        overbar_cnt = overbar_cnt[1:].lstrip()
                    if len(overbar_cnt):
                        alt_end = alt_end_re.match(overbar_cnt)
                        if alt_end:
                            print "alt end", alt_end.groups()[0]
                        else:
                            print overbar_cnt
                    ## TODO: deal with alternative endings (in the overbar)

        except ChordSequenceParseError:
            continue
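The repeat handling keeps a stack of open "[:" positions and, on ":]", copies in every bar since the matching opening. A minimal trace of that logic on made-up bars:

bars = [['C'], ['F'], ['G7']]
open_repeats = [0]          # a "[:" was seen before bar 0
# on reaching ":]":
repeat_start = open_repeats.pop()
bars.extend(bars[repeat_start:])
# -> [['C'], ['F'], ['G7'], ['C'], ['F'], ['G7']]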
Example 13
def get_vanilla_book():
    """
    Downloads the whole of the Vanilla Book: 
    L{http://www.ralphpatt.com/Song.html}.
    
    """
    from BeautifulSoup import BeautifulSoup
    from urllib2 import urlopen
    from urllib import quote
    from urlparse import urljoin
    import re
    from jazzparser.utils.base import group_pairs
    
    #~ raise NotImplementedError, "not finished writing this"
    
    INDEX_PAGE = 'http://www.ralphpatt.com/Song.html'
    SONG_BASE = 'http://www.ralphpatt.com/'
    # The overbar alternative ending marker
    alt_end_re = re.compile(r'(\d+).(_+)')
    
    # Fetch the referring page and parse it
    soup = BeautifulSoup(urlopen(INDEX_PAGE).read())
    # Pull out all the links
    links = soup.findAll("a")
    # Get just the links to songs: all in VB/
    song_links = [l['href'] for l in links if l.has_key("href") and \
                                        l['href'].startswith("VB/")]
    
    for song_link in song_links:
        url = "%s%s" % (SONG_BASE, song_link)
        song_soup = BeautifulSoup(urlopen(url).read())
        # The song's name is in the title tag
        song_name = song_soup.title.string.strip()
        print song_name
        # The chords are in a pre tag
        chord_text = ''.join(song_soup.body.pre.findAll(text=True))
        # Remove the key line
        lines = chord_text.split("\n")
        start_line = 0
        for i,line in enumerate(lines):
            if line.lower().startswith("key"):
                # Found the key line: ignore everything up to here
                start_line = i+1
                break
        else:
            # No key line found!
            print "No key line for %s" % song_name
            continue
        lines = lines[start_line:]
        
        # Find the chord lines: they start with | or [
        song_lines = []
        for i,line in enumerate(lines):
            if line.startswith("[") or line.startswith("|"):
                song_lines.append((lines[i-1], lines[i]))
        
        try:
            bars = []
            bar_ranges = []
            open_repeats = []
            for overline,line in song_lines:
                barlines = list(re.finditer(r"(\|\|)|(\|)|(\[:)|(:\])|(\[)", line))
                barline_ptns = []
                for i,(start_match,end_match) in enumerate(group_pairs(barlines)):
                    # If the bar has zero length, it's just two barlines 
                    #  next to each other: ignore
                    if start_match.end() == end_match.start():
                        continue
                    barline_ptns.append(start_match.start())
                    # Get the upper and lower parts of this bar
                    if i == len(barlines) - 2:
                        # If this is the last bar on the line, go to the end
                        overbar = overline[start_match.start()-2:]
                    else:
                        overbar = overline[start_match.start()-2:end_match.start()]
                    overbar_cnt = overbar.strip()
                    if len(overbar_cnt) < 2:
                        overbar_cnt = ""
                    bar = line[start_match.end():end_match.start()]
                    
                    # We might lose some timing information at this point,
                    #  but it's not really worth trying to get
                    chords = [str(c) for c in bar.split() if c != "/"]
                    bars.append(chords)
                    
                    # Check the starting barline for a repeat
                    barline = line[start_match.start():start_match.end()]
                    end_barline = line[end_match.start():end_match.end()]
                    # If we're starting a repeat, note that it starts here
                    if barline == "[:":
                        open_repeats.append(len(bars)-1)
                    # If we're ending a repeat, copy in the repeated bars
                    if end_barline == ":]":
                        if len(open_repeats) == 0:
                            print "Unmatched open repeat in %s" % song_name
                            raise ChordSequenceParseError
                        repeat_start = open_repeats.pop()
                        bars.extend(bars[repeat_start:])
                
                    if overbar_cnt.startswith("__"):
                        overbar_cnt = overbar_cnt[2:].lstrip()
                    elif overbar_cnt.startswith("_"):
                        overbar_cnt = overbar_cnt[1:].lstrip()
                    if len(overbar_cnt):
                        alt_end = alt_end_re.match(overbar_cnt)
                        if alt_end:
                            print "alt end", alt_end.groups()[0]
                        else:
                            print overbar_cnt
                    ## TODO: deal with alternative endings (in the overbar)
                    
        except ChordSequenceParseError:
            continue
Example 14
 def train_transition_distribution(self, inputs, grammar, contprob=0.3):
     """
     Train the transition distribution parameters in a supervised manner, 
     using chord corpus input.
     
     This is used as an initialization step to set transition parameters 
     before running EM on unannotated data.
     
     @type inputs: L{jazzparser.data.input.AnnotatedDbBulkInput}
     @param inputs: annotated chord training data
     @type contprob: float or string
     @param contprob: probability mass to reserve for staying on the 
         same state (self transitions). Use special value 'learn' to 
         learn the probabilities from the durations
     
     """
     self.add_history(
             "Training transition probabilities using %d annotated chord "\
             "sequences" % len(inputs))
     learn_cont = contprob == "learn"
     
     # Prepare the label sequences that we'll train on
     if learn_cont:
         # Repeat values with a duration > 1
         sequences = []
         for seq in inputs:
             sequence = []
             last_cat = None
             for chord,cat in zip(seq, seq.categories):
                 # Put it in once for each duration
                 for i in range(chord.duration):
                     sequence.append((chord,cat))
             sequences.append(sequence)
     else:
         sequences = [list(zip(sequence, sequence.categories)) for \
                                 sequence in inputs]
     
     # Prepare a list of transformations to apply to the categories
     label_transform = {}
     # First include all the categories we want to keep as they were
     for schema in self.schemata:
         label_transform[schema] = (schema, 0)
     # Then include any transformations the grammar defines
     for pos,mapping in grammar.equiv_map.items():
         label_transform[pos] = (mapping.target.pos, mapping.root)
     
     # Apply the transformation to all the training data
     training_samples = []
     for chord_cats in sequences:
         seq_samples = []
         for chord,cat in chord_cats:
             # Transform the label if it has a transformation
             if cat in label_transform:
                 use_cat, alter_root = label_transform[cat]
             else:
                 use_cat, alter_root = cat, 0
             root = (chord.root + alter_root) % 12
             seq_samples.append((str(use_cat), root))
         training_samples.append(seq_samples)
     
     training_data = sum([
         [(cat0, cat1, (root1 - root0) % 12)
                 for ((cat0,root0),(cat1,root1)) in \
                     group_pairs(seq_samples)] \
             for seq_samples in training_samples], [])
     
     # Count up the observations
     schema_transition_counts = ConditionalFreqDist()
     root_transition_counts = ConditionalFreqDist()
     for (label0, label1, root_change) in training_data:
         # Only use counts for categories the model's looking for
         if label0 in self.schemata and label1 in self.schemata:
             schema_transition_counts[label0].inc(label1)
             root_transition_counts[(label0,label1)].inc(root_change)
     
     # Transition probability to final state (end of sequence)
     for sequence in training_samples:
         # Inc the count of going from the label the sequence ends on to 
         #  the final state
         schema_transition_counts[sequence[-1][0]].inc(None)
         
     # Use Laplace (plus one) smoothing
     # We don't use the laplace_estimator because we want the conversion 
     #  to a dict prob dist to get all the labels, not just to discount 
     #  the ones it's seen
     for label0 in self.schemata:
         for label1 in self.schemata:
             for root_change in range(12):
                 # Exclude self-transition for now, unless we're learning it
                 if learn_cont or not (label0 == label1 and root_change == 0):
                     schema_transition_counts[label0].inc(label1)
                     root_transition_counts[(label0,label1)].inc(root_change)
             # We don't add a count for going to the final state: we don't 
             #  want to initialize it with too much weight
     
     # Estimate distribution from this frequency distribution
     schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(\
             ConditionalProbDist(schema_transition_counts, mle_estimator, None), \
                 mutable=True, samples=self.schemata+[None])
     root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(\
             ConditionalProbDist(root_transition_counts, mle_estimator, None), \
                 mutable=True, samples=range(12))
     
     if not learn_cont:
         # Discount all probabilities to allow for self-transition probs.
         # The transition model is factored into schema and root-change 
         #  distributions, so the reserved mass is applied to each part
         discount = logprob(1.0 - contprob)
         self_prob = logprob(contprob)
         for label0 in self.schemata:
             # Give saved prob mass to self-transitions
             schema_trans_dist[label0].update(label0, self_prob)
             root_trans_dist[(label0,label0)].update(0, self_prob)
             
             # Discount all other transitions to allow for this
             for label1 in self.schemata:
                 if label1 != label0:
                     # Discount non self schema transitions
                     schema_trans_dist[label0].update(label1, \
                         schema_trans_dist[label0].logprob(label1) + \
                         discount)
             for root_change in range(1, 12):
                 # Discount non-zero root movements for the self pair
                 root_trans_dist[(label0,label0)].update(root_change, \
                     root_trans_dist[(label0,label0)].logprob(root_change) + \
                     discount)
     
     # Recreate the dict prob dist so it's not mutable any more
     schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(schema_trans_dist)
     root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(root_trans_dist)
     
     ## Now for the initial distribution
     # Count up the observations
     initial_counts = FreqDist()
     for sequence in training_samples:
         initial_counts.inc(sequence[0][0])
     # Use Laplace (plus one) smoothing
     #for label in self.schemata:
     #    initial_counts.inc(label)
     
     # Estimate distribution from this frequency distribution
     initial_dist = prob_dist_to_dictionary_prob_dist(\
                 mle_estimator(initial_counts, None), samples=self.schemata)
     
     # Replace the model's transition distributions
     self.schema_transition_dist = schema_trans_dist
     self.root_transition_dist = root_trans_dist
     self.initial_state_dist = initial_dist
     # Invalidate the cache
     self.clear_cache()
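The training triples pair each schema transition with the root movement between the two chords, modulo 12. For example:

seq_samples = [('T', 0), ('D', 7), ('T', 0)]    # (schema, root) samples
triples = [(cat0, cat1, (root1 - root0) % 12)
               for ((cat0, root0), (cat1, root1)) in group_pairs(seq_samples)]
# -> [('T', 'D', 7), ('D', 'T', 5)]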
Example 15
    def train(self, sequences, grammar=None, logger=None):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
        
        N = self.options['n']
        backoff = self.options['backoff']
        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name
        
        # Get data in the form of lists of (observation,tag) pairs
        training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat) \
                                for ((c1,c2),c1cat) in zip(group_pairs(seq, none_final=True),seq.categories)]
                                    for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that 
        #  theoretically could occur, not just those that are seen - 
        #  we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = sum([["%d-%s" % (interval,chord) for chord in chord_types] for interval in range(12)], [])
        
        # Ignore unlabelled data
        ignores = ['']
        
        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}
        
        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
                            self.options['n'],
                            training_data,
                            label_dom,
                            emission_dom=emission_dom,
                            cutoff=self.options['cutoff'],
                            backoff_order=self.options['backoff'],
                            estimator=self.options['estimator'],
                            ignore_list=ignores,
                            backoff_kwargs=backoff_kwargs)
        
        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(training_data),
                'samples' : len(sum(training_data, [])),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.chordmap_name,
            }
Example 16
 def __init__(self, grammar, input, options={}, dict_cutoff=5, *args, **kwargs):
     super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs)
     process_chord_input(self)
     
     if type(self) == CandcTagger:
         raise NotImplementedError, "Tried to instantiate CandcTagger "\
             "directly. You should use one of its subclasses."
     self.tag_batch_ratio = self.options['batch']
     model = self.options['model'].split('.')
     
     # Check that candc is available for supertagging
     if not os.path.exists(settings.CANDC.BASE_PATH):
         raise CandcConfigurationError, "The C&C parser base "\
             "directory %s does not exist" % settings.CANDC.BASE_PATH
     if not os.path.exists(settings.CANDC.MODELS_PATH):
         raise CandcConfigurationError, "The C&C parser models "\
             "directory %s does not exist" % settings.CANDC.MODELS_PATH
     candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
     if not os.path.exists(candc_cmd):
         raise CandcConfigurationError, "The C&C supertagger command "\
             "%s does not exist. Have you built it?" % candc_cmd
     # Check the model exists
     candc_model = os.path.join(settings.CANDC.MODELS_PATH, *(model))
     if not os.path.exists(candc_model):
         raise CandcConfigurationError, "The C&C model given (%s) "\
             "doesn't exist." % candc_model
     
     # Create a logger to dump the output to
     logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
     candc_logger = create_logger(filename=logfile)
     self.logger.info("Logging C&C output to %s" % logfile)
     # Note in the log what we're trying to tag
     candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input]))
     
     # Read in the list of tags to smooth over
     self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))
     
     # Read in extra options
     opts_filename = os.path.join(candc_model, "jpopts")
     if not os.path.exists(opts_filename):
         self.extra_opts = {}
     else:
         with open(opts_filename, 'r') as opts_file:
             self.extra_opts = dict(
                 [line.strip("\n").split(":", 1) 
                     for line in opts_file.readlines()])
     # Pull the chord mapping out of the options
     self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))
     
     # Spawn a process to do the tagging
     candc_command = [candc_cmd, "--model", candc_model, 
                     "--dict_cutoff", "%d" % dict_cutoff]+self.extra_args
     self.tagger = Popen(candc_command, 
                         stdin=PIPE, stdout=PIPE, stderr=PIPE)
     candc_logger.info("C&C command: %s" % " ".join(candc_command))
         
     self.tokens = self.input
     # Build some observations from the tokens
     observations = [
         interval_observation_from_chord_string_pair(ch1,ch2,type_mapping=self.chordmap) 
             for ch1,ch2 in group_pairs(self.tokens+[None])
     ]
     # Add a dummy POS tag to each input item
     self.observations = ["%s|C" % t for t in observations]
     candc_logger.info("Input: %s" % " ".join(self.observations))
     
     # Run the tagger on this input
     try:
         tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations))
     except OSError, err:
         logger.error("Could not run the C&C supertagger (%s)" % err)
         candc_logger.error("Error: %s" % err)
         # Output the actual error that the command returned
         error = self.tagger.stderr.read()
         logger.error("C&C returned the error: %s" % error)
         candc_logger.error("C&C error: %s" % error)
         raise CandcTaggingError, "error running the C&C supertagger: %s" % error
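The jpopts file is parsed as simple key:value lines, split on the first colon only, so values may themselves contain colons. For instance (option names here are made up, except chordmap, which the code reads):

lines = ["chordmap:small\n", "extra:a:b\n"]
extra_opts = dict([line.strip("\n").split(":", 1) for line in lines])
# -> {'chordmap': 'small', 'extra': 'a:b'}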
Example 17
 def __init__(self, grammar, input, options={}, *args, **kwargs):
     """
     Tags using an ngram model backed by NLTK.
     
     """
     super(NgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
     process_chord_input(self)
     
     #### Tag the input sequence ####
     self._tagged_data = []
     self._batch_ranges = []
     # Group the input into pairs to get observations
     inpairs = group_pairs(self.input, none_final=True)
     # Convert the pairs into observations
     observations = [observation_from_chord_pair(pair[0], pair[1], self.model.chordmap) for pair in inpairs]
     
     # Use the ngram model to get tag probabilities for each input by 
     # computing the forward probability matrix
     if self.options['decode'] == "viterbi":
         probabilities = self.model.viterbi_probabilities(observations)
     elif self.options['decode'] == "forward":
         probabilities = self.model.forward_probabilities(observations)
     else:
         probabilities = self.model.forward_backward_probabilities(observations)
         
     word_tag_probs = []
     
     for index,probs in enumerate(probabilities):
         features = {
             'duration' : self.durations[index],
             'time' : self.times[index],
         }
         word_signs = []
         # Now assign a probability to each tag, given the observation
         for tag in self.model.tags:
             # Read a full sign out of the grammar
             sign = self.grammar.get_sign_for_word_by_tag(self.input[index], tag, extra_features=features)
             if sign is not None:
                 # Read off the probability from the matrix
                 probability = probs[tag]
                 word_signs.append((sign, tag, probability))
         
         # Randomly sort the list first to make sure equal probabilities are randomly ordered
         word_signs = [(sign, tag, prob) for sign,tag,prob in word_signs]
         random.shuffle(word_signs)
         # Now sort by probability
         word_signs = list(reversed(sorted(word_signs, key=lambda x:x[2])))
         self._tagged_data.append(word_signs)
         
         # Store the list of probabilities for tags, which we'll use 
         #  after we've tagged every word to work out the sizes
         #  of the tag batches
         word_tag_probs.append([p for __,__,p in word_signs])
     
     if self.options['best']:
         # Only return one for each word
         self._batch_ranges = [[(0,1)] for i in range(len(self.input))]
     else:
         # Work out the number of tags to return in each batch
         batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
         # So far, this has assigned a probability to every possible 
         #  tag. We don't want the tagger ever to return the least 
          #  probable batch of tags, unless it's the only one.
         #batch_sizes = [batches[:-1] if len(batches) > 1 else batches for batches in batch_sizes]
         # Transform these into a form that's easier to use for getting the signs
         self._batch_ranges = [[(sum(batches[:i]),sum(batches[:i+1])) for i in range(len(batches))] \
                                 for batches in batch_sizes]
Example 18
                 'messages' : messages,
                 'time' : timer.get_time(),
             })
             return response
 else:
     # Parsed successfully
     # Do some postprocessing and return to the main function
 
     # Output audio files from the harmonical
     if (options.harmonical is not None or \
             options.enharmonical is not None) and len(results) > 0:
         path = grammar.formalism.sign_to_coordinates(results[0])
         # Assuming we used a temporal formalism, the times should be 
         #  available as a list from the semantics
         times = results[0].semantics.get_path_times()
          point_durations = [t1 - t0 for (t0, t1) in group_pairs(times)] + [0]
         # Get 3d coordinates as well
         path3d = zip(add_z_coordinates(path, pitch_range=2), point_durations)
         path2d = zip(path,point_durations)
         # Get chord types out of the input
         chords = tagger.get_string_input()
         chord_durs = [tagger.get_word_duration(i) for i in range(tagger.input_length)]
         chord_types = [(Chord.from_name(c).type,dur) for c,dur in zip(chords,chord_durs)]
         
         if options.midi:
             # Maybe set this as a CL option or a setting
             # 73 - flute
             # 0  - piano
             # 4  - e-piano
             instrument = 73
             # TODO: make these filenames different for multiple inputs