Example no. 1
 def test_load_cached(self):
     """
     Loads a grammar using L{jazzparser.grammar.get_grammar} and checks
     that loading it again returns the same instance.
     
     """
     g1 = get_grammar()
     g2 = get_grammar()
     self.assertIs(g1, g2)
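The caching this test relies on is not shown here; a minimal sketch of how a get_grammar-style memoized loader is commonly implemented (the loading routine below is a hypothetical stand-in):

_grammar_cache = {}

def _load_grammar_from_disk(name):
    # Hypothetical stand-in for the real grammar-loading routine
    return object()

def get_grammar(name=None):
    # None selects the default grammar; repeated calls with the same
    #  name return the same instance, which is what the test asserts
    if name not in _grammar_cache:
        _grammar_cache[name] = _load_grammar_from_disk(name)
    return _grammar_cache[name]

assert get_grammar() is get_grammar()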
Example no. 2
    def get_gold_semantics(self):
        """
        Tries to return a gold standard semantics. In some cases this is 
        stored along with the results in C{gold_parse}. In others this is 
        not available, but a gold annotated chord sequence is: then we 
        can get the gold semantics by parsing the annotations. Note that 
        this might take a little bit of time.
        
        In other cases neither is available. Then C{None} will be returned.
        
        """
        from jazzparser.evaluation.parsing import parse_sequence_with_annotations

        if self.gold_parse is not None:
            return self.gold_parse
        elif self.gold_sequence is not None:
            # Parse the annotations to get a semantics
            try:
                gold_parses = parse_sequence_with_annotations(
                    self.gold_sequence, grammar=get_grammar(), allow_subparses=False
                )
                if len(gold_parses) != 1:
                    # This shouldn't happen, since allow_subparses was False
                    return None
                # Got a result: return its semantics
                return gold_parses[0].semantics
            except ParseError:
                # Could not parse annotated sequence
                return None
        else:
            return None
Example no. 3
 def get_gold_semantics(self):
     """
     Tries to return a gold standard semantics. In some cases this is 
     stored along with the results in C{gold_parse}. In others this is 
     not available, but a gold annotated chord sequence is: then we 
     can get the gold semantics by parsing the annotations. Note that 
     this might take a little bit of time.
     
     In other cases neither is available. Then C{None} will be returned.
     
     """
     from jazzparser.evaluation.parsing import parse_sequence_with_annotations
     
     if self.gold_parse is not None:
         return self.gold_parse
     elif self.gold_sequence is not None:
         # Parse the annotations to get a semantics
         try:
             gold_parses = parse_sequence_with_annotations(
                                                 self.gold_sequence, 
                                                 grammar=get_grammar(),
                                                 allow_subparses=False)
             if len(gold_parses) != 1:
                 # This shouldn't happen, since allow_subparses was False
                 return None
             # Got a result: return its semantics
             return gold_parses[0].semantics
         except ParseError:
             # Could not parse annotated sequence
             return None
     else:
         return None
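Since this method can return C{None}, callers must guard for the missing-gold case; a hedged usage sketch (the evaluation step is hypothetical):

# filename: path to a parse results file, as in the later examples
res = ParseResults.from_file(filename)
gold = res.get_gold_semantics()
if gold is None:
    # Neither a stored gold parse nor parsable annotations were available
    print "No gold semantics available"
else:
    score = evaluate_against_gold(gold)   # hypothetical evaluation step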
Example no. 4
 def train(name, training_data, options, grammar=None, logger=None):
     if grammar is None:
         grammar = get_grammar()
     if logger is None:
         logger = create_dummy_logger()
     
     # If cat_bins wasn't given, read it from the grammar
     if options["cat_bins"]:
         cat_bins = options["cat_bins"]
     elif grammar.max_categories:
         cat_bins = grammar.max_categories
     else:
         # Nothing given in the grammar either: error
         raise ValueError, "no value was given for cat_bins and the "\
             "grammar doesn't supply one"
     
     # Create a new model with empty distributions
     model = HalfspanPcfgModel(
                 name,
                 cutoff = options['cutoff'], 
                 cat_bins = cat_bins, 
                 estimator = options['estimator'], 
                 lexical = options['lexical'], 
                 chordmap = options['chord_mapping'],
                 grammar = grammar)
     
     # Add counts to this model for each sequence
     for sequence in training_data:
         try:
             model._sequence_train(sequence)
         except ModelTrainingError, err:
             logger.warn("Error training on %s: %s" % (sequence.string_name, 
                                                       err))
Example no. 5
 def __init__(self, input, options={}, grammar=None, *args, **kwargs):
     super(HmmPathBuilder, self).__init__(input, options, *args, **kwargs)
     process_chord_input(self)
     
     if grammar is None:
         self.grammar = get_grammar()
     else:
         self.grammar = grammar
     
     #### Tag the input sequence ####
     self._tagged_data = []
     
     chord_map = self.model.model.chord_map
     if isinstance(self.wrapped_input, ChordInput):
         chords = self.wrapped_input.to_db_input().chords
         observations = [(chord.root, chord_map[chord.type]) for chord in 
                             chords]
         self.input = chords
     elif isinstance(self.wrapped_input, DbInput):
         observations = [(chord.root, chord_map[chord.type]) for chord in 
                             self.wrapped_input.chords]
     elif isinstance(self.wrapped_input, WeightedChordLabelInput):
         observations = lattice_to_emissions(input, chord_map=chord_map)
         
     # Use the ngram model to decode the most probable state paths
     # (Viterbi paths) for the input
     path_probs = self.model.viterbi_paths(observations, self.options['paths'])
     
     self._paths = [
         self.grammar.formalism.backoff_states_to_lf(zip(states,self.times))
                 for states,prob in path_probs]
     # Set the probability on each result
     for path,(states,prob) in zip(self._paths,path_probs):
         path.probability = prob
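One caveat with this signature: the default argument options={} is a single dict shared by every call that omits it. The constructor only reads from it here, so no harm is done, but the safer general idiom is a sketch like:

def __init__(self, input, options=None, grammar=None, *args, **kwargs):
    # A fresh dict per call, instead of one default instance shared
    #  across all calls that omit the argument
    if options is None:
        options = {}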
Example no. 6
def result_lengths(filename, grammar=None):
    """
    Opens the parse results file and returns the lengths of the gold standard 
    path and the top parse result's path.
    
    """
    if grammar is None:
        grammar = get_grammar()
    # Load the data in from the file
    res = ParseResults.from_file(filename)
    
    gold_parse = res.get_gold_semantics()
    if gold_parse is None:
        gold_length = 0
    else:
        # Measure the length of the gold standard
        gold_length = grammar.formalism.Evaluation.tonal_space_length(gold_parse)
    
    # Get the results in order of probability
    results = res.semantics
    if len(results) == 0:
        # No results: cannot analyse them
        return gold_length,0
    top_result = results[0][1]
    top_length = grammar.formalism.Evaluation.tonal_space_length(top_result)
        
    return gold_length, top_length
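A hedged sketch of how result_lengths might be used to compare several results files (the paths are hypothetical):

for filename in ["results/seq00.res", "results/seq01.res"]:
    gold_length, top_length = result_lengths(filename)
    print "%s: gold %d, top %d" % (filename, gold_length, top_length)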
Example no. 7
    def train(self, data, grammar=None, logger=None):
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()

        model = HmmPathNgram.train(data,
                                   self.options['estimator'],
                                   grammar,
                                   cutoff=self.options['cutoff'],
                                   chord_map=self.options['chord_mapping'],
                                   order=self.options['n'],
                                   backoff_orders=self.options['backoff'])
        self.model = model

        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(data),
                'samples' : sum([len(s) for s in data], 0),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
            }
Example no. 8
def main():
    usage = "%prog <model-name>"
    description = "Generate chord sequences from a PCFG model"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-g", "--grammar", dest="grammar", action="store", \
                        help="use the named grammar instead of the default.")
    parser.add_option("-d", "--debug", dest="debug", action="store_true", \
                        help="output debugging information during generation")
    options, arguments = parse_args_with_config(parser)

    if options.debug:
        logger = create_plain_stderr_logger(log_level=logging.DEBUG)
    else:
        logger = create_plain_stderr_logger(log_level=logging.WARN)

    if len(arguments) < 1:
        print "Specify a model name"
        sys.exit(1)
    model_name = arguments[0]

    grammar = get_grammar(options.grammar)
    PcfgModel = grammar.formalism.PcfgModel
    # Load the trained model
    model = PcfgModel.load_model(model_name)

    sequence = model.generate(logger=logger)
    if sequence is None:
        print "Model did not generate a sequence"
    else:
        print sequence
Example no. 9
    def train(name, training_data, options, grammar=None, logger=None):
        if grammar is None:
            grammar = get_grammar()
        if logger is None:
            logger = create_dummy_logger()

        # If cat_bins wasn't given, read it from the grammar
        if options["cat_bins"]:
            cat_bins = options["cat_bins"]
        elif grammar.max_categories:
            cat_bins = grammar.max_categories
        else:
            # Nothing given in the grammar either: error
            raise ValueError, "no value was given for cat_bins and the "\
                "grammar doesn't supply one"

        # Create a new model with empty distributions
        model = HalfspanPcfgModel(name,
                                  cutoff=options['cutoff'],
                                  cat_bins=cat_bins,
                                  estimator=options['estimator'],
                                  lexical=options['lexical'],
                                  chordmap=options['chord_mapping'],
                                  grammar=grammar)

        # Add counts to this model for each sequence
        for sequence in training_data:
            try:
                model._sequence_train(sequence)
            except ModelTrainingError, err:
                logger.warn("Error training on %s: %s" %
                            (sequence.string_name, err))
Example no. 10
    def __init__(self, input, options={}, grammar=None, *args, **kwargs):
        super(MidiHmmPathBuilder, self).__init__(input, options, *args, **kwargs)
        if grammar is None:
            self.grammar = get_grammar()
        else:
            self.grammar = grammar

        # Make a copy of the options that we will pass through to HmmPath
        options = self.options.copy()
        # Remove the options that the tagger doesn't need
        labeling_model_name = options.pop("labeling_model")
        latticen = options.pop("latticen")
        beam_ratio = options.pop("lattice_beam")
        viterbi = options.pop("label_viterbi")
        partition_labeler = options.pop("partition_labeler")

        # Use an HP chord labeler to label the MIDI data
        # Partition the labeling model if requested and a partition number
        #  was given for the supertagger
        if partition_labeler and "partition" in self.options and self.options["partition"] is not None:
            labeling_model_name += "%d" % self.options["partition"]

        # First run the chord labeler on the MIDI input
        # Load a labeling model
        labeler = HPChordLabeler.load_model(labeling_model_name)
        self.labeler = labeler
        # Get chord labels from the model: get a lattice of possible chords
        lattice = labeler.label_lattice(input, options={"n": latticen, "nokey": True, "viterbi": viterbi}, corpus=True)
        # Store the lattice for later reference
        self.lattice = lattice
        # Beam the lattice to get rid of very low probability labels
        lattice.apply_ratio_beam(ratio=beam_ratio)

        # Tag the lattice
        self.hmmpath = HmmPathBuilder(lattice, options, grammar, *args, **kwargs)
Example no. 11
    def train(self, data, grammar=None, logger=None):
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
        
        model = HmmPathNgram.train(data, self.options['estimator'], grammar, 
                                   cutoff=self.options['cutoff'], 
                                   chord_map=self.options['chord_mapping'],
                                   order=self.options['n'],
                                   backoff_orders=self.options['backoff'])
        self.model = model
        
        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(data),
                'samples' : sum([len(s) for s in data], 0),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
            }
Example no. 12
def result_lengths(filename, grammar=None):
    """
    Opens the parse results file and returns the lengths of the gold standard 
    path and the top parse result's path.
    
    """
    if grammar is None:
        grammar = get_grammar()
    # Load the data in from the file
    res = ParseResults.from_file(filename)

    gold_parse = res.get_gold_semantics()
    if gold_parse is None:
        gold_length = 0
    else:
        # Measure the length of the gold standard
        gold_length = grammar.formalism.Evaluation.tonal_space_length(
            gold_parse)

    # Get the results in order of probability
    results = res.semantics
    if len(results) == 0:
        # No results: cannot analyse them
        return gold_length, 0
    top_result = results[0][1]
    top_length = grammar.formalism.Evaluation.tonal_space_length(top_result)

    return gold_length, top_length
Example no. 13
def main():
    usage = "%prog <model-name>"
    description = "Generate chord sequences from a PCFG model"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-g", "--grammar", dest="grammar", action="store", \
                        help="use the named grammar instead of the default.")
    parser.add_option("-d", "--debug", dest="debug", action="store_true", \
                        help="output debugging information during generation")
    options, arguments = parse_args_with_config(parser)
    
    if options.debug:
        logger = create_plain_stderr_logger(log_level=logging.DEBUG)
    else:
        logger = create_plain_stderr_logger(log_level=logging.WARN)
    
    if len(arguments) < 1:
        print "Specify a model name"
        sys.exit(1)
    model_name = arguments[0]
    
    grammar = get_grammar(options.grammar)
    PcfgModel = grammar.formalism.PcfgModel
    # Load the trained model
    model = PcfgModel.load_model(model_name)
    
    sequence = model.generate(logger=logger)
    if sequence is None:
        print "Model did not generate a sequence"
    else:
        print sequence
Example no. 14
    def get_gold_analysis(self):
        """
        Parses the annotations to get a gold analysis.
        
        """
        from jazzparser.evaluation.parsing import parse_sequence_with_annotations
        from jazzparser.grammar import get_grammar

        parses = parse_sequence_with_annotations(self, get_grammar(), allow_subparses=False)
        return parses[0].semantics
Example no. 15
def keys_for_sequence(sequence, grammar=None):
    """
    Takes a chord sequence from the chord corpus and parses it using its
    annotations. Returns a list of the key (as a pitch class integer) for 
    each chord.
    
    Once the parse is done, this is simple to work out: every chord in a cadence
    has the same key as the resolution of the cadence, which can be read off 
    by taking the equal temperament pitch class for the tonal space point of 
    the resolution.
    
    """
    from jazzparser.evaluation.parsing import parse_sequence_with_annotations
    if grammar is None:
        grammar = get_grammar()
    # Try parsing the sequence according to the tree in the database
    sub_parses = parse_sequence_with_annotations(sequence, grammar)
    if len(sub_parses) > 1:
        # We can only continue if we got a full parse
        raise ParseError, "could not fully parse the sequence %s." % \
                sequence.string_name
    sems = sub_parses[0].semantics
    
    # Get the keys for this LF, and the times when they start
    keys = grammar.formalism.semantics_to_keys(sems)
    key_roots, change_times = zip(*keys)
    key_roots = iter(key_roots)
    change_times = iter(change_times)
    
    chords = iter(sequence)
    # Get the first key as the current key
    key = key_roots.next()
    # Ignore the first time, as it should be 0
    change_times.next()
    chord_keys = []
    try:
        # Get the next time at which we'll need to change
        next_change = change_times.next()
        
        time = 0
        for chord in sequence.chords:
            if time >= next_change:
                # Move onto the next key
                key = key_roots.next()
                next_change = change_times.next()
            # Add the next chord with the current key value
            chord_keys.append((chord, key))
            time += chord.duration
    except StopIteration:
        # No more timings left
        # Include the rest of the chords with the current key
        for chord in chords:
            chord_keys.append((chord, key))
    
    return chord_keys
Example no. 16
def keys_for_sequence(sequence, grammar=None):
    """
    Takes a chord sequence from the chord corpus and parses it using its
    annotations. Returns a list of the key (as a pitch class integer) for 
    each chord.
    
    Once the parse is done, this is simple to work out: every chord in a cadence
    has the same key as the resolution of the cadence, which can be read off 
    by taking the equal temperament pitch class for the tonal space point of 
    the resolution.
    
    """
    from jazzparser.evaluation.parsing import parse_sequence_with_annotations

    if grammar is None:
        grammar = get_grammar()
    # Try parsing the sequence according to the tree in the database
    sub_parses = parse_sequence_with_annotations(sequence, grammar)
    if len(sub_parses) > 1:
        # We can only continue if we got a full parse
        raise ParseError, "could not fully parse the sequence %s." % sequence.string_name
    sems = sub_parses[0].semantics

    # Get the keys for this LF, and the times when they start
    keys = grammar.formalism.semantics_to_keys(sems)
    key_roots, change_times = zip(*keys)
    key_roots = iter(key_roots)
    change_times = iter(change_times)

    chords = iter(sequence)
    # Get the first key as the current key
    key = key_roots.next()
    # Ignore the first time, as it should be 0
    change_times.next()
    chord_keys = []
    try:
        # Get the next time at which we'll need to change
        next_change = change_times.next()

        time = 0
        for chord in sequence.chords:
            if time >= next_change:
                # Move onto the next key
                key = key_roots.next()
                next_change = change_times.next()
            # Add the next chord with the current key value
            chord_keys.append((chord, key))
            time += chord.duration
    except StopIteration:
        # No more timings left
        # Include the rest of the chords with the current key
        for chord in chords:
            chord_keys.append((chord, key))

    return chord_keys
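The return value pairs every chord with a pitch-class key, so callers can iterate over it directly; a hedged usage sketch:

# sequence: a chord sequence from the chord corpus, as above
for chord, key in keys_for_sequence(sequence):
    # key is an equal temperament pitch class integer (0-11)
    print "%s is in key %d" % (chord, key)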
Example no. 17
    def train(self, sequences, grammar=None, logger=None):
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
        
        # We can only train on annotated chord sequence input
        if not isinstance(sequences, (DbBulkInput, AnnotatedDbBulkInput)):
            raise TaggerTrainingError, "can only train ngram-multi model "\
                "on bulk db chord input (bulk-db or bulk-db-annotated). Got "\
                "input of type '%s'" % type(sequences).__name__
        
        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}
        
        # Get all the possible pos tags from the grammar
        schemata = grammar.pos_tags
        # Build the emission domain to include all the observations that 
        #  theoretically could occur, not just those that are seen - 
        #  we might not see all interval/chord type pairs in the data.
        chord_types = list(set(self.options['chord_mapping'].values()))
        
        self.model = MultiChordNgramModel.train(
                                    sequences,
                                    schemata,
                                    chord_types,
                                    self.options['estimator'], 
                                    cutoff=self.options['cutoff'],
                                    chord_map=self.options['chord_mapping'],
                                    order=self.options['n'],
                                    backoff_orders=self.options['backoff'],
                                    backoff_kwargs=backoff_kwargs)
        
        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(sequences),
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.options['chord_mapping'].name,
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
            }
Example no. 18
 def get_gold_analysis(self):
     """
     Parses the annotations to get a gold analysis.
     
     """
     from jazzparser.evaluation.parsing import parse_sequence_with_annotations
     from jazzparser.grammar import get_grammar
     parses = parse_sequence_with_annotations(self,
                                              get_grammar(),
                                              allow_subparses=False)
     return parses[0].semantics
Example no. 19
def main():
    usage = "%prog [options] [<seq-db-file>]"
    description = "Measure the degree of ambiguity (average cats per chord) "\
        "for a grammar over a particular dataset"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-g',
                      '--grammar',
                      dest='grammar',
                      action='store',
                      help='Specify a grammar by name')
    options, arguments = parser.parse_args()

    if len(arguments) < 1:
        print "No sequence index file given: grammar stats only"
        seq_file = None
    else:
        seq_file = arguments[0]
    # Load the grammar
    grammar = get_grammar(options.grammar)

    # Some stats about ambiguity in the grammar
    table = []
    class_cats = []
    for class_name, chord_class in grammar.chord_classes.items():
        if class_name not in EXCLUDE_CLASSES:
            cats = grammar.get_signs_for_word(str(chord_class.words[0]))
            table.append([str(class_name), str(len(cats))])
            class_cats.append(len(cats))

    table.append(["Mean", "%.2f" % (float(sum(class_cats)) / len(class_cats))])
    table.append(["Std dev", "%.2f" % (std(class_cats))])
    print "Cats for each chord class:"
    pprint_table(sys.stdout, table, justs=[True, True])

    # Ambiguity stats on the dataset
    if seq_file is not None:
        seqs = SequenceIndex.from_file(arguments[0])

        counts = []
        for seq in seqs:
            for chord in seq:
                cats = grammar.get_signs_for_word(chord)
                counts.append(len(cats))

        table = []
        table.append(["Chords", str(len(counts))])
        table.append(
            ["Cats per chord",
             "%.2f" % (float(sum(counts)) / len(counts))])
        table.append(["Std dev", "%.2f" % (std(counts))])

        print
        pprint_table(sys.stdout, table, justs=[True, True])
Example no. 20
def results_alignment(top_result, gold, grammar=None):
    """
    Returns the list of alignment operations that result in the optimal alignment.
    
    @return: tuple containing the alignment and the two sequences in the form 
    that they were compared (gold, top result).
    
    """
    if grammar is None:
        grammar = get_grammar()
    # Perform the alignment
    alignment,gold_seq,result_seq = grammar.formalism.Evaluation.tonal_space_alignment(gold, top_result)
    return alignment,gold_seq,result_seq
Example no. 21
def results_alignment(top_result, gold, grammar=None):
    """
    Returns the list of alignment operations that result in the optimal alignment.
    
    @return: tuple containing the alignment and the two sequences in the form 
    that they were compared (gold, top result).
    
    """
    if grammar is None:
        grammar = get_grammar()
    # Perform the alignment
    alignment, gold_seq, result_seq = grammar.formalism.Evaluation.tonal_space_alignment(
        gold, top_result)
    return alignment, gold_seq, result_seq
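A hedged sketch of consuming the returned triple; the exact form of each alignment operation depends on the formalism's Evaluation class:

# top_result and gold as read from a ParseResults file
alignment, gold_seq, result_seq = results_alignment(top_result, gold)
print "Gold length: %d, result length: %d" % (len(gold_seq), len(result_seq))
for op in alignment:
    # Each op describes one step of the optimal alignment
    print op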
Example no. 22
def generate_tag_list(filename, grammar=None):
    """
    Generates a list of possible tags to be stored along with a C&C model.
    It contains all tags that are in the grammar.
    
    """
    from jazzparser.grammar import get_grammar
    if grammar is None:
        # Load the default grammar
        grammar = get_grammar()
    tags = grammar.families.keys()
    data = "\n".join(tags)
    file = open(filename, 'w')
    file.write(data)
    file.close()
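The explicit open/write/close above leaves the file open if write() fails; an equivalent sketch using a context manager closes it in all cases:

def generate_tag_list(filename, grammar=None):
    from jazzparser.grammar import get_grammar
    if grammar is None:
        # Load the default grammar
        grammar = get_grammar()
    with open(filename, 'w') as tag_file:
        # The file is closed even if the write raises
        tag_file.write("\n".join(grammar.families.keys()))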
Example no. 23
def main():
    usage = "%prog [options] [<seq-db-file>]"
    description = "Measure the degree of ambiguity (average cats per chord) "\
        "for a grammar over a particular dataset"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-g', '--grammar', dest='grammar', action='store', help='Specify a grammar by name')
    options, arguments = parser.parse_args()
    
    if len(arguments) < 1:
        print "No sequence index file given: grammar stats only"
        seq_file = None
    else:
        seq_file = arguments[0]
    # Load the grammar
    grammar = get_grammar(options.grammar)
    
    # Some stats about ambiguity in the grammar
    table = []
    class_cats = []
    for class_name,chord_class in grammar.chord_classes.items():
        if class_name not in EXCLUDE_CLASSES:
            cats = grammar.get_signs_for_word(str(chord_class.words[0]))
            table.append([str(class_name), str(len(cats))])
            class_cats.append(len(cats))
    
    table.append(["Mean", "%.2f" % (float(sum(class_cats))/len(class_cats))])
    table.append(["Std dev", "%.2f" % (std(class_cats))])
    print "Cats for each chord class:"
    pprint_table(sys.stdout, table, justs=[True, True])
    
    # Ambiguity stats on the dataset
    if seq_file is not None:
        seqs = SequenceIndex.from_file(arguments[0])
        
        counts = []
        for seq in seqs:
            for chord in seq:
                cats = grammar.get_signs_for_word(chord)
                counts.append(len(cats))
        
        table = []
        table.append(["Chords", str(len(counts))])
        table.append(["Cats per chord", "%.2f" % (float(sum(counts)) / len(counts))])
        table.append(["Std dev", "%.2f" % (std(counts))])
        
        print
        pprint_table(sys.stdout, table, justs=[True, True])
Example no. 24
def get_depend_graph(semantics):

    # 'coord', 'xycoord', 'alpha' or 'roman'
    grammar = get_grammar()
    grammar.formalism.cl_output_options("tsformat=coord")   
    coords = zip(*grammar.formalism.semantics_to_coordinates(semantics))[0]
    funs = zip(*grammar.formalism.semantics_to_functions(semantics))[0]
    gold_seq = zip(coords, funs)

    tags = []
    for g in gold_seq:
        t = "%s,%s" % (coordinate_to_roman_name(g[0]).replace("-","").replace("b", ""), g[1])
        tags.append(t)

    gold_graph,gold_time_map = semantics_to_dependency_graph(semantics) 
    depend_graph_tags = eval("%s" % gold_graph.get_graph_pos(tags))
    gold_graph = eval("%s" % gold_graph.get_graph_index())
    return [gold_graph, depend_graph_tags]
Example no. 25
    def get_gold_analysis(self):
        """
        Parses the annotations, if present, to get a gold analysis. Unlike 
        L{AnnotatedDbInput}, this input type cannot be assumed to have 
        annotations. It will therefore not raise an error if annotations 
        are missing or incomplete, but just return None.
        
        """
        from jazzparser.evaluation.parsing import parse_sequence_with_annotations
        from jazzparser.grammar import get_grammar
        from jazzparser.parsers import ParseError

        try:
            parses = parse_sequence_with_annotations(self, get_grammar(), allow_subparses=False)
        except ParseError:
            return None
        else:
            return parses[0].semantics
Example no. 26
    def __init__(self, input, options={}, grammar=None, *args, **kwargs):
        super(MidiHmmPathBuilder, self).__init__(input, options, *args,
                                                 **kwargs)
        if grammar is None:
            self.grammar = get_grammar()
        else:
            self.grammar = grammar

        # Make a copy of the options that we will pass through to HmmPath
        options = self.options.copy()
        # Remove the options that the tagger doesn't need
        labeling_model_name = options.pop('labeling_model')
        latticen = options.pop('latticen')
        beam_ratio = options.pop('lattice_beam')
        viterbi = options.pop('label_viterbi')
        partition_labeler = options.pop('partition_labeler')

        # Use an HP chord labeler to label the MIDI data
        # Partition the labeling model if requested and a partition number
        #  was given for the supertagger
        if partition_labeler and 'partition' in self.options and \
                    self.options['partition'] is not None:
            labeling_model_name += "%d" % self.options['partition']

        # First run the chord labeler on the MIDI input
        # Load a labeling model
        labeler = HPChordLabeler.load_model(labeling_model_name)
        self.labeler = labeler
        # Get chord labels from the model: get a lattice of possible chords
        lattice = labeler.label_lattice(input,
                                        options={
                                            'n': latticen,
                                            'nokey': True,
                                            'viterbi': viterbi
                                        },
                                        corpus=True)
        # Store the lattice for later reference
        self.lattice = lattice
        # Beam the lattice to get rid of very low probability labels
        lattice.apply_ratio_beam(ratio=beam_ratio)

        # Tag the lattice
        self.hmmpath = HmmPathBuilder(lattice, options, grammar, *args,
                                      **kwargs)
Example no. 27
def main():
    usage = "%prog [<options>] <model-name>"
    description = "Delete a PCFG model"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \
                    help="Number of partitions the model is divided into")
    parser.add_option("-g",
                      "--grammar",
                      dest="grammar",
                      action="store",
                      help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)

    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    PcfgModel = grammar.formalism.PcfgModel

    if len(arguments) == 0:
        print >> sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >> sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    print "Model base name:", model_name

    if options.partitions is not None:
        parts = [(i, "%s%d" % (model_name, i))
                 for i in range(options.partitions)]
    else:
        parts = [(None, model_name)]

    # First check all the models exist
    for parti, part_model in parts:
        if part_model not in PcfgModel.list_models():
            print "The model '%s' does not exist" % part_model
            sys.exit(1)

    # Now delete them one by one
    for parti, part_model in parts:
        # Load the model
        model = PcfgModel.load_model(part_model)
        model.delete()
        print "Removed model: %s" % part_model
Example no. 28
def main():
    usage = "%prog <model-name>"
    description = "Debug a PCFG model"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-g", "--grammar", dest="grammar", action="store", \
                        help="use the named grammar instead of the default.")
    parser.add_option("-d", "--debug", dest="debug", action="store_true", \
                        help="output debugging information during generation")
    parser.add_option("--file-options", "--fopt", dest="file_options", \
                        action="store", help="options for the input file "\
                        "(--file). Type '--fopt help' for a list of available "\
                        "options.")
    options, arguments = parse_args_with_config(parser)

    if len(arguments) < 1:
        print "Specify a model name"
        sys.exit(1)
    model_name = arguments[0]

    if len(arguments) < 2:
        print "Specify an input file"
        sys.exit(1)

    grammar = get_grammar(options.grammar)
    PcfgModel = grammar.formalism.PcfgModel
    # Load the trained model
    model = PcfgModel.load_model(model_name)

    # Try getting a file from the command-line options
    input_data = command_line_input(filename=arguments[1],
                                    filetype="db",
                                    options=options.file_options)

    # Prepare the input and annotations
    sequence = input_data.sequence
    categories = [chord.category for chord in sequence.iterator()]
    str_inputs = input_data.inputs
    # Build the implicit normal-form tree from the annotations
    try:
        tree = build_tree_for_sequence(sequence)
    except TreeBuildError, err:
        raise ModelTrainingError, "could not build a tree for '%s': %s" % \
            (sequence.string_name, err)
Example no. 29
    def get_gold_analysis(self):
        """
        Parses the annotations, if present, to get a gold analysis. Unlike 
        L{AnnotatedDbInput}, this input type cannot be assumed to have 
        annotations. It will therefore not raise an error if annotations 
        are missing or incomplete, but just return None.
        
        """
        from jazzparser.evaluation.parsing import parse_sequence_with_annotations
        from jazzparser.grammar import get_grammar
        from jazzparser.parsers import ParseError

        try:
            parses = parse_sequence_with_annotations(self,
                                                     get_grammar(),
                                                     allow_subparses=False)
        except ParseError:
            return None
        else:
            return parses[0].semantics
Example no. 30
def main():
    usage = "%prog <model-name> [options]"
    description = "Outputs a summary of a named model (counts, etc)"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-g", "--grammar", dest="grammar", action="store", \
                        help="use the named grammar instead of the default.")
    options, arguments = parser.parse_args()
    
    grammar = get_grammar(options.grammar)
    PcfgModel = grammar.formalism.PcfgModel
    
    if len(arguments) == 0:
        print >>sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >>sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    # Load the trained model
    model = PcfgModel.load_model(model_name)
    
    print model.description()
Example no. 31
def main():
    usage = "%prog <model-name> [options]"
    description = "Outputs a summary of a named model (counts, etc)"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-g", "--grammar", dest="grammar", action="store", \
                        help="use the named grammar instead of the default.")
    options, arguments = parser.parse_args()

    grammar = get_grammar(options.grammar)
    PcfgModel = grammar.formalism.PcfgModel

    if len(arguments) == 0:
        print >> sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >> sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    # Load the trained model
    model = PcfgModel.load_model(model_name)

    print model.description()
Example no. 32
def main():
    usage = "%prog [<options>] <model-name>"
    description = "Delete a PCFG model"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \
                    help="Number of partitions the model is divided into")
    parser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)
    
    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    PcfgModel = grammar.formalism.PcfgModel
        
    if len(arguments) == 0:
        print >>sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >>sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    print "Model base name:", model_name
    
    if options.partitions is not None:
        parts = [(i, "%s%d" % (model_name, i)) for i in range(options.partitions)]
    else:
        parts = [(None, model_name)]

    # First check all the models exist
    for parti,part_model in parts:
        if part_model not in PcfgModel.list_models():
            print "The model '%s' does not exist" % part_model
            sys.exit(1)
    
    # Now delete them one by one
    for parti,part_model in parts:
        # Load the model
        model = PcfgModel.load_model(part_model)
        model.delete()
        print "Removed model: %s" % part_model
Example no. 33
    def __init__(self, input, options={}, grammar=None, *args, **kwargs):
        super(HmmPathBuilder, self).__init__(input, options, *args, **kwargs)
        process_chord_input(self)

        if grammar is None:
            self.grammar = get_grammar()
        else:
            self.grammar = grammar

        #### Tag the input sequence ####
        self._tagged_data = []

        chord_map = self.model.model.chord_map
        if isinstance(self.wrapped_input, ChordInput):
            chords = self.wrapped_input.to_db_input().chords
            observations = [(chord.root, chord_map[chord.type])
                            for chord in chords]
            self.input = chords
        elif isinstance(self.wrapped_input, DbInput):
            observations = [(chord.root, chord_map[chord.type])
                            for chord in self.wrapped_input.chords]
        elif isinstance(self.wrapped_input, WeightedChordLabelInput):
            observations = lattice_to_emissions(input, chord_map=chord_map)

        # Use the ngram model to decode the most probable state paths
        # (Viterbi paths) for the input
        path_probs = self.model.viterbi_paths(observations,
                                              self.options['paths'])

        self._paths = [
            self.grammar.formalism.backoff_states_to_lf(zip(
                states, self.times)) for states, prob in path_probs
        ]
        # Set the probability on each result
        for path, (states, prob) in zip(self._paths, path_probs):
            path.probability = prob
Example no. 34
    def train(self, sequences, grammar=None, logger=None):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()

        N = self.options['n']
        backoff = self.options['backoff']
        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name

        # Get data in the form of lists of (observation,tag) pairs
        training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat) \
                                for ((c1,c2),c1cat) in zip(group_pairs(seq, none_final=True),seq.categories)]
                                    for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that
        #  theoretically could occur, not just those that are seen -
        #  we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = sum(
            [["%d-%s" % (interval, chord) for chord in chord_types]
             for interval in range(12)], [])

        # Ignore unlabelled data
        ignores = ['']

        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff': self.options['backoff_cutoff']}

        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
            self.options['n'],
            training_data,
            label_dom,
            emission_dom=emission_dom,
            cutoff=self.options['cutoff'],
            backoff_order=self.options['backoff'],
            estimator=self.options['estimator'],
            ignore_list=ignores,
            backoff_kwargs=backoff_kwargs)

        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(training_data),
                'samples' : len(sum(training_data, [])),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.chordmap_name,
            }
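group_pairs(seq, none_final=True) is assumed here to pair each chord with its successor, padding the final pair with None; a hedged sketch of such a helper:

def group_pairs(seq, none_final=False):
    # [a, b, c] -> [(a, b), (b, c)], plus (c, None) when none_final is set
    items = list(seq)
    pairs = list(zip(items, items[1:]))
    if none_final and items:
        pairs.append((items[-1], None))
    return pairs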
Example no. 35
 def setUp(self):
     # Load a grammar
     self.grammar = get_grammar()
     self.coord = self.grammar.rules_by_name['coord']
Example no. 36
 def setUp(self):
     # Load a grammar
     self.grammar = get_grammar()
     self.devel = self.grammar.rules_by_name['dev']
Example no. 37
 def setUp(self):
     # Load a grammar
     self.grammar = get_grammar()
     self.coord = self.grammar.rules_by_name['coord']
Example no. 38
def main():

	features = {}
	input_files = glob.glob(PARSES_FILES)
	
	for file_results in input_files:
		# We read in the whole file (it's pickled, so we have to), but don't 
		#  keep the pres object after the loop iteration, because it can 
		#  be very big
		try:
			pres = ParseResults.from_file(file_results)
		except ParseResults.LoadError, err:
			if options.errors:
				# Print all load errors
				print >>sys.stderr, "Error loading file: %s" % (err)
			errors.append(file_results)
			continue

		print file_results
		if len(pres.semantics) == 0:
			continue
			
		top_result = pres.semantics[0][1]
		gold_result = pres.get_gold_semantics()
		if gold_result is None:
			# Skip files for which no gold standard is available
			continue

		# 'coord', 'xycoord', 'alpha' or 'roman'
		grammar = get_grammar()
		grammar.formalism.cl_output_options("tsformat=coord")	
		coords = zip(*grammar.formalism.semantics_to_coordinates(gold_result))[0]
		funs = zip(*grammar.formalism.semantics_to_functions(gold_result))[0]
		gold_seq = zip(coords, funs)

		tags = []
		for g in gold_seq:
			t = "%s,%s" % (coordinate_to_roman_name(g[0]), g[1])
			tags.append(t)

		gold_graph,gold_time_map = semantics_to_dependency_graph(gold_result)	
		depend_graph = eval("%s" % gold_graph.get_graph_pos(tags))
		gold_graph = eval("%s" % gold_graph.get_graph_index())

		# Words
		for g in gold_graph:
			word1 = g[0].split(",")
			uni_word = "UNIGRAM:"+str(word1[0])
			# Count occurrences of this feature (first sighting counts as 1)
			features[uni_word] = features.get(uni_word, 0) + 1

		for dep in depend_graph:
			word1 = dep[0].split(",")
			uni_word = "UNIGRAM:"+str(word1[0])
			features[uni_word] = features.get(uni_word, 0) + 1

		# Tags
		for dep in depend_graph:
			word1 = dep[0].split(",")
			uni_tag = "UNIGRAM:"+str(word1[1])
			features[uni_tag] = features.get(uni_tag, 0) + 1

		# Bigram Words
		for g in gold_graph:
			word1 = g[0].split(",")
			if g[1] == "ROOT":
				bigram_word = "BIGRAM:"+str(word1[0])+":ROOT"
			else:
				word2 = g[1].split(",")
				bigram_word = "BIGRAM:"+str(word1[0])+":"+str(word2[0])
			features[bigram_word] = features.get(bigram_word, 0) + 1

		for dep in depend_graph:
			word1 = dep[0].split(",")
			if dep[1] == "ROOT":
				bigram_word = "BIGRAM:"+str(word1[0])+":ROOT"
			else:
				word2 = dep[1].split(",")
				bigram_word = "BIGRAM:"+str(word1[0])+":"+str(word2[0])
			features[bigram_word] = features.get(bigram_word, 0) + 1

		# Bigram Tags
		for dep in depend_graph:
			word1 = dep[0].split(",")
			if dep[1] == "ROOT":
				bigram_tag = "BIGRAM:"+str(word1[1])+":ROOT"
			else:
				word2 = dep[1].split(",")
				bigram_tag = "BIGRAM:"+str(word1[1])+":"+str(word2[1])
			features[bigram_tag] = features.get(bigram_tag, 0) + 1

		# Bigram Words/Tags
		for dep in depend_graph:
			word1 = dep[0].split(",")
			if dep[1] == "ROOT":
				bigram_words_tags = "BIGRAM:"+str(word1[0])+":"+str(word1[1])+":ROOT"
			else:
				word2 = dep[1].split(",")
				bigram_words_tags = "BIGRAM:"+str(word1[0])+":"+str(word1[1])+":"+str(word2[0])+":"+str(word2[1])
			features[bigram_words_tags] = features.get(bigram_words_tags, 0) + 1

		# Trigram words
		for i in range(len(gold_graph)):
			if gold_graph[i][1] == "ROOT":
				# Get trigram
				if gold_graph[i-1][1] != "ROOT" and gold_graph[i-2][1] != "ROOT":
					head_root_word = gold_graph[i][0].split(",")[0]
					head_i1_word = gold_graph[i-1][0].split(",")[0]
					head_i2_word = gold_graph[i-2][0].split(",")[0]
					trigram_word = "TRIGRAM:" + head_root_word + ":" + head_i1_word + ":" + head_i2_word
					features[trigram_word] = features.get(trigram_word, 0) + 1

		for i in range(len(depend_graph)):
			if depend_graph[i][1] == "ROOT":
				# Get trigram
				if depend_graph[i-1][1] != "ROOT" and depend_graph[i-2][1] != "ROOT":
					head_root_word = depend_graph[i][0].split(",")[0]
					head_i1_word = depend_graph[i-1][0].split(",")[0]
					head_i2_word = depend_graph[i-2][0].split(",")[0]
					trigram_word = "TRIGRAM:" + head_root_word + ":" + head_i1_word + ":" + head_i2_word
					features[trigram_word] = features.get(trigram_word, 0) + 1

		# Trigram tags
		for i in range(len(depend_graph)):
			if depend_graph[i][1] == "ROOT":
				# Get trigram
				if depend_graph[i-1][1] != "ROOT" and depend_graph[i-2][1] != "ROOT":
					head_root_tag = depend_graph[i][0].split(",")[1]
					head_i1_tag = depend_graph[i-1][0].split(",")[1]
					head_i2_tag = depend_graph[i-2][0].split(",")[1]
					trigram_tag = "TRIGRAM:" + head_root_tag + ":" + head_i1_tag + ":" + head_i2_tag
					features[trigram_tag] = features.get(trigram_tag, 0) + 1

		# Trigram words/tags
		for i in range(len(depend_graph)):
			if depend_graph[i][1] == "ROOT":
				# Get trigram
				if depend_graph[i-1][1] != "ROOT" and depend_graph[i-2][1] != "ROOT":
					head_root = depend_graph[i][0].split(",")
					head_root_word_tag = head_root[0] + ":" + head_root[1]
					# words/tags
					head_i1 = depend_graph[i-1][0].split(",")
					head_i2 = depend_graph[i-2][0].split(",")
					head_i1_word_tag = head_i1[0] + ":" + head_i1[1]
					head_i2_word_tag = head_i2[0] + ":" + head_i2[1]

					trigram_word_tag = "TRIGRAM:" + head_root_word_tag + ":" + head_i1_word_tag + ":" + head_i2_word_tag
					features[trigram_word_tag] = features.get(trigram_word_tag, 0) + 1
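Every unigram, bigram and trigram block above performs the same counting step; a small helper, as a sketch, would remove the repetition:

def count_feature(features, name):
	# Increment the occurrence count for a feature
	features[name] = features.get(name, 0) + 1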
Example no. 39
def main():
    usage = "%prog [<options>]"
    description = "Runs a supertagger from the Jazz Parser to tag some input "\
        "but just outputs the results, rather than continuing to parse."
    optparser = OptionParser(usage=usage, description=description)
    
    # Tagger options
    optparser.add_option("-t", "--tagger", "--supertagger", dest="supertagger", action="store", help="run the parser using the named supertagger. Use '-t help' to see the list of available taggers. Default: %s" % settings.DEFAULT_SUPERTAGGER, default=settings.DEFAULT_SUPERTAGGER)
    optparser.add_option("--topt", "--tagger-options", dest="topts", action="append", help="specify options for the tagger. Type '--topt help', using '-u <name>' to select a tagger module, to get a list of options.")
    # Commonly-used misc
    optparser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.")
    # File input options
    optparser.add_option("--file", "-f", dest="file", action="store", help="use a file to get parser input from. Use --filetype to specify the type of the file.")
    optparser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file (--file). Use '--filetype help' for a list of available types. Default: chords", default='chords')
    optparser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file (--file). Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Misc options
    optparser.add_option("-v", "--debug", dest="debug", action="store_true", help="output verbose debugging information.")
    optparser.add_option("-i", "--interactive", dest="interactive", action="store_true", help="instead of just outputing all tags in one go, wait for user input between each iteration of adaptive supertagging")
    # Logging options
    optparser.add_option("--logger", dest="logger", action="store", help="directory to put parser logging in. A filename based on an identifier for each individual input will be appended.")
    # Read in command line options and args
    options, clinput = parse_args_with_config(optparser)
    
    ########################### Option processing ####################
    if options.logger:
        # Directory
        parse_logger_dir = options.logger
        check_directory(parse_logger_dir)
    else:
        parse_logger_dir = None
    
    ######## Grammar ########
    # Read in the grammar
    grammar = get_grammar(options.grammar)
        
    ######## Supertagger ########
    # Load the supertagger requested
    if options.supertagger.lower() == "help":
        print "Available taggers are: %s" % ", ".join(TAGGERS)
        return 0
    try:
        tagger_cls = get_tagger(options.supertagger)
    except TaggerLoadError:
        logger.error("The tagger '%s' could not be loaded. Possible "\
            "taggers are: %s" % (options.supertagger, ", ".join(TAGGERS)))
        return 1
        
    # Get supertagger options before initializing the tagger
    if options.topts is not None:
        toptstr = options.topts
        if "help" in [s.strip().lower() for s in toptstr]:
            # Output this tagger's option help
            from jazzparser.utils.options import options_help_text
            print options_help_text(tagger_cls.TAGGER_OPTIONS, intro="Available options for selected tagger")
            return 0
        toptstr = ":".join(toptstr)
    else:
        toptstr = ""
    topts = ModuleOption.process_option_string(toptstr)
    # Check that the options are valid
    try:
        tagger_cls.check_options(topts)
    except ModuleOptionError, err:
        print "Problem with tagger options (--topt): %s" % err
        return 1
Example no. 40
 def setUp(self):
     # Load a grammar
     self.grammar = get_grammar()
     self.fapply = self.grammar.rules_by_name['appf']
     self.bapply = self.grammar.rules_by_name['appb']
Example no. 41
def main():
    usage = "%prog [<options>]"
    description = "Runs a supertagger from the Jazz Parser to tag some input "\
        "but just outputs the results, rather than continuing to parse."
    optparser = OptionParser(usage=usage, description=description)

    # Tagger options
    optparser.add_option(
        "-t",
        "--tagger",
        "--supertagger",
        dest="supertagger",
        action="store",
        help=
        "run the parser using the named supertagger. Use '-t help' to see the list of available taggers. Default: %s"
        % settings.DEFAULT_SUPERTAGGER,
        default=settings.DEFAULT_SUPERTAGGER)
    optparser.add_option(
        "--topt",
        "--tagger-options",
        dest="topts",
        action="append",
        help=
        "specify options for the tagger. Type '--topt help', using '-u <name>' to select a tagger module, to get a list of options."
    )
    # Commonly-used misc
    optparser.add_option("-g",
                         "--grammar",
                         dest="grammar",
                         action="store",
                         help="use the named grammar instead of the default.")
    # File input options
    optparser.add_option(
        "--file",
        "-f",
        dest="file",
        action="store",
        help=
        "use a file to get parser input from. Use --filetype to specify the type of the file."
    )
    optparser.add_option(
        "--filetype",
        "--ft",
        dest="filetype",
        action="store",
        help=
        "select the file type for the input file (--file). Use '--filetype help' for a list of available types. Default: chords",
        default='chords')
    optparser.add_option(
        "--file-options",
        "--fopt",
        dest="file_options",
        action="store",
        help=
        "options for the input file (--file). Type '--fopt help', using '--ft <type>' to select file type, for a list of available options."
    )
    # Misc options
    optparser.add_option("-v",
                         "--debug",
                         dest="debug",
                         action="store_true",
                         help="output verbose debugging information.")
    optparser.add_option(
        "-i",
        "--interactive",
        dest="interactive",
        action="store_true",
        help=
        "instead of just outputing all tags in one go, wait for user input between each iteration of adaptive supertagging"
    )
    # Logging options
    optparser.add_option(
        "--logger",
        dest="logger",
        action="store",
        help=
        "directory to put parser logging in. A filename based on an identifier for each individual input will be appended."
    )
    # Read in command line options and args
    options, clinput = parse_args_with_config(optparser)

    ########################### Option processing ####################
    if options.logger:
        # Directory
        parse_logger_dir = options.logger
        check_directory(parse_logger_dir)
    else:
        parse_logger_dir = None

    ######## Grammar ########
    # Read in the grammar
    grammar = get_grammar(options.grammar)

    ######## Supertagger ########
    # Load the supertagger requested
    if options.supertagger.lower() == "help":
        print "Available taggers are: %s" % ", ".join(TAGGERS)
        return 0
    try:
        tagger_cls = get_tagger(options.supertagger)
    except TaggerLoadError:
        logger.error("The tagger '%s' could not be loaded. Possible "\
            "taggers are: %s" % (options.supertagger, ", ".join(TAGGERS)))
        return 1

    # Get supertagger options before initializing the tagger
    if options.topts is not None:
        toptstr = options.topts
        if "help" in [s.strip().lower() for s in toptstr]:
            # Output this tagger's option help
            from jazzparser.utils.options import options_help_text
            print options_help_text(
                tagger_cls.TAGGER_OPTIONS,
                intro="Available options for selected tagger")
            return 0
        toptstr = ":".join(toptstr)
    else:
        toptstr = ""
    topts = ModuleOption.process_option_string(toptstr)
    # Check that the options are valid
    try:
        tagger_cls.check_options(topts)
    except ModuleOptionError, err:
        print "Problem with tagger options (--topt): %s" % err
        return 1
Example n. 42
def build_tree_for_sequence(sequence, debug_stack=False, grammar=None, logger=None):
    """
    Run through the motions of parsing the sequence in order to build 
    its tree structure. Most of the structure is implicit in the 
    lexical categories. Additional information is given in the TreeInfo
    model, associated with chords.
    
    """
    # Read in the possible categories from the grammar
    if grammar is None:
        grammar = get_grammar()
    # This function will format a string and output it to a logger if logging
    if logger is None:
        def _log(*args):
            pass
    else:
        def _log(string, *args):
            string = string % args
            logger.info(string)
    
    input = []
    shift_reduce = []
    
    categories = []
    for chord in sequence.iterator():
        # Try getting a family for the specified category
        if chord.category is None or chord.category == "":
            category = None
            cat_name = None
        else:
            if chord.category not in grammar.families:
                raise TreeBuildError, "Could not find the category %s in "\
                    "the lexicon" % chord.category
            # Assume there's only one entry per family, or at least that if 
            #  there are multiple they have the same argument structure.
            category = grammar.families[chord.category][0].entries[0].sign.category
            cat_name = chord.category
        # Put the generalized form of the category into the stack
        gen_cat = generalize_category(category, grammar.formalism)
        # Attach a tree leaf to this chord
        gen_cat.tree = SyntacticTerminal(chord, category=cat_name)
        input.append(gen_cat)
        categories.append("%s <= %s" % (chord,category))
    _log("CATEGORIES %s", categories)
        
    input = list(reversed(input))
    stack = []
    rules = [ compf, compb, appf, appb, cont ]
    # Now do the vague pseudo-parse
    while len(input) > 0:
        # SHIFT
        shift_reduce.append("S")
        stack.append(input.pop())
        if debug_stack:
            print stack
        _log("SHIFT stack = %s, input = %s", stack, input)
        # Use the additional information given to us to override default
        #  rule applications
        coord_unresolved = False
        coord_resolved = False
        if stack[-1].tree.chord.treeinfo.coord_unresolved:
            # This is the end of the first part of a coordination.
            # Continue reducing, but add a special marker afterwards
            coord_unresolved = True
        if stack[-1].tree.chord.treeinfo.coord_resolved:
            # The end of the second part of a coordination.
            # Continue reducing, then apply coordination
            coord_resolved = True
            
        # REDUCE
        # Try combining the top categories on the stack
        changed = True
        while changed:
            changed = False
            # Try each rule and see whether it applies
            for rule in rules:
                res = rule(stack)
                if res:
                    shift_reduce.append("R(%s)" % rule.name)
                    changed = True
                    _log("REDUCE %s, stack = %s", rule.name, stack)
        
        if coord_resolved:
            # Try to reduce the coordination
            coord(stack)
        if coord_unresolved:
            # Add a special marker to the stack so we know where the 
            #  coordination began
            stack.append(CoordinationMiddleMarker())
    for cat in stack:
        if isinstance(cat, CoordinationMiddleMarker):
            raise TreeBuildError, "Coordination middle marker not "\
                "matched by an end marker. Stack: %s" % strs(stack, ", ")
    tree = SyntacticTreeRoot([cat.tree for cat in stack], shift_reduce=shift_reduce)
    return tree
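
A minimal usage sketch for the function above, assuming an annotated sequence loaded from a SequenceIndex file as in the training scripts in this collection (the file name is hypothetical, and it is assumed that SyntacticTreeRoot keeps the shift_reduce list it is constructed with):

# Hypothetical usage of build_tree_for_sequence
seqs = SequenceIndex.from_file("annotated_seqs.db")   # assumed file name
tree = build_tree_for_sequence(seqs.sequences[0])
# The root records the shift-reduce operations taken during the pseudo-parse
print tree.shift_reduce    # e.g. ['S', 'S', 'R(appb)', ...]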
Example n. 43
def main():
    usage = "%prog [<options>] <model-name> <training-input>"
    description = "Training of PCFG models."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \
        help="Number of partitions to divide the data into. "\
            "For train, divides the input file, trains a model on each "\
            "partition's complement and appends partition number to "\
            "the model names. For del, appends partition numbers to model "\
            "names and deletes all the models. Recache does similarly. "\
            "Has no effect for parse.")
    parser.add_option(
        '--opts',
        dest="training_opts",
        action="store",
        help=
        "options to pass to the model trainer. Type '--opts help' for a list of options"
    )
    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="Output verbose logging information to stderr")
    parser.add_option("-g",
                      "--grammar",
                      dest="grammar",
                      action="store",
                      help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)

    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.WARN
    # Create a logger for training
    logger = create_logger(log_level=log_level, name="training", stderr=True)

    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    PcfgModel = grammar.formalism.PcfgModel

    # Parse the option string
    if options.training_opts is None:
        opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(PcfgModel.TRAINING_OPTIONS,
                                intro="Training options for PCFGs")
        sys.exit(0)
    else:
        opts = ModuleOption.process_option_dict(
            ModuleOption.process_option_string(options.training_opts),
            PcfgModel.TRAINING_OPTIONS)

    if len(arguments) == 0:
        print >> sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >> sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    print "Model base name:", model_name

    if options.partitions is not None:
        parts = [(i, "%s%d" % (model_name, i))
                 for i in range(options.partitions)]
    else:
        parts = [(None, model_name)]

    if len(arguments) < 2:
        print >> sys.stderr, "Specify an input file to read sequence data from"
        sys.exit(1)
    # Read in the training data from the given file
    seqs = SequenceIndex.from_file(arguments[1])

    if options.partitions is not None:
        # Prepare each training partition
        datasets = holdout_partition(seqs.sequences, options.partitions)
    else:
        datasets = [seqs.sequences]

    for dataset, (parti, part_model) in zip(datasets, parts):
        # Train the named model on the sequence data
        model = PcfgModel.train(part_model,
                                dataset,
                                opts,
                                grammar=grammar,
                                logger=logger)
        model.save()
        print "Trained model", part_model
Example n. 44
def main():
    usage = "%prog [options] <seq-file>"
    description = "Parses a sequence from a sequence index file using the "\
        "annotations stored in the same file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "--popt",
        "--parser-options",
        dest="popts",
        action="append",
        help=
        "specify options for the parser. Type '--popt help' to get a list of options (we use a DirectedCkyParser)"
    )
    parser.add_option("--derivations",
                      "--deriv",
                      dest="derivations",
                      action="store_true",
                      help="print out derivation traces of all the results")
    parser.add_option("--index",
                      "-i",
                      dest="index",
                      action="store",
                      type="int",
                      help="parse just the sequence with this index")
    parser.add_option("--quiet",
                      "-q",
                      dest="quiet",
                      action="store_true",
                      help="show only errors in the output")
    parser.add_option(
        "--tonal-space",
        "--ts",
        dest="tonal_space",
        action="store_true",
        help="show the tonal space path (with -q, shows only paths)")
    parser.add_option(
        "--output-set",
        "-o",
        dest="output_set",
        action="store",
        help="store the analyses to a tonal space analysis set with this name")
    parser.add_option(
        "--trace-parse",
        "-t",
        dest="trace_parse",
        action="store_true",
        help=
        "output a trace of the shift-reduce parser's operations in producing the full interpretation from the annotations"
    )
    options, arguments = parser.parse_args()

    if len(arguments) < 1:
        print "You must specify a sequence file"
        sys.exit(1)

    if options.popts is not None:
        poptstr = options.popts
        if "help" in [s.strip().lower() for s in poptstr]:
            # Output the parser's option help
            print options_help_text(
                DirectedCkyParser.PARSER_OPTIONS,
                intro="Available options for the directed parser")
            return 0
        poptstr = ":".join(poptstr)
    else:
        poptstr = ""
    popts = ModuleOption.process_option_string(poptstr)

    grammar = get_grammar()
    if options.quiet:
        logger = create_plain_stderr_logger(log_level=logging.ERROR)
    else:
        logger = create_plain_stderr_logger()

    if options.trace_parse:
        parse_logger = logger
    else:
        parse_logger = None

    seq_index = SequenceIndex.from_file(arguments[0])
    # Get the chord sequence(s)
    if options.index is None:
        seqs = seq_index.sequences
    else:
        seqs = [seq_index.sequence_by_index(options.index)]
    logger.info("%d sequences\n" % len(seqs))

    full_analyses = []
    stats = {
        'full': 0,
        'partial': 0,
        'fail': 0,
    }
    # Try parsing every sequence
    for seq in seqs:
        logger.info("====== Sequence %s =======" % seq.string_name)
        try:
            results = parse_sequence_with_annotations(
                seq, grammar, logger=logger, parse_logger=parse_logger)
        except ParseError, err:
            logger.error("Error parsing: %s" % err)
            stats['fail'] += 1
        else:
            # This may have resulted in multiple partial parses
            logger.info("%d partial parses" % len(results))

            if len(results) == 1:
                stats['full'] += 1
            else:
                stats['partial'] += 1

            if options.derivations:
                # Output the derivation trace for each partial parse
                for result in results:
                    print
                    print result.derivation_trace

            if options.tonal_space:
                # Output the tonal space coordinates
                path = grammar.formalism.sign_to_coordinates(results[0])
                for i, point in enumerate(path):
                    print "%d, %d: %s" % (seq.id, i, point)

            # Only include a result in the output analyses if it was a full parse
            if len(results) == 1:
                full_analyses.append((seq.string_name, results[0].semantics))
            else:
                logger.warn("%s was not included in the output analyses, "\
                    "since it was not fully parsed" % seq.string_name)
Example n. 45
def main():
    usage = "%prog [options] <consistency-data>"
    description = "Evaluates annotator consistency."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-m", "--metric", dest="metric", action="store",
        help="semantics distance metric to use. Use '-m help' for a list of "\
            "available metrics")
    parser.add_option("--mopt", "--metric-options", dest="mopts",
        action="append",
        help="options to pass to the semantics metric. Use with '--mopt help' "\
            "with -m to see available options")
    parser.add_option("-f", "--f-score", dest="f_score", action="store_true",
        help="outputs recall, precision and f-score for an f-score-based "\
            "metric. Just uses the same metric 3 times with output=recall, "\
            "etc. Will only work with appropriate metrics")
    options, arguments = parser.parse_args()

    grammar = get_grammar()

    if options.metric is not None:
        use_metric = True
        if options.f_score:
            # Special case: get 3 metrics
            metrics = []
            opts = options.mopts or []
            for opt in ["output=precision", "output=recall", "output=f"]:
                metrics.append(
                    command_line_metric(formalism, options.metric,
                                        opts + [opt]))
            print "Evaluating precision, recall and f-score on %s" % metrics[
                0].name
        else:
            # Get a metric according to the options
            metrics = [
                command_line_metric(formalism, options.metric, options.mopts)
            ]
            print "Evaluating using metric: %s" % metrics[0].name
    else:
        use_metric = False

    if len(arguments) < 1:
        print >> sys.stderr, "Specify a consistency data file"
        sys.exit(1)
    filename = arguments[0]

    consdata = ConsistencyData.from_file(filename)

    # Count up matching annotations
    matches = 0
    chords = 0
    for ann1, ann2 in consdata:
        for chord1, chord2 in zip(ann1, ann2):
            chords += 1
            if chord1.category == chord2.category:
                matches += 1
    # Count matching coordination points
    rean_coords = sum(sum(
                    [1 for crd in seq if crd.treeinfo.coord_unresolved])
                        for seq,gs in consdata) + \
                  sum(sum(
                    [1 for crd in seq if crd.treeinfo.coord_resolved])
                        for seq,gs in consdata)
    gold_coords = sum(sum(
                    [1 for crd in gs if crd.treeinfo.coord_unresolved])
                        for seq,gs in consdata) + \
                  sum(sum(
                    [1 for crd in gs if crd.treeinfo.coord_resolved])
                        for seq,gs in consdata)
    match_coords = sum(sum(
                    [1 for crdr,crdg in zip(seq,gs) if
                                            crdr.treeinfo.coord_unresolved
                                            and crdg.treeinfo.coord_unresolved])
                        for seq,gs in consdata) + \
                   sum(sum(
                    [1 for crdr,crdg in zip(seq,gs) if
                                            crdr.treeinfo.coord_resolved
                                            and crdg.treeinfo.coord_resolved])
                        for seq,gs in consdata)
    # Compute precision, recall and f-score from this
    precision = 100.0 * (matches + match_coords) / (chords + rean_coords)
    recall = 100.0 * (matches + match_coords) / (chords + gold_coords)
    fscore = 2.0 * precision * recall / (precision + recall)
    print "%d chords" % chords
    print "\nCategory and coordination accuracy:"
    print "Precision: %.2f" % precision
    print "Recall: %.2f" % recall
    print "F-score: %.2f" % fscore

    if use_metric:
        print

        def _parse_seq(seq):
            # Parse the annotations to get a semantics
            try:
                gold_parses = parse_sequence_with_annotations(
                    DbInput.from_sequence(seq),
                    grammar=grammar,
                    allow_subparses=False)
                # Got a result: return its semantics
                return gold_parses[0].semantics
            except ParseError, err:
                # Could not parse annotated sequence
                print >>sys.stderr, "Could not parse sequence '%s': %s" % \
                                                        (seq.string_name, err)
                return

        # Prepare pairs of gold-standard parse results from the two annotations
        sem_pairs = [(_parse_seq(ann1), _parse_seq(ann2))
                     for (ann1, ann2) in consdata]
        # Compute the distance using the metrics
        for metric in metrics:
            distance = metric.total_distance(sem_pairs)
            print "%s: %s" % (metric.identifier.capitalize(),
                              metric.format_distance(distance))
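
To make the accuracy arithmetic above concrete, a worked example with hypothetical counts (not drawn from any real annotation data):

# 100 chords with 90 matching category annotations; 5 coordination points
# in the reannotation, 4 in the gold standard, 3 marked in both
matches, chords = 90, 100
match_coords, rean_coords, gold_coords = 3, 5, 4
precision = 100.0 * (matches + match_coords) / (chords + rean_coords)  # ~88.57
recall = 100.0 * (matches + match_coords) / (chords + gold_coords)     # ~89.42
fscore = 2.0 * precision * recall / (precision + recall)               # ~89.00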
Example n. 47
    def setUp(self):
        # Load a grammar
        self.grammar = get_grammar()
        self.devel = self.grammar.rules_by_name['dev']
Example n. 48
def main():
    usage = "%prog [options] <results-files>"
    description = """\
Read in a ParseResults file, just like result_alignment.py. Examines the \
errors that were made and outputs them in context.
"""
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--window", "-w", dest="window", action="store", type="int", help="size of context window to show before and after each error. Default: 2", default=2)
    parser.add_option("--distance", "--dist", dest="distance", action="store_true", help="show the total distance travelled in the tonal space by the result and the gold standard")
    parser.add_option("--output-opts", "--oopts", dest="output_opts", action="store", help="options that affect the output formatting. Use '--output-opts help' for a list of options.")
    parser.add_option("--summary-threshold", dest="summary_threshold", action="store", type="int", help="how many times a substitution/insertion/deletion needs to have happened to be including in the summary (default: 4)", default=4)
    options, arguments = parser.parse_args()
        
    if len(arguments) == 0:
        print >>sys.stderr, "Specify at least one file to read the results from"
        sys.exit(1)
        
    grammar = get_grammar()
    grammar.formalism.cl_output_options(options.output_opts)
        
    # Size of window of context to show
    win = options.window
    
    errors = []
    unscored_files = []
    scored = 0
    unscored = 0
    result_lengths = []
    gold_lengths = []
    insertions = {}
    deletions = {}
    substitutions = {}
    error_types = {}
    for filename in arguments:
        try:
            top_result, gold_result = get_top_result(filename)
        except ParseResults.LoadError, err:
            print >>sys.stderr, "Error loading file: %s" % (err)
            errors.append(filename)
            continue
        else:
            print "============================="
            print "File: %s" % filename
            if top_result is None:
                # No alignment was found
                unscored += 1
                print "No result"
            else:
                # Wrap these up as a semantics, since some functions need that as input
                Sems = grammar.formalism.Semantics.Semantics
                top_sems, gold_sems = Sems(top_result), Sems(gold_result)
                
                # Do the alignment of the top result and gold result
                alignment,gold_seq,result_seq = results_alignment(top_result, gold_result)
                scored += 1
                # Get the actual list of coordinates
                coords = zip(*grammar.formalism.semantics_to_coordinates(gold_sems))[0]
                funs = zip(*grammar.formalism.semantics_to_functions(gold_sems))[0]
                gold_coords = zip(coords, funs)
                
                coords = zip(*grammar.formalism.semantics_to_coordinates(top_sems))[0]
                funs = zip(*grammar.formalism.semantics_to_functions(top_sems))[0]
                result_coords = zip(coords, funs)
                
                print "Result length: %d, gold length: %d" % \
                        (len(result_coords), len(gold_coords))
                result_lengths.append(len(result_coords))
                gold_lengths.append(len(gold_coords))
                
                if options.distance:
                    # Work out the total distance travelled
                    start, end = gold_coords[-1][0], gold_coords[0][0]
                    gold_vect = end[0] - start[0], end[1] - start[1]
                    # And for the actual result
                    start, end = result_coords[-1][0], result_coords[0][0]
                    result_vect = end[0] - start[0], end[1] - start[1]
                    print "Distance travelled:"
                    print "  Gold result:", gold_vect
                    print "  Top result: ", result_vect
                    print
                
                # Put together a table of error windows
                table = [
                    # Header row
                    ["", "Step", "", "Result", "Gold"]
                ]
                
                gold = iter(zip(gold_seq,gold_coords))
                result = iter(zip(result_seq,result_coords))
                context = []
                post_context = 0
                unseen = 0
                for op in alignment:
                    # Keep a record of how many of each error occur
                    if op not in error_types:
                        error_types[op] = 1
                    else:
                        error_types[op] += 1
                    
                    if op == "A":
                        # Aligned pair
                        # Move both sequences on
                        gold_step,gold_point = gold.next()
                        result_step,result_point = result.next()
                        if post_context > 0:
                            # Show this as part of the post-context of an error
                            table.append(["A", str(gold_step), "", str(result_point), str(gold_point)])
                            context = []
                            post_context -= 1
                        else:
                            # Add this to the rolling window of pre-context
                            if len(context) >= win:
                                # We've not shown something here
                                unseen += 1
                            if win > 0:
                                context.append((gold_step, gold_point, result_step, result_point))
                                context = context[-win:]
                    else:
                        # Mark if there was something we didn't show
                        if unseen:
                            table.append(["", "   ...%d..." % unseen, "", "", ""])
                            unseen = 0
                        if context:
                            # Show the error's pre-context
                            for (pre_gold_step,pre_gold_point,__,pre_result_point) in context:
                                table.append(["A", str(pre_gold_step), "", str(pre_result_point), str(pre_gold_point)])
                            context = []
                        
                        if op == "I":
                            # Inserted in the result
                            result_step,result_point = result.next()
                            table.append(["I", str(result_step), "", str(result_point), ""])
                            if str(result_step) not in insertions:
                                insertions[str(result_step)] = 1
                            else:
                                insertions[str(result_step)] += 1
                        elif op == "D":
                            # Deleted in the result
                            gold_step,gold_point = gold.next()
                            table.append(["D", str(gold_step), "", "", str(gold_point)])
                            if str(gold_step) not in deletions:
                                deletions[str(gold_step)] = 1
                            else:
                                deletions[str(gold_step)] += 1
                        else:
                            # Substituted
                            result_step, result_point = result.next()
                            gold_step, gold_point = gold.next()
                            table.append([str(op), str(result_step), "for %s" % str(gold_step), str(result_point), str(gold_point)])
                            subst_key = "%s > %s" % (gold_step, result_step)
                            if subst_key not in substitutions:
                                substitutions[subst_key] = 1
                            else:
                                substitutions[subst_key] += 1
                        # After anything other than an alignment, cancel the 
                        #  context window
                        context = []
                        # Show up to <win> in the post-context of alignments
                        post_context = win
                # Mark if there was something at the end we didn't show
                if unseen:
                    table.append(["", "   ...%d..." % unseen, "", "", ""])
                # Print out the table
                pprint_table(sys.stdout, table, justs=[True,True,True,True,True])
        
        print "\n"
Example n. 49
def main():
    usage = "%prog [options] <results-files> <index>"
    description = "Prints a dependency tree for a parse result"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-l", "--latex", dest="latex", action="store_true", help="output Latex for the graphs using tikz-dependency")
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file (--file). Type '--fopt help' for a list of available options.")
    options, arguments = parser.parse_args()
        
    if len(arguments) < 1:
        print >>sys.stderr, "Specify a file to read the results from"
        sys.exit(1)
    filename = arguments[0]
    if len(arguments) < 2:
        print >>sys.stderr, "Specify an of the sequence to load"
        sys.exit(1)
    index = int(arguments[1])
    
    grammar = get_grammar()
    
    # We always need an index, so this is given as an argument
    # Put it in the options list for loading the file
    fopts = options.file_options
    if fopts and len(fopts):
        fopts += ":index=%d" % index
    else:
        fopts = "index=%d" % index
    # Load the sequence index file
    dbinput = command_line_input(filename=filename, filetype="db", options=fopts)
    
    name = dbinput.name
    
    anal = parse_sequence_with_annotations(dbinput, grammar)[0]
    graph, time_map = semantics_to_dependency_graph(anal.semantics)
    
    # Join together chords that are on the same dependency node
    times = iter(sorted(time_map.values()))
    dep_time = times.next()
    current_chord = []
    joined_chords = []
    finished = False
    for chord_time,chord in sorted(dbinput.sequence.time_map.items()):
        if chord_time >= dep_time and not finished:
            if len(current_chord):
                joined_chords.append(current_chord)
            current_chord = [chord]
            try:
                dep_time = times.next()
            except StopIteration:
                finished = True
        else:
            current_chord.append(chord)
    joined_chords.append(current_chord)
    
    chords = [" ".join(filter_latex(str(crd)) for crd in item) 
                                                for item in joined_chords]
    annotations = [" ".join(filter_latex(crd.category) for crd in item) 
                                                for item in joined_chords]
    graph.words = annotations
    
    if options.latex:
        # Exit with status 1 if we don't output anything
        exit_status = 1
        
        # Output a full Latex document in one go
        if name is not None:
            title = r"""\title{%s}
\author{}
\date{}""" % name.capitalize()
            maketitle = r"\maketitle\thispagestyle{empty}\vspace{-20pt}"
        else:
            title = ""
            maketitle = ""
        
        # Print the header
        print r"""\documentclass[a4paper]{article}
\usepackage{tikz-dependency}
%% You may need to set paperheight (for width) and paperwidth (for height) to get things to fit
\usepackage[landscape,margin=1cm,paperheight=50cm]{geometry}
\pagestyle{empty}

%(title)s

\begin{document}
%(maketitle)s

\tikzstyle{every picture}+=[remember picture]
\centering

""" % \
        { 'title' : title,
          'maketitle' : maketitle }
        
        if graph is not None:
            exit_status = 0
            print dependency_graph_to_latex(graph, 
                                            fmt_lab=_fmt_label,
                                            extra_rows=[chords])
            print "\n\\vspace{15pt}"
        
        # Finish off the document
        print r"""
\end{document}
"""
        sys.exit(exit_status)
    else:
        # Not outputting Latex
        print graph
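
The joining loop above attaches each chord to the most recent dependency-graph time, so several chords can share one node. A worked sketch of the effect with hypothetical beat numbers:

# Dependency nodes at beats 0, 4 and 8; chords at beats 0, 2, 4, 6 and 8
dep_times = [0, 4, 8]
chord_times = [0, 2, 4, 6, 8]
groups = [[t for t in chord_times if lo <= t < hi]
          for lo, hi in zip(dep_times, dep_times[1:] + [max(chord_times) + 1])]
print groups    # [[0, 2], [4, 6], [8]]: one chord group per dependency node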
Example n. 54
    def train(self, sequences, grammar=None, logger=None):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
        
        N = self.options['n']
        backoff = self.options['backoff']
        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name
        
        # Get data in the form of lists of (observation,tag) pairs
        training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat) \
                                for ((c1,c2),c1cat) in zip(group_pairs(seq, none_final=True),seq.categories)]
                                    for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that 
        #  theoretically could occur, not just those that are seen - 
        #  we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = sum([["%d-%s" % (interval,chord) for chord in chord_types] for interval in range(12)], [])
        
        # Ignore unlabelled data
        ignores = ['']
        
        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}
        
        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
                            self.options['n'],
                            training_data,
                            label_dom,
                            emission_dom=emission_dom,
                            cutoff=self.options['cutoff'],
                            backoff_order=self.options['backoff'],
                            estimator=self.options['estimator'],
                            ignore_list=ignores,
                            backoff_kwargs=backoff_kwargs)
        
        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(training_data),
                'samples' : len(sum(training_data, [])),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.chordmap_name,
            }
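
The emission domain above is deliberately built over every interval/chord-type pair rather than only those seen in training, since unseen pairs may still occur at tagging time. A small sketch of its contents (the chord types here are hypothetical; the real ones come from the chord mapping):

chord_types = ["M", "m", "7"]    # hypothetical mapped chord types
emission_dom = sum([["%d-%s" % (interval, chord) for chord in chord_types]
                    for interval in range(12)], [])
print len(emission_dom)    # 36 = 12 root intervals x 3 chord types
print emission_dom[:4]     # ['0-M', '0-m', '0-7', '1-M']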
Example n. 55
    def train(self, inputs, grammar=None, logger=None):
        """
        @type inputs: L{jazzparser.data.input.MidiTaggerTrainingBulkInput} or 
            list of L{jazzparser.data.input.Input}s
        @param inputs: training MIDI data. Annotated chord sequences should 
            also be given (though this is optional) by loading a 
            bulk db input file in the MidiTaggerTrainingBulkInput.
        
        """
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
            
        if len(inputs) == 0:
            # No data - nothing to do
            return
        
        # Check the type of one of the inputs - no guarantee they're all the 
        #  same, but there's something seriously weird going on if they're not
        input_type = detect_input_type(inputs[0], allowed=['segmidi'])
        # Get the chord training data too if it's been given
        if isinstance(inputs, MidiTaggerTrainingBulkInput) and \
                inputs.chords is not None:
            chord_inputs = inputs.chords
        else:
            chord_inputs = None
        
        # Initialize the emission distribution for chord classes
        self.hmm = ChordClassHmm.initialize_chord_classes(
                    self.options['ccprob'],
                    self.options['maxnotes'],
                    grammar,
                    metric=self.options['metric'],
                    illegal_transitions=self.options['illegal_transitions'],
                    fixed_root_transitions=self.options['fixed_roots'])
        
        if chord_inputs:
            # If chord training data was given, initially train transition 
            #  distribution from this
            self.hmm.add_history("Training initial transition distribution "\
                                    "from annotated chord data")
            self.hmm.train_transition_distribution(chord_inputs, grammar, \
                                        contprob=self.options['contprob'])
        else:
            # Otherwise it gets left as a uniform distribution
            self.hmm.add_history("No annotated chord training data given. "\
                    "Transition distribution initialized to uniform.")
        
        # Get a Baum-Welch trainer to do the EM retraining
        # Pull out the options to pass to the trainer
        bw_opt_names = [opt.name for opt in ChordClassBaumWelchTrainer.OPTIONS]
        bw_opts = dict([(name,val) for (name,val) in self.options.items() \
                        if name in bw_opt_names])
        retrainer = ChordClassBaumWelchTrainer(self.hmm, options=bw_opts)
        # Prepare a callback that saves the model during training
        def save_callback():
            self.save()
        # Do the Baum-Welch training
        retrainer.train(inputs, logger=logger, save_callback=save_callback)
        
        self.model_description = """\
Initial chord class emission prob: %(ccprob)f
Initial self-transition prob: %(contprob)s
Metrical model: %(metric)s
""" % \
            {
                'ccprob' : self.options['ccprob'],
                'metric' : self.options['metric'],
                'contprob' : self.options['contprob'],
            }
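A minimal invocation sketch for the trainer above; the loader function is
hypothetical, since only train's own signature appears in this listing:

# Hypothetical sketch -- load_training_midi is a stand-in name
inputs = load_training_midi("train_set")  # a MidiTaggerTrainingBulkInput,
                                          #  optionally carrying .chords
tagger_model.train(inputs)                # grammar=None loads the default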
Example no. 56
0
def build_tree_for_sequence(sequence,
                            debug_stack=False,
                            grammar=None,
                            logger=None):
    """
    Run through the motions of parsing the sequence in order to build 
    its tree structure. Most of the structure is implicit in the 
    lexical categories. Additional information is given in the TreeInfo
    model, associated with chords.
    
    """
    # Read in the possible categories from the grammar
    if grammar is None:
        grammar = get_grammar()
    # _log formats a message and passes it to the logger, if one was given
    if logger is None:
        def _log(*args):
            pass
    else:
        def _log(string, *args):
            logger.info(string % args)

    input = []
    shift_reduce = []

    categories = []
    for chord in sequence.iterator():
        # Try getting a family for the specified category
        if chord.category is None or chord.category == "":
            category = None
            cat_name = None
        else:
            if chord.category not in grammar.families:
                raise TreeBuildError, "Could not find the category %s in "\
                    "the lexicon" % chord.category
            # Assume there's only one entry per family, or at least that if
            #  there are multiple they have the same argument structure.
            category = grammar.families[
                chord.category][0].entries[0].sign.category
            cat_name = chord.category
        # Put the generalized form of the category into the input list
        gen_cat = generalize_category(category, grammar.formalism)
        # Attach a tree leaf to this chord
        gen_cat.tree = SyntacticTerminal(chord, category=cat_name)
        input.append(gen_cat)
        categories.append("%s <= %s" % (chord, category))
    _log("CATEGORIES %s", categories)

    input = list(reversed(input))
    stack = []
    rules = [compf, compb, appf, appb, cont]
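    #  Presumably forward/backward composition (compf/compb), forward/
    #  backward application (appf/appb) and continuation (cont); the
    #  expansions are inferred from the names, not stated in this listing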
    # Now do the vague pseudo-parse
    while len(input) > 0:
        # SHIFT
        shift_reduce.append("S")
        stack.append(input.pop())
        if debug_stack:
            print stack
        _log("SHIFT stack = %s, input = %s", stack, input)
        # Use the additional information given to us to override default
        #  rule applications
        coord_unresolved = False
        coord_resolved = False
        if stack[-1].tree.chord.treeinfo.coord_unresolved:
            # This is the end of the first part of a coordination.
            # Continue reducing, but add a special marker afterwards
            coord_unresolved = True
        if stack[-1].tree.chord.treeinfo.coord_resolved:
            # The end of the second part of a coordination.
            # Continue reducing, then apply coordination
            coord_resolved = True

        # REDUCE
        # Try combining the top categories on the stack
        changed = True
        while changed:
            changed = False
            # Try each rule and see whether it applies
            for rule in rules:
                res = rule(stack)
                if res:
                    shift_reduce.append("R(%s)" % rule.name)
                    changed = True
                    _log("REDUCE %s, stack = %s", rule.name, stack)

        if coord_resolved:
            # Try to reduce the coordination
            coord(stack)
        if coord_unresolved:
            # Add a special marker to the stack so we know where the
            #  coordination began
            stack.append(CoordinationMiddleMarker())
    for cat in stack:
        if isinstance(cat, CoordinationMiddleMarker):
            raise TreeBuildError, "Coordination middle marker not "\
                "matched by an end marker. Stack: %s" % strs(stack, ", ")
    tree = SyntacticTreeRoot([cat.tree for cat in stack],
                             shift_reduce=shift_reduce)
    return tree
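A short usage sketch, grounded in the function above: debug_stack prints the
stack after each shift, and the sequence object must expose .iterator() and
per-chord .category and .treeinfo, as used in the body:

from jazzparser.grammar import get_grammar

# seq: an annotated chord sequence, as described above
tree = build_tree_for_sequence(seq, debug_stack=True, grammar=get_grammar())
print tree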
Example no. 57
0
def main():
    set_proc_title("jazzparser")
    ########################################################
    usage = "jazzparser [<options>]"
    description = "The main parser interface for the Jazz Parser"
    ## Process the input options
    optparser = OptionParser(usage=usage, description=description)
    ###
    # File input options
    group = OptionGroup(optparser, "Input", "Input type and location")
    optparser.add_option_group(group)
    group.add_option("--file", "-f", dest="file", action="store", help="use a file to get parser input from. Use --filetype to specify the type of the file.")
    group.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file (--file). Use '--filetype help' for a list of available types. Default: chords", default='chords')
    group.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file (--file). Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    group.add_option("--index", "--indices", dest="input_index", action="store", help="select individual inputs to process. Specify as a comma-separated list of indices. All inputs are loaded as usual, but only the ith input is processed, for each i in the list")
    group.add_option("--only-load", dest="only_load", action="store_true", help="don't do anything with the inputs, just load and list them. Handy for checking the inputs load and getting their indices")
    group.add_option("--partitions", dest="partitions", action="store", type="int", help="divide the input data into this number of partitions and use a different set of models for each. For any parser, tagger and backoff that takes a 'model' argument, the partition number will be appended to the given value")
    group.add_option("--seq-parts", "--sequence-partitions", dest="sequence_partitions", action="store", help="use a chord sequence index to partition the inputs. Input type (bulk) must support association of the inputs with chord sequences by id. Sequences in the given sequence index file are partitioned n ways (--partitions) and the inputs are processed according to their associated sequence.")
    group.add_option("--continue", "--skip-done", dest="skip_done", action="store_true", help="skip any inputs for which a readable results file already exists. This is useful for continuing a bulk job that was stopped in the middle")
    ###
    group = OptionGroup(optparser, "Parser", "Parser, supertagger and backoff parser")
    optparser.add_option_group(group)
    group.add_option("-d", "--derivations", dest="derivations", action="store_true", help="keep derivation logs during parse.")
    group.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.")
    # Parser options
    group.add_option("-p", "--parser", dest="parser", action="store", help="use the named parser algorithm instead of the default. Use '-p help' to see the list of available parsers. Default: %s" % settings.DEFAULT_PARSER, default=settings.DEFAULT_PARSER)
    group.add_option("--popt", "--parser-options", dest="popts", action="append", help="specify options for the parser. Type '--popt help', using '--parser <name>' to select a parser module, to get a list of options.")
    # Tagger options
    group.add_option("-t", "--tagger", "--supertagger", dest="supertagger", action="store", help="run the parser using the named supertagger. Use '-t help' to see the list of available taggers. Default: %s" % settings.DEFAULT_SUPERTAGGER, default=settings.DEFAULT_SUPERTAGGER)
    group.add_option("--topt", "--tagger-options", dest="topts", action="append", help="specify options for the tagger. Type '--topt help', using '-u <name>' to select a tagger module, to get a list of options.")
    # Backoff options
    group.add_option("-b", "--backoff", "--noparse", dest="backoff", action="store", help="use the named backoff model as a backoff if the parser produces no results")
    group.add_option("--bopt", "--backoff-options", "--backoff-options", "--npo", dest="backoff_opts", action="append", help="specify options for the  backoff model. Type '--npo help', using '--backoff <name>' to select a backoff modules, to get a list of options.")
    ###
    # Multiprocessing options
    group = OptionGroup(optparser, "Multiprocessing")
    optparser.add_option_group(group)
    group.add_option("--processes", dest="processes", action="store", type="int", help="number of processes to create to perform parses in parallel. Default: 1, i.e. no process pool. Use -1 to create a process for every input", default=1)
    ###
    # Output options
    group = OptionGroup(optparser, "Output")
    optparser.add_option_group(group)
    group.add_option("--output", dest="output", action="store", help="directory name to output parse results to. A filename specific to the individual input will be appended to this")
    group.add_option("--topn", dest="topn", action="store", type="int", help="limit the number of final results to store in the output file to the top n by probability. By default, stores all")
    group.add_option("--output-opts", "--oopts", dest="output_opts", action="store", help="options that affect the output formatting. Use '--output-opts help' for a list of options.")
    group.add_option("-a", "--atomic-results", dest="atoms_only", action="store_true", help="only include atomic categories in the results.")
    group.add_option("-l", "--latex", dest="latex", action="store_true", help="output all results as Latex source. Used to produce a whole Latex document, but doesn't any more")
    group.add_option("--all-times", dest="all_times", action="store_true", help="display all timing information on semantics in output.")
    group.add_option("-v", "--debug", dest="debug", action="store_true", help="output verbose debugging information.")
    group.add_option("--time", dest="time", action="store_true", help="time how long the parse takes and output with the results.")
    group.add_option("--no-results", dest="no_results", action="store_true", help="don't print out the parse results at the end. Obviously you'll want to make sure they're going to a file (--output). This is useful for bulk parse jobs, where the results produce a lot of unnecessary output")
    group.add_option("--no-progress", dest="no_progress", action="store_true", help="don't output the summary of completed sequences after each one finishes")
    ###
    # Output analysis and harmonical
    group = OptionGroup(optparser, "Output processing", "Output analysis and harmonical")
    optparser.add_option_group(group)
    group.add_option("--harmonical", dest="harmonical", action="store", help="use the harmonical to play the chords justly intoned according to the top result and output to a wave file.")
    group.add_option("--enharmonical", dest="enharmonical", action="store", help="use the harmonical to play the chords in equal temperament and output to a wave file.")
    group.add_option("--midi", dest="midi", action="store_true", help="generate MIDI files from the harmonical, instead of wave files.")
    group.add_option("--tempo", dest="tempo", action="store", type=int, help="tempo to use for the generated music (see --harmonical/--enharmonical). Default: 120", default=120)
    group.add_option("--lh-analysis", dest="lh_analysis", action="store_true", help="output the Longuet-Higgins space interpretation of the semantics for each result.")
    group.add_option("--lh-coordinates", dest="lh_coord", action="store_true", help="like lh-analysis, but displays the coordinates of the points instead of their names.")
    ###
    # Logging options
    group = OptionGroup(optparser, "Logging")
    optparser.add_option_group(group)
    group.add_option("--long-progress", dest="long_progress", action="store_true", help="print a summary of the chart so far after each chord/word has been processed.")
    group.add_option("--progress", "--short-progress", dest="short_progress", action="store_true", help="print a small amount of information out during parsing to indicate progress.")
    group.add_option("--logger", dest="logger", action="store", help="directory to put parser logging in. A filename based on an identifier for each individual input will be appended.")
    ###
    # Shell options
    group = OptionGroup(optparser, "Shell", "Interactive shell for inspecting results and parser state")
    optparser.add_option_group(group)
    group.add_option("-i", "--interactive", dest="interactive", action="store_true", help="enter interactive mode after parsing.")
    group.add_option("--error", dest="error_shell", action="store_true", help="catch any errors, report them and then enter the interactive shell. This also catches keyboard interrupts, so you can use it to halt parsing and enter the shell.")
    
    # Read in command line options and args
    options, clinput = parse_args_with_config(optparser)

    ########################### Option processing ####################
    
    # Get log level option first, so we can start using the logger
    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    # Set up a logger
    init_logging(log_level)
    
    if options.latex:
        settings.OPTIONS.OUTPUT_LATEX = True
    
    if options.logger:
        # Directory
        parse_logger_dir = options.logger
        check_directory(parse_logger_dir)
    else:
        parse_logger_dir = None
    
    ######## Grammar ########
    # Check the grammar actually exists
    grammar_names = get_grammar_names()
    if options.grammar is not None and options.grammar not in grammar_names:
        # This is not a valid grammar name
        logger.error("The grammar '%s' does not exist. Possible "\
            "grammars are: %s." % (options.grammar, ", ".join(grammar_names)))
        return 1
    grammar = get_grammar(options.grammar)
        
    ######## Parser ########
    # Load the requested parser
    from jazzparser.parsers import PARSERS
    if options.parser.lower() == "help":
        print "Available parsers are: %s" % ", ".join(PARSERS)
        return 0
    try:
        parser_cls = get_parser(options.parser)
    except ParserLoadError:
        logger.error("The parser '%s' could not be loaded. Possible "\
            "parsers are: %s" % (options.parser, ", ".join(PARSERS)))
        return 1
        
    # Get parser options
    if options.popts is not None:
        poptstr = options.popts
        if "help" in [s.strip().lower() for s in poptstr]:
            # Output this parser's option help
            from jazzparser.utils.options import options_help_text
            print options_help_text(parser_cls.PARSER_OPTIONS, intro="Available options for selected parser")
            return 0
        poptstr = ":".join(poptstr)
    else:
        poptstr = ""
    popts = ModuleOption.process_option_string(poptstr)
    # Check that the options are valid
    try:
        parser_cls.check_options(popts)
    except ModuleOptionError, err:
        logger.error("Problem with parser options (--popt): %s" % err)
        return 1