Example no. 1
def prepare_db_input():
    """
    Loads a sequence index file, pulls out some data and prepares it
    for use as input to the parser.
    
    This may be used by tests to get hold of data as example input.
    
    @note: Don't rely on the size of the returned tuple to stay the
    same. I may add more return items in the future, so access the
    currently returned items by index.
    
    @rtype: tuple
    @return: (sequence index, sequence, DbInput instance)
    
    """
    from jazzparser.data.db_mirrors import SequenceIndex
    from jazzparser.data.input import DbInput
    from jazzparser.settings import TEST as settings
    
    seqs = SequenceIndex.from_file(settings.SEQUENCE_DATA)
    seq = seqs.sequences[0]
    
    input_sequence = DbInput.from_sequence(seq)
    
    return seqs, seq, input_sequence
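A minimal usage sketch for the function above (hypothetical test code, not
from the source): per the @note, the returned items are accessed by index
rather than by unpacking into a fixed-size tuple.

# Hypothetical usage: index into the returned tuple, per the @note above.
data = prepare_db_input()
seqs = data[0]            # the SequenceIndex
seq = data[1]             # the sequence that was picked out
input_sequence = data[2]  # the DbInput built from that sequence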
Example no. 2
    def test_detect_input_type(self):
        # Load some input: DbInput
        dbi = DbInput.from_file(DB_SEQUENCES_FILE, {"index": 0})
        # Run it through the preprocessor
        datatype, obj = detect_input_type(dbi)
        # Get the datatype from the type name lists
        datatype2 = input_type_name(type(obj))
        self.assertEqual(datatype, datatype2)

        # Do the same with ChordInput
        ci = ChordInput.from_file(CHORDS_FILE, options={"roman": True})
        datatype, obj = detect_input_type(ci)
        datatype2 = input_type_name(type(obj))
        self.assertEqual(datatype, datatype2)

        # Try some bulk input
        bulk = DbBulkInput.from_file(DB_SEQUENCES_FILE)
        datatype, obj = detect_input_type(bulk, allow_bulk=True)
        datatype2 = input_type_name(type(obj))
        self.assertEqual(datatype, datatype2)

        # Try restricting the allowed type
        datatype, obj = detect_input_type(ci, allowed=["chords"])
        # And this one should get rejected
        self.assertRaises(InputTypeError, detect_input_type, ci, allowed=["db"])
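A short sketch of the contract this test exercises (hypothetical helper;
it assumes only the detect_input_type and input_type_name calls shown
above):

    # Hypothetical helper: report the detected type name of any input object.
    def describe_input(obj):
        datatype, processed = detect_input_type(obj, allow_bulk=True)
        print "Detected input type '%s' (class %s)" % \
                            (datatype, type(processed).__name__)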
Example no. 3
    def test_from_sequence(self):
        # Load the sequence index file
        index = SequenceIndex.from_file(DB_SEQUENCES_FILE)
        # Pick out a sequence
        seq = index.sequences[0]
        # Construct a DbInput from this sequence
        dbi = DbInput.from_sequence(seq)
Example no. 4
    def test_from_file(self):
        # Select a sequence out of the sequence index file using the "index" option
        options = {
            'index': 0,
        }
        # Just load the sequence up from the file
        dbi = DbInput.from_file(DB_SEQUENCES_FILE, options)
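A natural follow-up assertion (not in the original snippet, and assuming
DbInput exposes the inputs attribute used in the _sequence_train example
below) is that the from_file and from_sequence routes agree:

        # Hypothetical check: loading by {'index': 0} and building from the
        # first sequence of the index should give equivalent inputs.
        index = SequenceIndex.from_file(DB_SEQUENCES_FILE)
        dbi2 = DbInput.from_sequence(index.sequences[0])
        self.assertEqual(dbi.inputs, dbi2.inputs)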
Example no. 5
    def _parse_seq(seq):
        # Parse the annotations to get a semantics
        try:
            gold_parses = parse_sequence_with_annotations(
                                DbInput.from_sequence(seq),
                                grammar=grammar,
                                allow_subparses=False)
            # Got a result: return its semantics
            return gold_parses[0].semantics
        except ParseError, err:
            # Could not parse annotated sequence
            print >>sys.stderr, "Could not parse sequence '%s': %s" % \
                                (seq.string_name, err)
            return
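A sketch of how such a helper might be driven over a whole sequence index
(hypothetical code; the grammar and settings setup are assumed to match
the other examples):

    # Hypothetical driver: collect the gold semantics of every sequence
    # that parses, skipping those for which _parse_seq returned None.
    seqs = SequenceIndex.from_file(settings.SEQUENCE_DATA)
    gold_semantics = [sem for sem in (_parse_seq(seq) for seq in seqs.sequences)
                      if sem is not None]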
Example no. 6
    def _sequence_train(self, sequence):
        """
        Adds counts to the model for a single chord sequence.

        """
        # Prepare the input and annotations
        input = DbInput.from_sequence(sequence)
        categories = [chord.category for chord in sequence.iterator()]
        str_inputs = input.inputs
        # Build the implicit normal-form tree from the annotations
        try:
            tree = build_tree_for_sequence(sequence)
        except TreeBuildError, err:
            raise ModelTrainingError, "could not build a tree for '%s': %s" % \
                (sequence.string_name, err)
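A sketch of the training loop a method like this implies (hypothetical:
it assumes a model object exposing _sequence_train and a loaded
SequenceIndex):

    # Hypothetical usage: accumulate counts from every training sequence.
    index = SequenceIndex.from_file(DB_SEQUENCES_FILE)
    for sequence in index.sequences:
        model._sequence_train(sequence)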
Example no. 7
def main():    

    usage = "%prog [options] <results-files>"
    description = "Prints a dependency tree for a parse result"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-t", "--times", dest="times", action="store_true", help="show timings of nodes")
    parser.add_option("-l", "--latex", dest="latex", action="store_true", help="output Latex for the graphs using tikz-dependency")
    parser.add_option("--la", "--latex-align", dest="latex_align", action="store_true", help="show node alignments in Latex output")
    parser.add_option("--align-time", dest="align_time", action="store_true", help="show the graph of common dependencies when the two graphs are aligned by node times")
    parser.add_option("--align-max", dest="align_max", action="store_true", help="show the graph of common dependencies when the two graphs are aligned to maximize the dependency recovery")
    options, arguments = parser.parse_args()

    if len(arguments) == 0:
        print >>sys.stderr, "Specify a file to read the results from"
        sys.exit(1)
    filename = arguments[0]

    # Switch between PCCG and St+PCCG
    PARSER = "PCCG"
    FEATURE_PARAMS = "../xuanhong/params_2_pcfg.txt"
    if filename.find("stpcfg") != -1:
        PARSER = "St+PCCG"
        FEATURE_PARAMS = "../xuanhong/params_2_stpcfg.txt"        


    # Input sequence
    list_songs = read_list_songs("../xuanhong/list_songs.txt")
    song_name = os.path.basename(filename)
    seqs = SequenceIndex.from_file(settings.SEQUENCE_DATA)
    seq = seqs.sequences[list_songs[song_name]]
    input_sequence = DbInput.from_sequence(seq)
    
    try:
        pres = ParseResults.from_file(filename)
    except ParseResults.LoadError, err:
        print >>sys.stderr, "Error loading file: %s" % (err)
        sys.exit(1)
Example no. 8
def main():
    def _check_args(args):
        if len(args) != 3:
            print >>sys.stderr, "Specify a tagger, model name and input file"
            sys.exit(1)
        return args[1], args[2]

    partitions, part_ids, options, arguments = prepare_evaluation_options(
        usage = "%prog [options] <tagger> <model-name> <input-file>",
        description = "Evaluate a tagging model by "\
            "tagging sequences from an input file. If the tagger doesn't "\
            "need a model name, use '-' as the model name.",
        check_args = _check_args,
        optparse_groups = [
            (("Tagging",),
                [(("--topt", "--tagger-options"), 
                    {'dest':"topts", 'action':"append", 'help':"options to pass to the tagger."}),
                ]),
            (("Output",), 
                [(("--no-model-info",), 
                    {'dest':"no_model_info", 'action':"store_true", 'help':"turns of outputing of information about the model being used before using it (useful for identifying output piped to a file later, but may be too verbose sometimes)"}),
                ]),
            (("Evaluation", "Type of evaluation and options"),
                [(("-a", "--agreement"), 
                    {'dest':"agreement", 'action':"store_true", 'help':"instead of doing any parses, just report the agreement of the tops tags with the gold standard tags."}),
                 (("--confusion",), 
                    {'dest':"confusion", 'action':"store_true", 'help':"print out confusion matrix after agreement calculation. Applies only in combination with --agreement"}),
                 (("-e", "--entropy"), 
                    {'dest':"entropy", 'action':"store_true", 'help':"instead of doing any parses, just report the entropy of the returned tag distribution with respect to the gold standard tags."}),
                 (("--tag-stats",), 
                    {'dest':"tag_stats", 'action':"store_true", 'help':"just output stats about the tags that the model assigns to this sequence (or these sequences)"}),
                 (("--topn",), 
                    {'dest':"topn", 'type':"int", 'action':"store", 'help':"when evaluating agreement consider the top N tags the tagger returns. By default, allows only the top one to count as a hit.", 'default':1}),
                ]),
        ],
    )
    
    grammar = Grammar()
    
    tagger_name = arguments[0]
    model_name = arguments[1]
    # Tagger shouldn't use a model in some cases
    no_tagger_model = model_name == "-"
    
    # Load the requested tagger class
    tagger_cls = get_tagger(tagger_name)
    topts = ModuleOption.process_option_string(options.topts)
    
    def _model_info(mname):
        """ Outputs info about the named model """
        if options.no_model_info:
            print >>sys.stderr, "Model %s" % mname
        else:
            # Can only output the nice model info if it's a ModelTagger
            if issubclass(tagger_cls, ModelTagger):
                print >>sys.stderr, "======== Model info ========"
                print >>sys.stderr, tagger_cls.MODEL_CLASS.load_model(mname).description
                print >>sys.stderr, "============================"
            else:
                print >>sys.stderr, "Tagger %s using model %s" % (tagger_cls.__name__, mname)
    
    num_parts = len(partitions)
    num_seqs = sum([len(p[0]) for p in partitions])
    
    ################# Evaluation ########################
    if options.tag_stats:
        raise NotImplementedError, "fix this if you want it"
        # Print out statistics for each partition, with its model
        if no_tagger_model:
            # There could be some circumstance in which we want to do this, 
            #  but I can't think what it is, so I'm not implementing it for now
            print >>sys.stderr, "Cannot run tag_stats with no tagger model"
            sys.exit(1)
        all_stats = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            # Output the model training info if requested
            _model_info(model)
            ######## This doesn't exist any more
            stats = sequences_top_tags_dict(tagger_cls, model, sequences, topn=options.topn)
            for tag,num in stats.items():
                if tag in all_stats:
                    all_stats[tag] += stats[tag]
                else:
                    all_stats[tag] = stats[tag]
        pprint_table(sys.stdout,
                     list(reversed(sorted(all_stats.items(), key=lambda r: r[1]))),
                     separator="|")
    elif options.agreement:
        # Print out agreement stats for each partition
        if no_tagger_model:
            # Same as tag_stats: probably no need for this ever
            print >>sys.stderr, "Cannot run agreement with no tagger model"
            sys.exit(1)
        correct = 0
        total = 0
        conf_mat = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            topts['model'] = model
            # Output the model training info if requested
            _model_info(model)
            pcorrect = 0
            ptotal = 0
            # Go through each sequence
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                input = DbInput.from_sequence(seq)
                correct_tags = [chord.category for chord in seq.iterator()]
                cor, tot = tagger_agreement(input, grammar, tagger_cls,
                        correct_tags, options=topts,
                        confusion_matrix=conf_mat, topn=options.topn)
                pcorrect += cor
                ptotal += tot
                print "  Sequence: %.1f%%" % (float(cor)/tot*100)
                print "  So far: %.1f%%" % (float(pcorrect)/ptotal*100)
            print "Partition %d: %d / %d (%.2f%%)" % (part_num, pcorrect, ptotal, (float(pcorrect)/ptotal*100))
            correct += pcorrect
            total += ptotal
        if num_parts > 1:
            # Print out the overall stats
            print "%d / %d (%f%%)" % (correct,total,(float(correct)/total*100))
        if options.confusion:
            confusion_matrix(conf_mat) 
    elif options.entropy:
        print "Calculating cross-entropy of tagger with gold standard tags"
        entropy = 0.0
        num_chords = 0
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            if not no_tagger_model:
                topts['model'] = model
                # Output the model training info if requested
                _model_info(model)
            pentropy = 0.0
            pnum_chords = 0
            # Compute the entropy for the partition model
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                input = " ".join([str(chord) for chord in seq.iterator()])
                correct_tags = [chord.category for chord in seq.iterator()]
                ent,crds = tagger_entropy(input, grammar, tagger_cls, correct_tags, options=topts)
                pentropy += ent
                pnum_chords += crds
                print "   %f bits per chord" % (ent/crds)
            print "Partition %d: %f bits per chord (%d chords)" % (part_num, (pentropy/pnum_chords), pnum_chords)
            entropy += pentropy
            num_chords += pnum_chords
        # Print out the stats for all partitions together
        if num_parts > 1:
            print "%f bits per chord (%d chords)" % ((entropy/num_chords), num_chords)
    else:
        print >>sys.stderr, "Select an evaluation operation with one of the options"
        sys.exit(1)