def prepare_db_input():
    """
    Load the test sequence index file and prepare its first sequence as
    input to the parser. Tests can call this to obtain ready-made
    example data.

    @note: More items may be appended to the returned tuple in future,
        so index into it for the values currently returned rather than
        unpacking a fixed length.

    @rtype: tuple
    @return: (sequence index, sequence, DbInput instance)
    """
    from jazzparser.data.db_mirrors import SequenceIndex
    from jazzparser.data.input import DbInput
    from jazzparser.settings import TEST as settings

    # Read the whole index, then take its first sequence as the example
    index = SequenceIndex.from_file(settings.SEQUENCE_DATA)
    first_seq = index.sequences[0]
    db_input = DbInput.from_sequence(first_seq)
    return index, first_seq, db_input
def test_detect_input_type(self):
    """
    detect_input_type should report the same datatype as the type-name
    lookup for each kind of input, and reject input whose type is not
    in the C{allowed} list.
    """
    # Load some input: DbInput
    dbi = DbInput.from_file(DB_SEQUENCES_FILE, {"index": 0})
    # Run it through the preprocessor
    datatype, obj = detect_input_type(dbi)
    # Get the datatype from the type name lists
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Do the same with ChordInput
    ci = ChordInput.from_file(CHORDS_FILE, options={"roman": True})
    datatype, obj = detect_input_type(ci)
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Try some bulk input
    bulk = DbBulkInput.from_file(DB_SEQUENCES_FILE)
    datatype, obj = detect_input_type(bulk, allow_bulk=True)
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Try restricting the allowed type
    datatype, obj = detect_input_type(ci, allowed=["chords"])
    # And this one should get rejected.
    # Bug fix: assertRaises takes *args/**kwargs unpacked; previously a
    # tuple and a dict were passed as two positional arguments, so
    # detect_input_type never received the "allowed" restriction at all.
    # "allowed" is given as a list, matching the call above.
    self.assertRaises(InputTypeError, detect_input_type, ci,
                      allowed=["db"])
def test_from_sequence(self):
    """Smoke test: a DbInput can be built from an indexed sequence."""
    # Read in the sequence index file
    seq_index = SequenceIndex.from_file(DB_SEQUENCES_FILE)
    # Take the first sequence it contains
    first = seq_index.sequences[0]
    # Constructing a DbInput from it should not raise
    db_input = DbInput.from_sequence(first)
def test_from_file(self):
    """Smoke test: DbInput.from_file loads the sequence picked by index."""
    # The "index" option selects which sequence to pull out of the file
    opts = {'index': 0}
    # Loading should not raise
    db_input = DbInput.from_file(DB_SEQUENCES_FILE, opts)
def test_detect_input_type(self):
    """
    detect_input_type should report the same datatype as the type-name
    lookup for each kind of input, and reject input whose type is not
    in the C{allowed} list.
    """
    # Load some input: DbInput
    dbi = DbInput.from_file(DB_SEQUENCES_FILE, {'index': 0})
    # Run it through the preprocessor
    datatype, obj = detect_input_type(dbi)
    # Get the datatype from the type name lists
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Do the same with ChordInput
    ci = ChordInput.from_file(CHORDS_FILE, options={'roman': True})
    datatype, obj = detect_input_type(ci)
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Try some bulk input
    bulk = DbBulkInput.from_file(DB_SEQUENCES_FILE)
    datatype, obj = detect_input_type(bulk, allow_bulk=True)
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Try restricting the allowed type
    datatype, obj = detect_input_type(ci, allowed=['chords'])
    # And this one should get rejected.
    # Bug fix: assertRaises takes *args/**kwargs unpacked; previously a
    # tuple and a dict were passed as two positional arguments, so
    # detect_input_type never received the "allowed" restriction at all.
    # 'allowed' is given as a list, matching the call above.
    self.assertRaises(InputTypeError, detect_input_type, ci,
                      allowed=['db'])
def _parse_seq(seq): # Parse the annotations to get a semantics try: gold_parses = parse_sequence_with_annotations( DbInput.from_sequence(seq), grammar=grammar, allow_subparses=False) # Got a result: return its semantics return gold_parses[0].semantics except ParseError, err: # Could not parse annotated sequence print >>sys.stderr, "Could not parse sequence '%s': %s" % \ (seq.string_name, err) return
def _sequence_train(self, sequence): """ Adds counts to the model for a single chord sequence. """ # Prepare the input and annotations input = DbInput.from_sequence(sequence) categories = [chord.category for chord in sequence.iterator()] str_inputs = input.inputs # Build the implicit normal-form tree from the annotations try: tree = build_tree_for_sequence(sequence) except TreeBuildError, err: raise ModelTrainingError, "could not build a tree for '%s': %s" % \ (sequence.string_name, err)
def main():
    """
    Command-line entry point: prints a dependency tree for a parse
    result read from the results file given as the first argument.
    """
    usage = "%prog [options] <results-files>"
    description = "Prints a dependency tree for a parse result"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-t", "--times", dest="times", action="store_true", help="show timings of nodes")
    parser.add_option("-l", "--latex", dest="latex", action="store_true", help="output Latex for the graphs using tikz-dependency")
    parser.add_option("--la", "--latex-align", dest="latex_align", action="store_true", help="show node alignments in Latex output")
    parser.add_option("--align-time", dest="align_time", action="store_true", help="show the graph of common dependencies when the two graphs are aligned by node times")
    parser.add_option("--align-max", dest="align_max", action="store_true", help="show the graph of common dependencies when the two graphs are aligned to maximize the dependency recovery")
    options, arguments = parser.parse_args()

    # A results file is required as the first positional argument
    if len(arguments) == 0:
        print >>sys.stderr, "Specify a file to read the results from"
        sys.exit(1)
    filename = arguments[0]

    # Select parser variant/params from the filename: "stpcfg" in the
    # name means the St+PCCG variant, anything else the plain PCCG
    PARSER = "PCCG"
    FEATURE_PARAMS = "../xuanhong/params_2_pcfg.txt"
    if filename.find("stpcfg") != -1:
        PARSER = "St+PCCG"
        FEATURE_PARAMS = "../xuanhong/params_2_stpcfg.txt"

    # Look up this song's position in the sequence index by its filename
    list_songs = read_list_songs("../xuanhong/list_songs.txt")
    song_name = os.path.basename(filename)
    seqs = SequenceIndex.from_file(settings.SEQUENCE_DATA)
    seq = seqs.sequences[list_songs[song_name]]
    input_sequence = DbInput.from_sequence(seq)

    # NOTE(review): pres, PARSER, FEATURE_PARAMS and input_sequence are
    # unused in the visible span — the function presumably continues
    # beyond this excerpt; confirm against the full file.
    try:
        pres = ParseResults.from_file(filename)
    except ParseResults.LoadError, err:
        print >>sys.stderr, "Error loading file: %s" % (err)
        sys.exit(1)
def main():
    """
    Command-line entry point: evaluates a tagging model on sequences
    from an input file, reporting agreement with gold-standard tags,
    cross-entropy, or (unimplemented) tag statistics, per partition.
    """
    def _check_args(args):
        # Require exactly <tagger> <model-name> <input-file>
        if len(args) != 3:
            print >>sys.stderr, "Specify a tagger, model name and input file"
            sys.exit(1)
        return args[1],args[2]

    partitions,part_ids,options,arguments = prepare_evaluation_options(
        usage = "%prog [options] <tagger> <model-name> <input-file>",
        description = "Evaluate a tagging model by "\
            "tagging sequences from an input file. If the tagger doesn't "\
            "need a model name, use '-' as the model name.",
        check_args = _check_args,
        optparse_groups = [
            (("Tagging",), [
                (("--topt", "--tagger-options"), {'dest':"topts", 'action':"append", 'help':"options to pass to the tagger."}),
            ]),
            (("Output",), [
                (("--no-model-info",), {'dest':"no_model_info", 'action':"store_true", 'help':"turns of outputing of information about the model being used before using it (useful for identifying output piped to a file later, but may be too verbose sometimes)"}),
            ]),
            (("Evaluation", "Type of evaluation and options"), [
                (("-a", "--agreement"), {'dest':"agreement", 'action':"store_true", 'help':"instead of doing any parses, just report the agreement of the tops tags with the gold standard tags."}),
                (("--confusion",), {'dest':"confusion", 'action':"store_true", 'help':"print out confusion matrix after agreement calculation. Applies only in combination with --agreement"}),
                (("-e", "--entropy"), {'dest':"entropy", 'action':"store_true", 'help':"instead of doing any parses, just report the entropy of the returned tag distribution with respect to the gold standard tags."}),
                (("--tag-stats",), {'dest':"tag_stats", 'action':"store_true", 'help':"just output stats about the tags that the model assigns to this sequence (or these sequences)"}),
                (("--topn",), {'dest':"topn", 'type':"int", 'action':"store", 'help':"when evaluating agreement consider the top N tags the tagger returns. By default, allows only the top one to count as a hit.", 'default':1}),
            ]),
        ],
    )
    grammar = Grammar()

    tagger_name = arguments[0]
    model_name = arguments[1]
    # Tagger shouldn't use a model in some cases ("-" means no model)
    no_tagger_model = model_name == "-"

    # Load the requested tagger class
    tagger_cls = get_tagger(tagger_name)
    topts = ModuleOption.process_option_string(options.topts)

    def _model_info(mname):
        """ Outputs info about the named model """
        if options.no_model_info:
            print >>sys.stderr, "Model %s" % mname
        else:
            # Can only output the nice model info if it's a ModelTagger
            if issubclass(tagger_cls, ModelTagger):
                print >>sys.stderr, "======== Model info ========"
                print >>sys.stderr, tagger_cls.MODEL_CLASS.load_model(mname).description
                print >>sys.stderr, "============================"
            else:
                print >>sys.stderr, "Tagger %s using model %s" % (tagger_cls.__name__, mname)

    num_parts = len(partitions)
    num_seqs = sum([len(p[0]) for p in partitions])

    ################# Evaluation ########################
    if options.tag_stats:
        # NOTE(review): everything after this raise is unreachable dead
        # code, deliberately disabled (see comment below about
        # sequences_top_tags_dict no longer existing)
        raise NotImplementedError, "fix this if you want it"
        # Print out statistics for each partition, with its model
        if no_tagger_model:
            # There could be some circumstance in which we want to do this,
            # but I can't think what it is, so I'm not implementing it for now
            print >>sys.stderr, "Cannot run tag_stats with no tagger model"
            sys.exit(1)
        all_stats = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            # Output the model training info if requested
            _model_info(model)
            ######## This doesn't exist any more
            stats = sequences_top_tags_dict(tagger_cls, model, sequences, topn=options.topn)
            # Accumulate per-partition tag counts into the overall totals
            for tag,num in stats.items():
                if tag in all_stats:
                    all_stats[tag] += stats[tag]
                else:
                    all_stats[tag] = stats[tag]
        pprint_table(sys.stdout, list(reversed(sorted(all_stats.items(), key=lambda r:r[1]))), separator="|")
    elif options.agreement:
        # Print out agreement stats for each partition
        if no_tagger_model:
            # Same a tag_stats: probably no need for this ever
            print >>sys.stderr, "Cannot run agreement with no tagger model"
            sys.exit(1)
        correct = 0
        total = 0
        conf_mat = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            topts['model'] = model
            # Output the model training info if requested
            _model_info(model)
            pcorrect = 0
            ptotal = 0
            # Go through each sequence
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                input = DbInput.from_sequence(seq)
                correct_tags = [chord.category for chord in seq.iterator()]
                # Compare the tagger's top-N tags against the gold tags
                cor,tot = tagger_agreement(input, grammar, tagger_cls, correct_tags, options=topts, confusion_matrix=conf_mat, topn=options.topn)
                pcorrect += cor
                ptotal += tot
                print " Sequence: %.1f%%" % (float(cor)/tot*100)
                print " So far: %.1f%%" % (float(pcorrect)/ptotal*100)
            print "Partition %d: %d / %d (%.2f%%)" % (part_num, pcorrect, ptotal, (float(pcorrect)/ptotal*100))
            correct += pcorrect
            total += ptotal
        if num_parts > 1:
            # Print out the overall stats
            print "%d / %d (%f%%)" % (correct,total,(float(correct)/total*100))
        if options.confusion:
            confusion_matrix(conf_mat)
    elif options.entropy:
        print "Calculating cross-entropy of tagger with gold standard tags"
        entropy = 0.0
        num_chords = 0
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            # Entropy evaluation can run without a model; only set one
            # on the tagger options when we have it
            if not no_tagger_model:
                topts['model'] = model
                # Output the model training info if requested
                _model_info(model)
            pentropy = 0.0
            pnum_chords = 0
            # Compute the entropy for the partition model
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                # The tagger gets the chords as a plain string here, not
                # a DbInput as in the agreement branch
                input = " ".join([str(chord) for chord in seq.iterator()])
                correct_tags = [chord.category for chord in seq.iterator()]
                ent,crds = tagger_entropy(input, grammar, tagger_cls, correct_tags, options=topts)
                pentropy += ent
                pnum_chords += crds
                print " %f bits per chord" % (ent/crds)
            print "Partition %d: %f bits per chord (%d chords)" % (part_num, (pentropy/pnum_chords), pnum_chords)
            entropy += pentropy
            num_chords += pnum_chords
        # Print out the stats for all partitions together
        if num_parts > 1:
            print "%f bits per chord (%d chords)" % ((entropy/num_chords), num_chords)
    else:
        print >>sys.stderr, "Select an evaluation operation with one of the options"
        sys.exit(1)
def test_from_file(self):
    """Smoke test: DbInput.from_file loads the sequence picked by index."""
    # Select a sequence out of the sequence index file using the
    # "index" option; loading should not raise
    dbi = DbInput.from_file(DB_SEQUENCES_FILE, {"index": 0})