def main():
    """Run the parser over a grid of beam settings and record the scores.

    For every (threshold, maxarc) pair a config is built by prepending two
    ``DEF`` lines to BASE_CONFIG, the parser is run with the resulting
    arguments, and result_eval.py is run over the ``*.res`` files in the
    config's ``output`` directory. One row per setting is written to
    ``test_suite.csv`` in the current working directory.

    Returns 1 as soon as a parser run exits with a non-zero status;
    otherwise falls off the end (returns None).
    """
    usage = "%prog [options]"
    description = "Runs a suite of small parsing experiments to try different beam settings"
    opt_parser = OptionParser(usage=usage, description=description)
    # No options are declared; the call is kept so that --help still works.
    opt_parser.parse_args()

    # Every command path is derived from the directory two levels above the
    # current working directory.
    cmd_dir = os.path.abspath(os.path.join("..", ".."))
    parser_cmd = os.path.join(cmd_dir, "jazzparser")
    result_eval = os.path.join(cmd_dir, "analysis", "result_eval.py")
    result_eval_dir = os.path.join(cmd_dir, "analysis")

    # Try all combinations of threshold and maxarc settings
    settings = [(threshold, maxarc)
                for threshold in [0.5, 0.1, 0.01, 0.001]
                for maxarc in [5, 10, 15, 20]]
    # Don't try different thresholds for maxarc=1: they're all the same
    settings.append((0.1, 1))

    # Open a CSV file to write the results to
    with open("test_suite.csv", "w") as result_file:
        results = csv.writer(result_file)
        results.writerow(["Threshold", "Maxarc", "Dep rec"])

        for threshold, maxarc in settings:
            print("\n#####################################################")
            print("### Threshold %s, maxarc %d ###" % (threshold, maxarc))

            # Build a config file string for these settings
            config_header = "%%%% DEF threshold %s\n" % threshold
            config_header += "%%%% DEF maxarc %d\n" % maxarc
            conf = ConfigFile.from_string(config_header + BASE_CONFIG)

            # Run the parser
            retcode = call([parser_cmd] + conf.get_strings(),
                           cwd=cmd_dir, stderr=STDOUT)
            if retcode:
                print("Parse failed")
                # Don't bother continuing with the others
                return 1

            # Find out where the output was being put
            output_dir = dict(conf.options)['output']
            files = os.path.join(output_dir, "*.res")

            # Evaluate all the results files in that directory.
            # NOTE(review): "--mopt output=f" and "-m deprec" are each passed
            # as a single argv element and the "*.res" pattern is not expanded
            # by a shell; presumably result_eval.py copes with both - confirm.
            eval_proc = Popen(
                [result_eval, files, "--mopt output=f", "-m deprec", "-q"],
                cwd=result_eval_dir, stdout=PIPE)
            # communicate() reads stdout to EOF *and* waits for the child, so
            # no un-reaped process or open pipe is left behind per setting.
            eval_out, _ = eval_proc.communicate()
            f_score = eval_out.rstrip().rstrip("%")

            # Write the result out to the summary file
            results.writerow(["%s" % threshold, "%d" % maxarc, f_score])
            # Flush the file object so each result appears immediately
            result_file.flush()
def main():
    """Sweep beam settings for the parser and log one score per setting.

    Each (threshold, maxarc) combination is turned into a config (two
    ``DEF`` lines followed by BASE_CONFIG), the parser is invoked with that
    config's argument strings, and result_eval.py is then run on the
    ``*.res`` files found in the config's ``output`` directory. The text it
    prints, minus trailing whitespace and ``%``, is stored as the score in
    ``test_suite.csv`` (created in the current working directory).

    Returns 1 if any parser invocation exits non-zero, abandoning the
    remaining settings; otherwise returns None.
    """
    option_parser = OptionParser(
        usage="%prog [options]",
        description="Runs a suite of small parsing experiments to try different beam settings")
    # Nothing is read from the parsed values; parsing is kept for --help.
    option_parser.parse_args()

    # Locations of the tools, relative to the current working directory
    cmd_dir = os.path.abspath(os.path.join("..", ".."))
    result_eval_dir = os.path.join(cmd_dir, "analysis")
    jazzparser_cmd = os.path.join(cmd_dir, "jazzparser")
    result_eval = os.path.join(cmd_dir, "analysis", "result_eval.py")

    # Try all combinations of threshold and maxarc settings
    thresholds = [0.5, 0.1, 0.01, 0.001]
    maxarcs = [5, 10, 15, 20]
    settings = [(threshold, maxarc)
                for threshold in thresholds
                for maxarc in maxarcs]
    # Don't try different thresholds for maxarc=1: they're all the same
    settings.append((0.1, 1))

    # Open a CSV file to write the results to
    with open("test_suite.csv", "w") as result_file:
        results = csv.writer(result_file)
        results.writerow(["Threshold", "Maxarc", "Dep rec"])

        for threshold, maxarc in settings:
            print("\n#####################################################")
            print("### Threshold %s, maxarc %d ###" % (threshold, maxarc))

            # Build a config file string for these settings
            definitions = ("%%%% DEF threshold %s\n" % threshold
                           + "%%%% DEF maxarc %d\n" % maxarc)
            conf = ConfigFile.from_string(definitions + BASE_CONFIG)

            # Run the parser; a non-zero exit status aborts the whole suite
            if call([jazzparser_cmd] + conf.get_strings(),
                    cwd=cmd_dir, stderr=STDOUT):
                print("Parse failed")
                return 1

            # Find out where the output was being put
            output_dir = dict(conf.options)['output']
            files = os.path.join(output_dir, "*.res")

            # Evaluate all the results files in that directory.
            # NOTE(review): the glob is handed over unexpanded and the
            # option/value pairs are single argv elements - presumably
            # result_eval.py handles that itself; confirm.
            eval_proc = Popen(
                [result_eval, files, "--mopt output=f", "-m deprec", "-q"],
                cwd=result_eval_dir, stdout=PIPE)
            # Collect stdout and wait for the evaluator to exit, so the child
            # is reaped and its pipe closed before the next setting starts.
            eval_out = eval_proc.communicate()[0]
            f_score = eval_out.rstrip().rstrip("%")

            # Write the result out to the summary file and flush so that
            # each result appears immediately
            results.writerow(["%s" % threshold, "%d" % maxarc, f_score])
            result_file.flush()
def main():
    """Train a grid of n-gram models and evaluate every one of them.

    The grid is the product of model orders, cutoffs and smoothing methods,
    each of which can be narrowed from the command line. For every cell the
    model is trained with ./train.py (unless --no-train) and, unless
    --only-train, evaluated twice with ./tageval.py: once for entropy
    (skipped, and recorded as 0.0, under --viterbi) and once for agreement.
    Both commands run with the parent of the current directory as their
    working directory. One row per model is written to ``test_suite.csv``.

    The text returned by ``output_proc`` (defined elsewhere) is used as the
    child's output; the figures are parsed from its last line.
    """
    usage = "%prog [options]"
    description = "Trains a suite of ngram models and tests them all"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        '-n', '--no-train', dest="no_train", action="store_true",
        help="don't train the models. Only do this if you've previously used this script to train all the models")
    parser.add_option(
        '--train', '--only-train', dest="only_train", action="store_true",
        help="only train the models, don't do the experiments")
    parser.add_option(
        '--bt', '--bigram-trigram', dest="bigram_trigram",
        action="store_true",
        help="only include bigram and trigram models")
    parser.add_option(
        '-t', '--trigram', dest="trigram", action="store_true",
        help="only include trigram models")
    parser.add_option(
        '--wb', '--witten-bell', dest="witten_bell", action="store_true",
        help="only use witten-bell smoothing (skip laplace)")
    parser.add_option(
        '--lap', '--laplace', dest="laplace", action="store_true",
        help="only use laplace smoothing (skip witten-bell)")
    parser.add_option(
        '-v', '--viterbi', dest="viterbi", action="store_true",
        help="use Viterbi decoding")
    parser.add_option(
        '-4', '--4grams', dest="fourgrams", action="store_true",
        help="run experiments for 4-gram models")
    parser.add_option(
        '-c', '--cutoff', dest="cutoff", action="store", type="int",
        help="custom cutoff to use, instead of trying several")
    parser.add_option(
        '--gt', '--good-turing', dest="good_turing", action="store_true",
        help="only use Good-Turing smoothing (not usually included)")
    opts, _positional = parser.parse_args()

    cmd_dir = os.path.abspath("..")
    train_cmd = "./train.py"
    tageval_cmd = "./tageval.py"

    # Model orders: start from the default and let the first matching flag
    # (in this priority order) override it.
    orders = [1, 2, 3]
    if opts.bigram_trigram:
        orders = [2, 3]
    elif opts.trigram:
        orders = [3]
    elif opts.fourgrams:
        orders = [4]

    # Smoothing methods as (estimator name, short tag used in model names)
    smoothings = [("witten-bell", "wb"), ("laplace", "lap")]
    if opts.witten_bell:
        smoothings = [("witten-bell", "wb")]
    elif opts.laplace:
        smoothings = [("laplace", "lap")]
    elif opts.good_turing:
        smoothings = [("simple-good-turing", "gt")]

    cutoffs = [0, 2, 5] if opts.cutoff is None else [opts.cutoff]

    # Flatten the three nested sweeps into one list, keeping the same
    # iteration order: order outermost, smoothing innermost.
    grid = [(n, c, estimator, tag)
            for n in orders
            for c in cutoffs
            for estimator, tag in smoothings]

    # Open a CSV file to write the results to
    with open("test_suite.csv", "w") as result_file:
        results = csv.writer(result_file)
        results.writerow(
            ["Order", "Cutoff", "Smoothing", "Entropy", "Agreement"])

        for model_order, cutoff, smoothing, smoothing_short in grid:
            print("\n#####################################################")
            print("### Order %d, cutoff %d, smoothing %s ###"
                  % (model_order, cutoff, smoothing))
            # Build a unique name for the model
            model_name = "suite_n%d_c%d_%s" % (
                model_order, cutoff, smoothing_short)

            if not opts.no_train:
                # Prepare options to train the model; backoff is always one
                # less than the model order
                model_options = "n=%d:cutoff=%d:backoff=%d:estimator=%s" % (
                    model_order, cutoff, model_order - 1, smoothing)
                model_spec = "opts = %s\n%%%% ARG 1 %s" % (
                    model_options, model_name)
                training_opts = BASE_TRAINING_OPTIONS + model_spec
                # Turn these option specifications into command-line args
                train_conf = ConfigFile.from_string(training_opts)
                train_proc = Popen(
                    [train_cmd] + train_conf.get_strings(),
                    cwd=cmd_dir, stdout=PIPE, stderr=STDOUT)
                output_proc(train_proc)

            if opts.only_train:
                # Training only: no experiments, no CSV row
                continue

            if opts.viterbi:
                # Entropy doesn't tell us much for Viterbi decoding
                entropy = 0.0
            else:
                # Test the model's entropy
                print("### Entropy ###")
                entropy_opts = (
                    BASE_ENTROPY_OPTIONS + "%%%% ARG 1 %s" % model_name)
                entropy_conf = ConfigFile.from_string(entropy_opts)
                entropy_proc = Popen(
                    [tageval_cmd] + entropy_conf.get_strings(),
                    cwd=cmd_dir, stdout=PIPE, stderr=STDOUT)
                entropy_output = output_proc(entropy_proc)
                # The entropy value is the first token of the last line
                _, _, entropy_line = (
                    entropy_output.strip("\n").rpartition("\n"))
                entropy = float(entropy_line.split()[0])

            # Test the model's top tag accuracy
            print("\n### Agreement ###")
            accuracy_opts = (
                BASE_ACCURACY_OPTIONS + "%%%% ARG 1 %s" % model_name)
            if opts.viterbi:
                accuracy_opts += "\ntopt = decode=viterbi"
            accuracy_conf = ConfigFile.from_string(accuracy_opts)
            accuracy_proc = Popen(
                [tageval_cmd] + accuracy_conf.get_strings(),
                cwd=cmd_dir, stdout=PIPE, stderr=STDOUT)
            accuracy_output = output_proc(accuracy_proc)
            # The agreement value is the last token of the last line, with
            # any surrounding "(", ")" and "%" characters removed
            _, _, agreement_line = (
                accuracy_output.strip("\n").rpartition("\n"))
            agreement = float(agreement_line.split()[-1].strip("()%"))

            results.writerow([
                "%d" % model_order,
                "%d" % cutoff,
                "%s" % smoothing,
                "%f" % entropy,
                "%f" % agreement,
            ])
            # Flush the file object so each result appears in the file
            # immediately
            result_file.flush()