def train_and_eval_model(working_path, model_path, eval_data, C):
    parameter_settings = {'C': C}
    model_path = '{}.C{}'.format(model_path, C)

    logging.info('Training model with C = {}'.format(C))
    train_rst_parsing_model(working_path, model_path, parameter_settings)

    logging.info('Evaluating model with C = {}'.format(C))
    rst_parser = Parser(1, 1, 1)
    rst_parser.load_model(model_path)
    results = predict_and_evaluate_rst_trees(None, None,
                                             rst_parser, eval_data,
                                             use_gold_syntax=True)
    return results

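# Usage sketch: this helper is meant to be mapped over a grid of C values,
# mirroring the tuning script's main() below. The working directory, model
# prefix, and dev-set path here are hypothetical placeholders.
import json
from concurrent.futures import ProcessPoolExecutor
from functools import partial

with open('dev.json') as f:  # hypothetical tuning set
    eval_data = json.load(f)

C_values = [2.0 ** x for x in range(-4, 5)]
evaluate = partial(train_and_eval_model, 'working', 'rst_parsing_model',
                   eval_data)
with ProcessPoolExecutor(max_workers=len(C_values)) as executor:
    for C, results in zip(C_values, executor.map(evaluate, C_values)):
        print(C, results['labeled_f1'])
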
def batch_process(docs, output_path, zpar_model_directory,
                  segmentation_model, parsing_model):
    '''
    docs is a list or tuple of (doc_id, text) tuples.
    '''
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)

    parser = Parser(max_acts=1, max_states=1, n_best=1)
    parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))
            doc_dict = {"doc_id": doc_id, "raw_text": text}
            edu_tokens, complete_trees = \
                segment_and_parse(doc_dict, syntax_parser, segmenter, parser)
            print(json.dumps({"doc_id": doc_id,
                              "edu_tokens": edu_tokens,
                              "scored_rst_trees":
                                  [{"score": tree["score"],
                                    "tree": tree["tree"]
                                            .pformat(margin=TREE_PRINT_MARGIN)}
                                   for tree in complete_trees]}),
                  file=outfile)

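# Usage sketch for batch_process; the model paths are hypothetical
# placeholders for a trained segmentation model and a parsing model saved
# with the ".C<value>" suffix that train_and_eval_model above appends.
docs = [('doc1', 'This is one document.\n\nIt has two paragraphs.'),
        ('doc2', 'This is another document.')]
batch_process(docs, 'output.jsonlines',
              zpar_model_directory='zpar',
              segmentation_model='segmentation_model',
              parsing_model='rst_parsing_model.C1.0')
# Each line of output.jsonlines then holds one document's doc_id, EDU
# tokens, and serialized scored RST trees.
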
def test_reconstruct_training_examples():
    '''
    This test goes through the training data and makes sure that the actions
    extracted from the trees can be used to reconstruct those trees from a
    list of EDUs.
    '''
    train_path = 'rst_discourse_tb_edus_TRAINING_TRAIN.json'
    with open(train_path) as f:
        data = json.load(f)

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
    for doc_dict in data:
        tree_orig = ParentedTree.fromstring(doc_dict['rst_tree'])
        actions = extract_parse_actions(tree_orig)
        tree2 = next(rst_parser.parse(doc_dict, gold_actions=actions,
                                      make_features=False))['tree']
        logging.info('test_reconstruct_training_examples verified tree '
                     'for {}'.format(doc_dict['path_basename']))
        assert tree2 == tree_orig

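# The test above relies on each training record providing at least an
# 'rst_tree' string and a 'path_basename'. A sketch of that minimal shape;
# the basename is hypothetical and the toy tree's labels are illustrative,
# not the real RST-DT label set (real records also carry the token and
# syntax information the parser consumes).
doc_dict = {
    "path_basename": "wsj_0600.out",
    "rst_tree": "(ROOT (nucleus:span (text 0)) "
                "(satellite:elaboration (text 1)))"
}
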
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_paths', nargs='+',
                        help='A document to segment and parse. Paragraphs '
                             'should be separated by two or more newline '
                             'characters.')
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model.', required=True)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.', required=True)
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions to perform on each '
                             'state', type=int, default=1)
    parser.add_argument('-n', '--n_best', help='Number of parses to return',
                        type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for '
                             'best-first search', type=int, default=1)
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    parser.add_argument('-zm', '--zpar_model_directory', default=None)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every '
                             'additional time this flag is specified, '
                             'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # Convert the verbose flag to an actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from the built-in warnings module get formatted nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'), level=log_level)

    # Read the models.
    logging.info('Loading models')
    syntax_parser = \
        SyntaxParserWrapper(port=args.zpar_port, hostname=args.zpar_hostname,
                            zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)
    parser = Parser(max_acts=args.max_acts, max_states=args.max_states,
                    n_best=args.n_best)
    parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc = read_text_file(input_path)
        logging.debug('rst_parse input. doc_id = {}, text = {}'.format(
            input_path, doc))
        doc_dict = {"raw_text": doc, "doc_id": input_path}
        edu_tokens, complete_trees = segment_and_parse(doc_dict,
                                                       syntax_parser,
                                                       segmenter, parser)
        # Materialize the parses; a generator can only be consumed once.
        complete_trees = list(complete_trees)

        print(json.dumps({"edu_tokens": edu_tokens,
                          "scored_rst_trees":
                              [{"score": tree["score"],
                                "tree": tree["tree"]
                                        .pformat(margin=TREE_PRINT_MARGIN)}
                               for tree in complete_trees]}))

        for i, tree in enumerate(complete_trees, 1):
            ptree_str = repr(tree['tree']) + '\n'
            with codecs.open('{}_{}.parentedtree'.format(input_path, i),
                             'w', 'utf-8') as ptree_file:
                ptree_file.write(ptree_str)

def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set',
                        help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model. If not specified, '
                             'then gold EDUs will be used.',
                        default=None)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.', required=True)
    parser.add_argument('-z', '--zpar_directory', default='zpar')
    parser.add_argument('-t', '--use_gold_syntax',
                        help='If specified, then gold PTB syntax trees will '
                             'be used.',
                        action='store_true')
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions to perform on each '
                             'state', type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for '
                             'best-first search', type=int, default=1)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every '
                             'additional time this flag is specified, '
                             'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()
    assert args.use_gold_syntax or args.segmentation_model

    # Convert the verbose flag to an actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from the built-in warnings module get formatted nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'), level=log_level)
    logger = logging.getLogger(__name__)

    # Read the models.
    logger.info('Loading models')
    # TODO: add port, hostname, and model directory arguments.
    syntax_parser = SyntaxParserWrapper() if not args.use_gold_syntax else None
    segmenter = Segmenter(args.segmentation_model) \
        if args.segmentation_model else None
    rst_parser = Parser(max_acts=args.max_acts, max_states=args.max_states,
                        n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    results = \
        predict_and_evaluate_rst_trees(syntax_parser, segmenter, rst_parser,
                                       eval_data,
                                       use_gold_syntax=args.use_gold_syntax)
    print(json.dumps(sorted(results.items())))

def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('train_file', help='Path to JSON training file.',
                        type=argparse.FileType('r'))
    parser.add_argument('eval_file',
                        help='Path to JSON dev or test file for '
                             'tuning/evaluation.',
                        type=argparse.FileType('r'))
    parser.add_argument('model_path',
                        help='Prefix for the path to where the model should '
                             'be stored. A suffix with the C value will be '
                             'added.')
    parser.add_argument('-w', '--working_path',
                        help='Path to where intermediate files should be '
                             'stored', default='working')
    parser.add_argument('-C', '--C_values',
                        help='Comma-separated list of model complexity '
                             'parameter settings to evaluate.',
                        default=','.join([str(2.0 ** x)
                                          for x in range(-4, 5)]))
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every '
                             'additional time this flag is specified, '
                             'output gets more verbose.',
                        default=0, action='count')
    parser.add_argument('-s', '--single_process', action='store_true',
                        help='Run in a single process for all hyperparameter '
                             'grid points, to simplify debugging.')
    args = parser.parse_args()

    if os.path.exists(args.working_path):
        raise IOError("{} already exists. Stopping here to avoid the "
                      "possibility of overwriting files that are currently "
                      "being used.".format(args.working_path))
    os.makedirs(args.working_path)

    parser = Parser(1, 1, 1)

    # Convert the verbose flag to an actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from the built-in warnings module get formatted nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'), level=log_level)
    logger = logging.getLogger(__name__)

    logger.info('Extracting examples')
    train_data = json.load(args.train_file)
    eval_data = json.load(args.eval_file)

    train_examples = []
    for doc_dict in train_data:
        path_basename = doc_dict['path_basename']
        logging.info('Extracting examples for {}'.format(path_basename))
        tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        collapse_rst_labels(tree)
        actions = extract_parse_actions(tree)

        for i, (action_str, feats) in \
                enumerate(parser.parse(doc_dict, gold_actions=actions)):
            example_id = "{}_{}".format(path_basename, i)
            example = {"x": Counter(feats), "y": action_str,
                       "id": example_id}
            train_examples.append(example)
            # print("{} {}".format(action_str, " ".join(feats)))

    # Train and evaluate a model for each value of C.
    best_labeled_f1 = -1.0
    best_C = None

    # Train and evaluate models with different C values in parallel.
    C_values = [float(x) for x in args.C_values.split(',')]
    partial_train_and_eval_model = partial(train_and_eval_model,
                                           args.working_path,
                                           args.model_path, eval_data)

    # Make the SKLL jsonlines feature file.
    train_path = os.path.join(args.working_path, 'rst_parsing.jsonlines')
    with open(train_path, 'w') as train_file:
        for example in train_examples:
            train_file.write('{}\n'.format(json.dumps(example)))

    if args.single_process:
        all_results = [partial_train_and_eval_model(C_value)
                       for C_value in C_values]
    else:
        n_workers = len(C_values)
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            all_results = executor.map(partial_train_and_eval_model,
                                       C_values)

    for C_value, results in zip(C_values, all_results):
        results["C"] = C_value
        print(json.dumps(sorted(results.items())))
        if results["labeled_f1"] > best_labeled_f1:
            best_labeled_f1 = results["labeled_f1"]
            best_C = C_value

    print("best labeled F1 = {}, with C = {}".format(best_labeled_f1,
                                                     best_C))

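# For reference, each line of the SKLL rst_parsing.jsonlines file written
# above is one shift-reduce decision: a feature Counter under "x", the gold
# action under "y", and a per-document example id. The feature names and
# action label below are made up for illustration.
{"x": {"some_feature": 1, "another_feature": 2},
 "y": "shift",
 "id": "wsj_0600.out_0"}
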
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set',
                        help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.', required=True)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every '
                             'additional time this flag is specified, '
                             'output gets more verbose.',
                        default=0, action='count')
    parser.add_argument('--metric_name', help='Name of the metric to use',
                        choices=["labeled_precision", "labeled_recall",
                                 "labeled_f1", "nuc_precision", "nuc_recall",
                                 "nuc_f1", "span_precision", "span_recall",
                                 "span_f1"],
                        required=True)
    parser.add_argument('--n_samples', type=int, default=10000)
    parser.add_argument('--alpha', type=float, default=0.05)
    args = parser.parse_args()

    # Convert the verbose flag to an actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from the built-in warnings module get formatted nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'), level=log_level)
    logger = logging.getLogger(__name__)

    # Read the models.
    logger.info('Loading models')
    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    pred_edu_tokens_lists, pred_trees, gold_edu_tokens_lists, gold_trees = \
        predict_rst_trees_for_eval(None, None, rst_parser, eval_data)
    data = np.array(list(zip(pred_edu_tokens_lists, pred_trees,
                             gold_edu_tokens_lists, gold_trees)))

    # Compute the score without bootstrapping.
    orig_score = compute_rst_eval_results(pred_edu_tokens_lists,
                                          pred_trees,
                                          gold_edu_tokens_lists,
                                          gold_trees)[args.metric_name]
    tmp_score = make_score_func(args.metric_name)(data)
    assert tmp_score == orig_score

    boot_ci_lower, boot_ci_upper = \
        boot.ci(data, make_score_func(args.metric_name),
                n_samples=args.n_samples, method='bca', alpha=args.alpha)

    print("evaluation_set: {}".format(args.evaluation_set.name))
    print("alpha: {}".format(args.alpha))
    print("n_samples: {}".format(args.n_samples))
    print("metric: {}".format(args.metric_name))
    print("original score: {}".format(orig_score))
    print("CI: ({}, {})".format(boot_ci_lower, boot_ci_upper))

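# make_score_func is defined elsewhere in the repo. Given how it is called
# above, a plausible sketch (an assumption, not the actual implementation)
# is a closure that unpacks the per-document rows and reuses
# compute_rst_eval_results, matching the statistic-function shape that
# scikits.bootstrap's boot.ci expects.
def make_score_func(metric_name):
    def score_func(data):
        # Each row of the (possibly resampled) array is one document's
        # (pred_edu_tokens, pred_tree, gold_edu_tokens, gold_tree) tuple.
        pred_edu_tokens, pred_trees, gold_edu_tokens, gold_trees = \
            (list(col) for col in zip(*data))
        return compute_rst_eval_results(pred_edu_tokens, pred_trees,
                                        gold_edu_tokens,
                                        gold_trees)[metric_name]
    return score_func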