def train_and_eval_model(working_path, model_path, eval_data, C):
    """Train an RST parsing model for one regularization setting and evaluate it.

    Parameters
    ----------
    working_path : str
        Working directory containing the prepared training data.
    model_path : str
        Base path for the model; ".C{C}" is appended so each setting
        gets its own model file.
    eval_data : object
        Evaluation set, passed through to predict_and_evaluate_rst_trees.
    C : float
        Regularization constant for the learner.

    Returns
    -------
    Evaluation results from predict_and_evaluate_rst_trees.
    """
    parameter_settings = {'C': C}
    model_path = '{}.C{}'.format(model_path, C)

    logging.info('Training model with C = {}'.format(C))
    train_rst_parsing_model(working_path, model_path, parameter_settings)

    # Log evaluation only after training completes; the original emitted this
    # message before training even started, making the logs misleading.
    logging.info('Evaluating model with C = {}'.format(C))
    rst_parser = Parser(1, 1, 1)
    rst_parser.load_model(model_path)
    results = predict_and_evaluate_rst_trees(None, None, rst_parser,
                                             eval_data, use_gold_syntax=True)
    return results
def train_and_eval_model(working_path, model_path, eval_data, C):
    """Train an RST parsing model with regularization C, then evaluate it.

    Parameters
    ----------
    working_path : str
        Working directory containing the prepared training data.
    model_path : str
        Base model path; a ".C{C}" suffix is appended per setting.
    eval_data : object
        Evaluation set for predict_and_evaluate_rst_trees.
    C : float
        Regularization constant for the learner.

    Returns
    -------
    Evaluation results from predict_and_evaluate_rst_trees.
    """
    parameter_settings = {'C': C}
    model_path = '{}.C{}'.format(model_path, C)

    logging.info('Training model with C = {}'.format(C))
    train_rst_parsing_model(working_path, model_path, parameter_settings)

    # Emit the evaluation message after training finishes; previously it was
    # logged before train_rst_parsing_model ran, which misrepresented progress.
    logging.info('Evaluating model with C = {}'.format(C))
    rst_parser = Parser(1, 1, 1)
    rst_parser.load_model(model_path)
    results = predict_and_evaluate_rst_trees(None, None, rst_parser,
                                             eval_data, use_gold_syntax=True)
    return results
def batch_process(docs, output_path, zpar_model_directory, segmentation_model,
                  parsing_model):
    """Segment and parse a batch of documents, writing one JSON line per doc.

    Parameters
    ----------
    docs : list or tuple
        A sequence of (doc_id, text) tuples.
    output_path : str
        Path of the output file; each line is a JSON object with keys
        "doc_id", "edu_tokens", and "scored_rst_trees".
    zpar_model_directory : str
        Directory of the ZPar syntax parser models.
    segmentation_model : str
        Path to the EDU segmentation model.
    parsing_model : str
        Path to the RST parsing model.
    """
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)
    parser = Parser(max_acts=1, max_states=1, n_best=1)
    parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))
            doc_dict = {"doc_id": doc_id, "raw_text": text}
            edu_tokens, complete_trees = \
                segment_and_parse(doc_dict, syntax_parser, segmenter, parser)
            # Use Tree.pformat(), which returns the pretty-printed string;
            # Tree.pprint() prints to a stream and returns None in current
            # NLTK, which would serialize as null in the JSON output.
            print(json.dumps(
                {"doc_id": doc_id,
                 "edu_tokens": edu_tokens,
                 "scored_rst_trees":
                 [{"score": tree["score"],
                   "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
                  for tree in complete_trees]}),
                file=outfile)
def batch_process(docs, output_path, zpar_model_directory, segmentation_model,
                  parsing_model):
    '''
    docs is a list or tuple of (doc_id, text) tuples.
    '''
    # Load all models once up front; they are reused for every document.
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)
    parser = Parser(max_acts=1, max_states=1, n_best=1)
    parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))
            doc_dict = {"doc_id": doc_id, "raw_text": text}
            edu_tokens, complete_trees = segment_and_parse(
                doc_dict, syntax_parser, segmenter, parser)
            # Serialize each scored tree as its pretty-printed string form.
            scored = [{"score": scored_tree["score"],
                       "tree": scored_tree["tree"]
                       .pformat(margin=TREE_PRINT_MARGIN)}
                      for scored_tree in complete_trees]
            record = {"doc_id": doc_id,
                      "edu_tokens": edu_tokens,
                      "scored_rst_trees": scored}
            # One JSON object per line.
            print(json.dumps(record), file=outfile)
def main():
    """Command-line entry point: segment and RST-parse one or more text files,
    printing one JSON result object per input file to stdout."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_paths', nargs='+',
                        help='A document to segment and parse.' +
                        ' Paragraphs should be separated by two or more' +
                        ' newline characters.')
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model.',
                        required=True)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions for...?', type=int,
                        default=1)
    parser.add_argument('-n', '--n_best', help='Number of parses to return',
                        type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for ' +
                        'best-first search',
                        type=int, default=1)
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    parser.add_argument('-zm', '--zpar_model_directory', default=None)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # Convert verbose flag to actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)

    # Read the models.
    logging.info('Loading models')
    syntax_parser = \
        SyntaxParserWrapper(port=args.zpar_port, hostname=args.zpar_hostname,
                            zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)
    parser = Parser(max_acts=args.max_acts, max_states=args.max_states,
                    n_best=args.n_best)
    parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc = read_text_file(input_path)
        logging.debug('rst_parse input. doc_id = {}, text = {}'
                      .format(input_path, doc))
        doc_dict = {"raw_text": doc, "doc_id": input_path}
        edu_tokens, complete_trees = segment_and_parse(doc_dict, syntax_parser,
                                                       segmenter, parser)
        # Use Tree.pformat(), which returns the pretty-printed string; in
        # current NLTK, Tree.pprint() prints to a stream and returns None,
        # which would serialize as null in this JSON output.
        print(json.dumps({"edu_tokens": edu_tokens,
                          "scored_rst_trees":
                          [{"score": tree["score"],
                            "tree": tree["tree"]
                            .pformat(margin=TREE_PRINT_MARGIN)}
                           for tree in complete_trees]}))
def main():
    """Command-line entry point: segment and RST-parse one or more text files,
    printing JSON to stdout and also writing each ParentedTree repr to a
    sibling '<input>_<i>.parentedtree' file."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_paths', nargs='+',
                        help='A document to segment and parse.' +
                        ' Paragraphs should be separated by two or more' +
                        ' newline characters.')
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model.',
                        required=True)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions for...?', type=int,
                        default=1)
    parser.add_argument('-n', '--n_best', help='Number of parses to return',
                        type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for ' +
                        'best-first search',
                        type=int, default=1)
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    parser.add_argument('-zm', '--zpar_model_directory', default=None)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # Convert verbose flag to actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)

    # Read the models.
    logging.info('Loading models')
    syntax_parser = \
        SyntaxParserWrapper(port=args.zpar_port, hostname=args.zpar_hostname,
                            zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)
    parser = Parser(max_acts=args.max_acts, max_states=args.max_states,
                    n_best=args.n_best)
    parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc = read_text_file(input_path)
        logging.debug('rst_parse input. doc_id = {}, text = {}'.format(
            input_path, doc))
        doc_dict = {"raw_text": doc, "doc_id": input_path}
        edu_tokens, complete_trees = segment_and_parse(doc_dict, syntax_parser,
                                                       segmenter, parser)
        # Materialize the trees: they are consumed twice below and a
        # generator can't be iterated twice.
        complete_trees = list(complete_trees)

        print(json.dumps({"edu_tokens": edu_tokens,
                          "scored_rst_trees":
                          [{"score": tree["score"],
                            "tree": tree["tree"]
                            .pformat(margin=TREE_PRINT_MARGIN)}
                           for tree in complete_trees]}))

        # Also dump each ParentedTree's repr to its own file next to the
        # input (one file per n-best parse, 1-indexed).
        for i, tree in enumerate(complete_trees, 1):
            ptree_str = repr(tree['tree']) + '\n'
            with codecs.open(input_path + '_{}.parentedtree'.format(i),
                             'w', 'utf-8') as ptree_file:
                ptree_file.write(ptree_str)
def main():
    """Command-line entry point: evaluate an RST parsing model on a dev/test
    set and print the sorted evaluation results as JSON."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set', help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model. If not specified,' +
                        'then gold EDUs will be used.',
                        default=None)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-z', '--zpar_directory', default='zpar')
    parser.add_argument('-t', '--use_gold_syntax',
                        help='If specified, then gold PTB syntax trees will' +
                        'be used.',
                        action='store_true')
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions to perform on each ' +
                        'state', type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for ' +
                        'best-first search', type=int, default=1)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # Validate explicitly instead of with `assert`, which is silently
    # stripped when Python runs with -O.
    if not (args.use_gold_syntax or args.segmentation_model):
        parser.error('Either --use_gold_syntax or --segmentation_model '
                     'must be specified.')

    # Convert verbose flag to actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)
    logger = logging.getLogger(__name__)

    # Read the models.
    logger.info('Loading models')
    # TODO add port, host, model args
    syntax_parser = SyntaxParserWrapper() if not args.use_gold_syntax else None
    segmenter = Segmenter(args.segmentation_model) \
        if args.segmentation_model else None
    rst_parser = Parser(max_acts=args.max_acts, max_states=args.max_states,
                        n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    results = \
        predict_and_evaluate_rst_trees(syntax_parser, segmenter,
                                       rst_parser, eval_data,
                                       use_gold_syntax=args.use_gold_syntax)
    print(json.dumps(sorted(results.items())))
def main():
    """Command-line entry point: evaluate an RST parsing model on a dev/test
    set and print the sorted evaluation results as JSON."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set', help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model. If not specified,' +
                        'then gold EDUs will be used.',
                        default=None)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-z', '--zpar_directory', default='zpar')
    parser.add_argument('-t', '--use_gold_syntax',
                        help='If specified, then gold PTB syntax trees will' +
                        'be used.',
                        action='store_true')
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions to perform on each ' +
                        'state', type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for ' +
                        'best-first search', type=int, default=1)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # Report a proper usage error rather than relying on `assert`, which is
    # removed entirely under `python -O`.
    if not (args.use_gold_syntax or args.segmentation_model):
        parser.error('Either --use_gold_syntax or --segmentation_model '
                     'must be specified.')

    # Convert verbose flag to actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)
    logger = logging.getLogger(__name__)

    # Read the models.
    logger.info('Loading models')
    # TODO add port, host, model args
    syntax_parser = SyntaxParserWrapper() if not args.use_gold_syntax else None
    segmenter = Segmenter(args.segmentation_model) \
        if args.segmentation_model else None
    rst_parser = Parser(max_acts=args.max_acts, max_states=args.max_states,
                        n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    results = \
        predict_and_evaluate_rst_trees(syntax_parser, segmenter,
                                       rst_parser, eval_data,
                                       use_gold_syntax=args.use_gold_syntax)
    print(json.dumps(sorted(results.items())))
def main():
    """Command-line entry point: compute a bootstrap confidence interval
    (BCa) for one RST evaluation metric on a dev/test set."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set', help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    parser.add_argument('--metric_name', help='name of metric to use',
                        choices=["labeled_precision", "labeled_recall",
                                 "labeled_f1", "nuc_precision", "nuc_recall",
                                 "nuc_f1", "span_precision", "span_recall",
                                 "span_f1"],
                        required=True)
    parser.add_argument('--n_samples', type=int, default=10000)
    parser.add_argument('--alpha', type=float, default=0.05)
    args = parser.parse_args()

    # Convert verbose flag to actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)
    logger = logging.getLogger(__name__)

    # Read the models.
    logger.info('Loading models')
    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)
    pred_edu_tokens_lists, pred_trees, gold_edu_tokens_lists, gold_trees = \
        predict_rst_trees_for_eval(None, None, rst_parser, eval_data)

    # Pack per-document (pred tokens, pred tree, gold tokens, gold tree)
    # rows so the bootstrap can resample documents.
    data = np.array(list(zip(pred_edu_tokens_lists, pred_trees,
                             gold_edu_tokens_lists, gold_trees)))

    # Score without bootstrapping, and sanity-check that the score function
    # used by the bootstrap reproduces the direct evaluation.
    orig_score = compute_rst_eval_results(pred_edu_tokens_lists, pred_trees,
                                          gold_edu_tokens_lists,
                                          gold_trees)[args.metric_name]
    tmp_score = make_score_func(args.metric_name)(data)
    assert tmp_score == orig_score

    boot_ci_lower, boot_ci_upper = \
        boot.ci(data, make_score_func(args.metric_name),
                n_samples=args.n_samples, method='bca', alpha=args.alpha)

    # Print the input file's path; formatting the argparse FileType value
    # directly would print the repr of the open file object.
    print("evaluation_set: {}".format(args.evaluation_set.name))
    print("alpha: {}".format(args.alpha))
    print("n_samples: {}".format(args.n_samples))
    print("metric: {}".format(args.metric_name))
    print("original score: {}".format(orig_score))
    print("CI: ({}, {})".format(boot_ci_lower, boot_ci_upper))