import json
import logging
import os
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from functools import partial

from nltk.tree import ParentedTree

# Project-local helpers (Parser, collapse_rst_labels, extract_edus_tokens,
# extract_parse_actions, segment_and_parse, train_and_eval_model) are assumed
# to be importable from the surrounding package.


def predict_rst_trees_for_eval(syntax_parser, segmenter, rst_parser,
                               eval_data, use_gold_syntax=True):
    pred_edu_tokens_lists = []
    pred_trees = []
    gold_edu_tokens_lists = []
    gold_trees = []
    for doc_dict in eval_data:
        logging.info('processing {}...'.format(doc_dict['path_basename']))
        gold_edu_tokens_lists.append(
            extract_edus_tokens(doc_dict['edu_start_indices'],
                                doc_dict['tokens']))

        # Collapse the RST labels to use the coarse relations that the parser
        # produces.
        gold_tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        collapse_rst_labels(gold_tree)
        gold_trees.append(gold_tree)

        # TODO: when not using gold syntax, should the script still use gold
        # standard tokens?

        # Remove gold standard trees or EDU boundaries if evaluating
        # using automatic preprocessing.
        if not use_gold_syntax:
            # TODO: will merging the EDU strings here to make the raw_text
            # variable produce the appropriate eval result when not using
            # gold standard trees?
            doc_dict['raw_text'] = ' '.join(doc_dict['edu_strings'])
            del doc_dict['syntax_trees']
            del doc_dict['token_tree_positions']
            del doc_dict['tokens']
            del doc_dict['pos_tags']

        if segmenter is not None:
            del doc_dict['edu_start_indices']

        # Predict the RST tree.
        tokens, trees = segment_and_parse(doc_dict, syntax_parser, segmenter,
                                          rst_parser)
        pred_trees.append(next(trees)['tree'])
        pred_edu_tokens_lists.append(tokens)

    return (pred_edu_tokens_lists, pred_trees,
            gold_edu_tokens_lists, gold_trees)
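# Usage sketch for predict_rst_trees_for_eval (hypothetical: the model
# objects below come from whatever loading code the surrounding package
# provides, and 'dev.json' is a stand-in file name):
#
#     with open('dev.json') as eval_file:
#         eval_data = json.load(eval_file)
#     (pred_edu_tokens, pred_trees,
#      gold_edu_tokens, gold_trees) = predict_rst_trees_for_eval(
#         syntax_parser, segmenter, rst_parser, eval_data,
#         use_gold_syntax=True)
#     # pred_trees and gold_trees can then be compared with whatever
#     # labeled-F1 scoring routine the package uses.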
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('train_file',
                        help='Path to JSON training file.',
                        type=argparse.FileType('r'))
    parser.add_argument('eval_file',
                        help='Path to JSON dev or test file for '
                             'tuning/evaluation.',
                        type=argparse.FileType('r'))
    parser.add_argument('model_path',
                        help='Prefix for the path to where the model should '
                             'be stored. A suffix with the C value will be '
                             'added.')
    parser.add_argument('-w', '--working_path',
                        help='Path to where intermediate files should be '
                             'stored.',
                        default='working')
    parser.add_argument('-C', '--C_values',
                        help='Comma-separated list of model complexity '
                             'parameter settings to evaluate.',
                        default=','.join([str(2.0 ** x)
                                          for x in range(-4, 5)]))
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every '
                             'additional time this flag is specified, '
                             'output gets more verbose.',
                        default=0, action='count')
    parser.add_argument('-s', '--single_process', action='store_true',
                        help='Run in a single process for all hyperparameter '
                             'grid points, to simplify debugging.')
    args = parser.parse_args()

    if os.path.exists(args.working_path):
        raise IOError("{} already exists. Stopping here to avoid the "
                      "possibility of overwriting files that are currently "
                      "being used.".format(args.working_path))
    os.makedirs(args.working_path)

    # Shift-reduce RST parser used to generate gold-action training examples.
    rst_parser = Parser(1, 1, 1)

    # Convert the verbose flag count to an actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from the built-in warnings module get formatted nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'),
                        level=log_level)
    logger = logging.getLogger(__name__)

    logger.info('Extracting examples')
    train_data = json.load(args.train_file)
    eval_data = json.load(args.eval_file)

    # Extract one classification example per gold-standard parse action.
    train_examples = []
    for doc_dict in train_data:
        path_basename = doc_dict['path_basename']
        logger.info('Extracting examples for {}'.format(path_basename))
        tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        collapse_rst_labels(tree)
        actions = extract_parse_actions(tree)

        for i, (action_str, feats) in \
                enumerate(rst_parser.parse(doc_dict, gold_actions=actions)):
            example_id = "{}_{}".format(path_basename, i)
            example = {"x": Counter(feats),
                       "y": action_str,
                       "id": example_id}
            train_examples.append(example)

    # Track the best model over the grid of C values.
    best_labeled_f1 = -1.0
    best_C = None

    # Train and evaluate models with different C values in parallel.
    C_values = [float(x) for x in args.C_values.split(',')]
    partial_train_and_eval_model = partial(train_and_eval_model,
                                           args.working_path,
                                           args.model_path,
                                           eval_data)

    # Make the SKLL jsonlines feature file.
    train_path = os.path.join(args.working_path, 'rst_parsing.jsonlines')
    with open(train_path, 'w') as train_file:
        for example in train_examples:
            train_file.write('{}\n'.format(json.dumps(example)))

    if args.single_process:
        all_results = [partial_train_and_eval_model(C_value)
                       for C_value in C_values]
    else:
        n_workers = len(C_values)
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            all_results = executor.map(partial_train_and_eval_model,
                                       C_values)

    for C_value, results in zip(C_values, all_results):
        results["C"] = C_value
        print(json.dumps(sorted(results.items())))
        if results["labeled_f1"] > best_labeled_f1:
            best_labeled_f1 = results["labeled_f1"]
            best_C = C_value

    print("best labeled F1 = {}, with C = {}".format(best_labeled_f1,
                                                     best_C))
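# Entry point, so the tuning script can be run directly.
if __name__ == '__main__':
    main()

# Command-line usage sketch (file and script names are hypothetical; the
# flags match the argparse setup in main() above):
#
#     python tune_rst_parser.py train.json dev.json models/rst_parsing \
#         -w working -C 0.25,0.5,1.0,2.0 -v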