def predict_rst_trees_for_eval(syntax_parser, segmenter, rst_parser, eval_data,
                               use_gold_syntax=True):
    '''
    Predict an RST tree for every document in `eval_data` and collect the
    predictions next to the corresponding gold standard annotations.

    `eval_data` is an iterable of document dictionaries.  Each one is read
    for the keys 'path_basename', 'edu_start_indices', 'tokens' and
    'rst_tree' (plus 'edu_strings' when `use_gold_syntax` is false).

    NOTE: the document dictionaries are modified in place.  When
    `use_gold_syntax` is false, 'raw_text' is set and the 'syntax_trees',
    'token_tree_positions', 'tokens' and 'pos_tags' entries are deleted.
    When `segmenter` is not None, 'edu_start_indices' is deleted.  The
    dictionaries are also handed to `segment_and_parse`.

    Returns a 4-tuple of parallel lists, one entry per document:
    (predicted EDU tokens, predicted trees, gold EDU tokens, gold trees).
    '''
    # Entries removed when the evaluation uses automatic preprocessing.
    gold_syntax_keys = ('syntax_trees', 'token_tree_positions', 'tokens',
                        'pos_tags')

    predicted_edus, predicted_trees = [], []
    reference_edus, reference_trees = [], []

    for doc in eval_data:
        logging.info('processing {}...'.format(doc['path_basename']))

        # The gold EDUs must be extracted before any gold entries are
        # deleted from the document below.
        reference_edus.append(
            extract_edus_tokens(doc['edu_start_indices'], doc['tokens']))

        # Collapse the RST labels to use the coarse relations that the
        # parser produces.
        reference_tree = ParentedTree.fromstring(doc['rst_tree'])
        collapse_rst_labels(reference_tree)
        reference_trees.append(reference_tree)

        # TODO when not using gold syntax, should the script still use gold
        # standard tokens?

        # Remove gold standard trees or EDU boundaries if evaluating using
        # automatic preprocessing.
        if not use_gold_syntax:
            # TODO will merging the EDU strings here to make the raw_text
            # variable produce the appropriate eval result when not using
            # gold standard trees?
            doc['raw_text'] = ' '.join(doc['edu_strings'])
            for key in gold_syntax_keys:
                del doc[key]

        if segmenter is not None:
            del doc['edu_start_indices']

        # Predict the RST tree; only the first result yielded is kept.
        edu_tokens, tree_results = segment_and_parse(
            doc, syntax_parser, segmenter, rst_parser)
        first_result = next(tree_results)
        predicted_trees.append(first_result['tree'])
        predicted_edus.append(edu_tokens)

    return (predicted_edus, predicted_trees, reference_edus, reference_trees)
def predict_rst_trees_for_eval(syntax_parser, segmenter, rst_parser, eval_data,
                               use_gold_syntax=True):
    '''
    Predict an RST tree for every document in `eval_data` and collect the
    predictions next to the corresponding gold standard annotations.

    `eval_data` is an iterable of document dictionaries.  Each one is read
    for the keys 'path_basename', 'edu_start_indices', 'tokens' and
    'rst_tree' (plus 'edu_strings' when `use_gold_syntax` is false).

    NOTE: the document dictionaries are modified in place.  When
    `use_gold_syntax` is false, 'raw_text' is set and the 'syntax_trees',
    'token_tree_positions', 'tokens' and 'pos_tags' entries are deleted.
    When `segmenter` is not None, 'edu_start_indices' is deleted.  The
    dictionaries are also handed to `segment_and_parse`.

    Returns a 4-tuple of parallel lists, one entry per document:
    (predicted EDU tokens, predicted trees, gold EDU tokens, gold trees).
    '''
    # Entries removed when the evaluation uses automatic preprocessing.
    gold_syntax_keys = ('syntax_trees', 'token_tree_positions', 'tokens',
                        'pos_tags')

    predicted_edus, predicted_trees = [], []
    reference_edus, reference_trees = [], []

    for doc in eval_data:
        logging.info('processing {}...'.format(doc['path_basename']))

        # The gold EDUs must be extracted before any gold entries are
        # deleted from the document below.
        reference_edus.append(
            extract_edus_tokens(doc['edu_start_indices'], doc['tokens']))

        # Collapse the RST labels to use the coarse relations that the
        # parser produces.
        reference_tree = ParentedTree.fromstring(doc['rst_tree'])
        collapse_rst_labels(reference_tree)
        reference_trees.append(reference_tree)

        # TODO when not using gold syntax, should the script still use gold
        # standard tokens?

        # Remove gold standard trees or EDU boundaries if evaluating using
        # automatic preprocessing.
        if not use_gold_syntax:
            # TODO will merging the EDU strings here to make the raw_text
            # variable produce the appropriate eval result when not using
            # gold standard trees?
            doc['raw_text'] = ' '.join(doc['edu_strings'])
            for key in gold_syntax_keys:
                del doc[key]

        if segmenter is not None:
            del doc['edu_start_indices']

        # Predict the RST tree; only the first result yielded is kept.
        edu_tokens, tree_results = segment_and_parse(
            doc, syntax_parser, segmenter, rst_parser)
        first_result = next(tree_results)
        predicted_trees.append(first_result['tree'])
        predicted_edus.append(edu_tokens)

    return (predicted_edus, predicted_trees, reference_edus, reference_trees)
def batch_process(docs, output_path, zpar_model_directory, segmentation_model,
                  parsing_model):
    '''
    Segment and parse a batch of documents, writing one JSON object per
    document (one per line) to `output_path`.

    docs is a list or tuple of (doc_id, text) tuples.
    output_path is the file to write; it is opened in 'w' mode, so any
    existing content is overwritten.
    zpar_model_directory, segmentation_model and parsing_model are passed to
    SyntaxParserWrapper, Segmenter and Parser.load_model respectively.

    Each output object has the keys "doc_id", "edu_tokens" and
    "scored_rst_trees"; the latter is a list of {"score", "tree"} objects
    where "tree" is the tree formatted as a string.
    '''
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)

    parser = Parser(max_acts=1, max_states=1, n_best=1)
    parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))
            doc_dict = {"doc_id": doc_id, "raw_text": text}
            edu_tokens, complete_trees = \
                segment_and_parse(doc_dict, syntax_parser, segmenter, parser)

            # Use pformat() rather than pprint(): in NLTK 3, Tree.pprint()
            # prints the tree and returns None, which would serialize every
            # "tree" field as null.  pformat() returns the string.
            scored_rst_trees = [
                {"score": tree["score"],
                 "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
                for tree in complete_trees]

            print(json.dumps({"doc_id": doc_id,
                              "edu_tokens": edu_tokens,
                              "scored_rst_trees": scored_rst_trees}),
                  file=outfile)
def batch_process(docs, output_path, zpar_model_directory, segmentation_model,
                  parsing_model):
    '''
    Run segmentation and RST parsing over a batch of documents and write the
    results to `output_path`, one JSON object per line.

    docs is a list or tuple of (doc_id, text) tuples.
    output_path is opened in 'w' mode, so existing content is overwritten.
    zpar_model_directory, segmentation_model and parsing_model are passed to
    SyntaxParserWrapper, Segmenter and Parser.load_model respectively.

    Every JSON object carries the keys "doc_id", "edu_tokens" and
    "scored_rst_trees" (a list of {"score", "tree"} objects, with the tree
    formatted as a string).
    '''
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
    rst_parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))

            document = {"doc_id": doc_id, "raw_text": text}
            edu_tokens, complete_trees = segment_and_parse(
                document, syntax_parser, segmenter, rst_parser)

            # Convert each scored result into a JSON-serializable entry.
            scored_trees = []
            for result in complete_trees:
                formatted_tree = result["tree"].pformat(
                    margin=TREE_PRINT_MARGIN)
                scored_trees.append({"score": result["score"],
                                     "tree": formatted_tree})

            record = {"doc_id": doc_id,
                      "edu_tokens": edu_tokens,
                      "scored_rst_trees": scored_trees}
            print(json.dumps(record), file=outfile)