Example #1
import logging

from nltk.tree import ParentedTree

# extract_edus_tokens, collapse_rst_labels and segment_and_parse are assumed
# to be importable from the surrounding discourse-parsing package.


def predict_rst_trees_for_eval(syntax_parser,
                               segmenter,
                               rst_parser,
                               eval_data,
                               use_gold_syntax=True):
    pred_edu_tokens_lists = []
    pred_trees = []
    gold_edu_tokens_lists = []
    gold_trees = []

    for doc_dict in eval_data:
        logging.info('processing {}...'.format(doc_dict['path_basename']))
        gold_edu_tokens_lists.append(
            extract_edus_tokens(doc_dict['edu_start_indices'],
                                doc_dict['tokens']))

        # Collapse the RST labels to use the coarse relations that the parser
        # produces.
        gold_tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        collapse_rst_labels(gold_tree)
        gold_trees.append(gold_tree)

        # TODO when not using gold syntax, should the script still use gold
        # standard tokens?

        # remove gold standard trees or EDU boundaries if evaluating
        # using automatic preprocessing
        if not use_gold_syntax:
            # TODO will merging the EDU strings here to make the raw_text
            # variable produce the appropriate eval result when not using gold
            # standard trees?
            doc_dict['raw_text'] = ' '.join(doc_dict['edu_strings'])
            del doc_dict['syntax_trees']
            del doc_dict['token_tree_positions']
            del doc_dict['tokens']
            del doc_dict['pos_tags']
        if segmenter is not None:
            del doc_dict['edu_start_indices']

        # predict the RST tree
        tokens, trees = segment_and_parse(doc_dict, syntax_parser, segmenter,
                                          rst_parser)
        pred_trees.append(next(trees)['tree'])
        pred_edu_tokens_lists.append(tokens)
    return (pred_edu_tokens_lists, pred_trees, gold_edu_tokens_lists,
            gold_trees)
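For reference, a call to this function might look roughly like the sketch below. It is only a sketch: the SyntaxParserWrapper, Segmenter and Parser constructors are the ones used in the batch_process example further down, the model paths are placeholder names, and eval_data is assumed to be a list of document dictionaries carrying the keys the function reads (path_basename, rst_tree, tokens, edu_start_indices, edu_strings, ...).

# Usage sketch; the model paths, the eval_docs.json file and its layout are
# assumptions, not part of the original example.
import json

syntax_parser = SyntaxParserWrapper('zpar_models/')      # placeholder path
segmenter = Segmenter('segmentation.model')              # placeholder path

rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
rst_parser.load_model('rst_parsing.model')               # placeholder path

with open('eval_docs.json') as infile:
    eval_data = json.load(infile)                        # list of doc_dicts

(pred_edu_tokens_lists, pred_trees,
 gold_edu_tokens_lists, gold_trees) = predict_rst_trees_for_eval(
    syntax_parser, segmenter, rst_parser, eval_data, use_gold_syntax=True)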
Example #4
import json
import logging

# SyntaxParserWrapper, Segmenter, Parser, segment_and_parse and
# TREE_PRINT_MARGIN are assumed to be importable from the surrounding
# discourse-parsing package.


def batch_process(docs, output_path, zpar_model_directory,
                  segmentation_model, parsing_model):
    '''
    docs is a list or tuple of (doc_id, text) tuples.
    '''
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)

    parser = Parser(max_acts=1, max_states=1, n_best=1)
    parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))
            doc_dict = {"doc_id": doc_id, "raw_text": text}
            edu_tokens, complete_trees = \
                segment_and_parse(doc_dict, syntax_parser, segmenter, parser)
            # Write one JSON record per document: its id, the EDU tokens and
            # the scored RST trees serialized as bracketed strings.
            print(json.dumps({"doc_id": doc_id,
                              "edu_tokens": edu_tokens,
                              "scored_rst_trees":
                                  [{"score": tree["score"],
                                    "tree": tree["tree"].pformat(
                                        margin=TREE_PRINT_MARGIN)}
                                   for tree in complete_trees]}),
                  file=outfile)
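A usage sketch for batch_process follows; the example documents, model paths and file names are placeholders. The read-back loop relies only on what the code above guarantees: each document is written as a single json.dumps line containing doc_id, edu_tokens and scored_rst_trees, with every tree serialized via pformat(), so it can be reconstructed with nltk's Tree.fromstring.

# Usage sketch; all paths and the example documents are hypothetical.
import json

from nltk.tree import Tree

docs = [('doc1', 'This is the first document. It has two sentences.'),
        ('doc2', 'A second, single-sentence document.')]

batch_process(docs,
              'rst_parses.jsonl',       # output: one JSON object per line
              'zpar_models/',           # ZPar model directory
              'segmentation.model',     # segmentation model
              'rst_parsing.model')      # RST parsing model

# Read the results back and rebuild the trees from their pformat() strings.
with open('rst_parses.jsonl') as infile:
    for line in infile:
        record = json.loads(line)
        scored = record['scored_rst_trees'][0]   # only one tree, since n_best=1
        tree = Tree.fromstring(scored['tree'])
        print(record['doc_id'], scored['score'], tree.label())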