コード例 #1
0
def train_and_eval_model(working_path, model_path, eval_data, C):
    """Train an RST parsing model for one regularization value and evaluate it.

    Parameters
    ----------
    working_path : str
        Directory containing the training data/features.
    model_path : str
        Base path for the model file; ".C{C}" is appended to it.
    eval_data : list
        Evaluation documents for ``predict_and_evaluate_rst_trees``.
    C : float
        Regularization parameter passed to the classifier.

    Returns
    -------
    dict
        Evaluation results from ``predict_and_evaluate_rst_trees``.
    """
    parameter_settings = {'C': C}
    model_path = '{}.C{}'.format(model_path, C)

    # Log messages now bracket the phase they describe: previously the
    # "Evaluating" message was emitted before training had even started.
    logging.info('Training model with C = {}'.format(C))
    train_rst_parsing_model(working_path, model_path, parameter_settings)

    logging.info('Evaluating model with C = {}'.format(C))
    rst_parser = Parser(1, 1, 1)
    rst_parser.load_model(model_path)
    results = predict_and_evaluate_rst_trees(None, None,
                                             rst_parser, eval_data,
                                             use_gold_syntax=True)
    return results
コード例 #2
0
def train_and_eval_model(working_path, model_path, eval_data, C):
    """Train an RST parsing model for one regularization value and evaluate it.

    Parameters
    ----------
    working_path : str
        Directory containing the training data/features.
    model_path : str
        Base path for the model file; ".C{C}" is appended to it.
    eval_data : list
        Evaluation documents for ``predict_and_evaluate_rst_trees``.
    C : float
        Regularization parameter passed to the classifier.

    Returns
    -------
    dict
        Evaluation results from ``predict_and_evaluate_rst_trees``.
    """
    parameter_settings = {'C': C}
    model_path = '{}.C{}'.format(model_path, C)

    # Log messages now bracket the phase they describe: previously the
    # "Evaluating" message was emitted before training had even started.
    logging.info('Training model with C = {}'.format(C))
    train_rst_parsing_model(working_path, model_path, parameter_settings)

    logging.info('Evaluating model with C = {}'.format(C))
    rst_parser = Parser(1, 1, 1)
    rst_parser.load_model(model_path)
    results = predict_and_evaluate_rst_trees(None,
                                             None,
                                             rst_parser,
                                             eval_data,
                                             use_gold_syntax=True)
    return results
コード例 #3
0
def batch_process(docs, output_path, zpar_model_directory,
                  segmentation_model, parsing_model):
    '''
    Segment and RST-parse a batch of documents, writing one JSON object per
    document (one per line) to output_path.

    docs is a list or tuple of (doc_id, text) tuples.
    '''
    # Models are loaded once and reused for the whole batch.
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)

    parser = Parser(max_acts=1, max_states=1, n_best=1)
    parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))
            doc_dict = {"doc_id": doc_id, "raw_text": text}
            edu_tokens, complete_trees = \
                segment_and_parse(doc_dict, syntax_parser, segmenter, parser)
            # Use pformat(), not pprint(): in NLTK 3, Tree.pprint() prints to
            # a stream and returns None, which would serialize as null here.
            print(json.dumps({"doc_id": doc_id,
                              "edu_tokens": edu_tokens,
                              "scored_rst_trees":
                              [{"score": tree["score"],
                                "tree": tree["tree"].pformat(
                                    margin=TREE_PRINT_MARGIN)}
                               for tree in complete_trees]}),
                  file=outfile)
コード例 #4
0
def batch_process(docs, output_path, zpar_model_directory,
                  segmentation_model, parsing_model):
    '''
    Segment and RST-parse a batch of documents, writing one JSON object per
    document (one per line) to output_path.

    docs is a list or tuple of (doc_id, text) tuples.
    '''
    # Load all three models once, up front, for the whole batch.
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)

    parser = Parser(max_acts=1, max_states=1, n_best=1)
    parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))
            doc_dict = {"doc_id": doc_id, "raw_text": text}
            edu_tokens, complete_trees = segment_and_parse(
                doc_dict, syntax_parser, segmenter, parser)
            scored_trees = [
                {"score": tree["score"],
                 "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
                for tree in complete_trees]
            record = {"doc_id": doc_id,
                      "edu_tokens": edu_tokens,
                      "scored_rst_trees": scored_trees}
            print(json.dumps(record), file=outfile)
コード例 #5
0
def main():
    """Command-line entry point: segment and RST-parse each input document,
    printing the scored trees for each file as one JSON object."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_paths',
                        nargs='+',
                        help='A document to segment and parse.' +
                        ' Paragraphs should be separated by two or more' +
                        ' newline characters.')
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model.',
                        required=True)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions for...?',
                        type=int, default=1)
    parser.add_argument('-n', '--n_best',
                        help='Number of parses to return', type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for \
                              best-first search',
                        type=int, default=1)
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    parser.add_argument('-zm', '--zpar_model_directory', default=None)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # Convert verbose flag to actually logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)

    # Read the models.
    logging.info('Loading models')
    syntax_parser = \
        SyntaxParserWrapper(port=args.zpar_port, hostname=args.zpar_hostname,
                            zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)

    parser = Parser(max_acts=args.max_acts,
                    max_states=args.max_states,
                    n_best=args.n_best)
    parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc = read_text_file(input_path)

        logging.debug('rst_parse input. doc_id = {}, text = {}'
                      .format(input_path, doc))
        doc_dict = {"raw_text": doc, "doc_id": input_path}

        edu_tokens, complete_trees = segment_and_parse(doc_dict, syntax_parser,
                                                       segmenter, parser)

        # Use pformat(), not pprint(): in NLTK 3, Tree.pprint() prints to a
        # stream and returns None, which would serialize as null in the JSON.
        print(json.dumps({"edu_tokens": edu_tokens,
                          "scored_rst_trees": [{"score": tree["score"],
                                                "tree": tree["tree"].pformat(
                                                    margin=TREE_PRINT_MARGIN)}
                                               for tree in complete_trees]}))
コード例 #6
0
def main():
    """Segment and RST-parse each input file, printing the scored trees as
    JSON and saving each parse's ParentedTree repr to a side file."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_paths', nargs='+',
                        help='A document to segment and parse.' +
                        ' Paragraphs should be separated by two or more' +
                        ' newline characters.')
    parser.add_argument('-g', '--segmentation_model', required=True,
                        help='Path to segmentation model.')
    parser.add_argument('-p', '--parsing_model', required=True,
                        help='Path to RST parsing model.')
    parser.add_argument('-a', '--max_acts', type=int, default=1,
                        help='Maximum number of actions for...?')
    parser.add_argument('-n', '--n_best', type=int, default=1,
                        help='Number of parses to return')
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for \
                              best-first search',
                        type=int, default=1)
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    parser.add_argument('-zm', '--zpar_model_directory', default=None)
    parser.add_argument('-v', '--verbose', default=0, action='count',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.')
    args = parser.parse_args()

    # Map the count of -v flags onto a logging level, capped at DEBUG.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=log_level)

    # Load the syntactic parser, the discourse segmenter, and the RST parser.
    logging.info('Loading models')
    syntax_parser = SyntaxParserWrapper(
        port=args.zpar_port, hostname=args.zpar_hostname,
        zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)

    parser = Parser(max_acts=args.max_acts,
                    max_states=args.max_states,
                    n_best=args.n_best)
    parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc_text = read_text_file(input_path)

        logging.debug('rst_parse input. doc_id = {}, text = {}'.format(
            input_path, doc_text))
        doc_dict = {"raw_text": doc_text, "doc_id": input_path}

        edu_tokens, complete_trees = segment_and_parse(doc_dict, syntax_parser,
                                                       segmenter, parser)

        # Materialize up front: the trees are iterated twice below.
        complete_trees = list(complete_trees)

        scored = [{"score": tree["score"],
                   "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
                  for tree in complete_trees]
        print(json.dumps({"edu_tokens": edu_tokens,
                          "scored_rst_trees": scored}))

        # Also write each parse's ParentedTree, one file per parse.
        for i, tree in enumerate(complete_trees, 1):
            ptree_path = '{}_{}.parentedtree'.format(input_path, i)
            with codecs.open(ptree_path, 'w', 'utf-8') as ptree_file:
                ptree_file.write(repr(tree['tree']) + '\n')
コード例 #7
0
def main():
    """Evaluate an RST parsing model on a dev or test set and print the
    evaluation results as JSON."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set',
                        help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model.  If not specified, ' +
                        'then gold EDUs will be used.',
                        default=None)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-z', '--zpar_directory', default='zpar')
    parser.add_argument('-t', '--use_gold_syntax',
                        help='If specified, then gold PTB syntax trees will ' +
                        'be used.', action='store_true')
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions to perform on each ' +
                        'state', type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for ' +
                        'best-first search', type=int, default=1)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()
    # Validate explicitly instead of with `assert`, which is silently
    # stripped when Python runs with -O.
    if not (args.use_gold_syntax or args.segmentation_model):
        parser.error('Either gold syntax (-t) or a segmentation model (-g) '
                     'must be specified.')

    # Convert verbose flag to actually logging level
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)
    logger = logging.getLogger(__name__)

    # read the models
    logger.info('Loading models')

    # TODO add port, host, model args
    # Gold syntax means no live syntactic parser is needed; likewise, no
    # segmentation model means gold EDUs are used and no segmenter is built.
    syntax_parser = SyntaxParserWrapper() if not args.use_gold_syntax else None
    segmenter = Segmenter(args.segmentation_model) \
        if args.segmentation_model else None

    rst_parser = Parser(max_acts=args.max_acts,
                        max_states=args.max_states,
                        n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    results = \
        predict_and_evaluate_rst_trees(syntax_parser, segmenter, rst_parser,
                                       eval_data,
                                       use_gold_syntax=args.use_gold_syntax)
    print(json.dumps(sorted(results.items())))
コード例 #8
0
def main():
    """Evaluate an RST parsing model on a dev or test set and print the
    evaluation results as JSON."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set',
                        help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-g',
                        '--segmentation_model',
                        help='Path to segmentation model.  If not specified, ' +
                        'then gold EDUs will be used.',
                        default=None)
    parser.add_argument('-p',
                        '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-z', '--zpar_directory', default='zpar')
    parser.add_argument('-t',
                        '--use_gold_syntax',
                        help='If specified, then gold PTB syntax trees will ' +
                        'be used.',
                        action='store_true')
    parser.add_argument('-a',
                        '--max_acts',
                        help='Maximum number of actions to perform on each ' +
                        'state',
                        type=int,
                        default=1)
    parser.add_argument('-s',
                        '--max_states',
                        help='Maximum number of states to retain for ' +
                        'best-first search',
                        type=int,
                        default=1)
    parser.add_argument('-v',
                        '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0,
                        action='count')
    args = parser.parse_args()
    # Validate explicitly instead of with `assert`, which is silently
    # stripped when Python runs with -O.
    if not (args.use_gold_syntax or args.segmentation_model):
        parser.error('Either gold syntax (-t) or a segmentation model (-g) '
                     'must be specified.')

    # Convert verbose flag to actually logging level
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=log_level)
    logger = logging.getLogger(__name__)

    # read the models
    logger.info('Loading models')

    # TODO add port, host, model args
    # Gold syntax means no live syntactic parser is needed; likewise, no
    # segmentation model means gold EDUs are used and no segmenter is built.
    syntax_parser = SyntaxParserWrapper() if not args.use_gold_syntax else None
    segmenter = Segmenter(args.segmentation_model) \
        if args.segmentation_model else None

    rst_parser = Parser(max_acts=args.max_acts,
                        max_states=args.max_states,
                        n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    results = \
        predict_and_evaluate_rst_trees(syntax_parser, segmenter, rst_parser,
                                       eval_data,
                                       use_gold_syntax=args.use_gold_syntax)
    print(json.dumps(sorted(results.items())))
def main():
    """Compute a bootstrap confidence interval for one RST evaluation metric
    on a dev or test set, using an already-trained parsing model."""
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set',
                        help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    parser.add_argument('--metric_name', help='name of metric to use',
                        choices=["labeled_precision",
                                 "labeled_recall",
                                 "labeled_f1",
                                 "nuc_precision",
                                 "nuc_recall",
                                 "nuc_f1",
                                 "span_precision",
                                 "span_recall",
                                 "span_f1"],
                        required=True)
    parser.add_argument('--n_samples', type=int, default=10000)
    parser.add_argument('--alpha', type=float, default=0.05)
    args = parser.parse_args()

    # Convert verbose flag to actually logging level
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)
    logger = logging.getLogger(__name__)

    # read the models
    logger.info('Loading models')

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    pred_edu_tokens_lists, pred_trees, gold_edu_tokens_lists, gold_trees = \
        predict_rst_trees_for_eval(None, None, rst_parser, eval_data)

    # Pack the parallel lists into rows so bootstrap resampling draws each
    # document's predictions and gold annotations together.
    data = np.array(list(zip(pred_edu_tokens_lists, pred_trees,
                             gold_edu_tokens_lists, gold_trees)))

    # score without bootstrapping
    orig_score = compute_rst_eval_results(pred_edu_tokens_lists,
                                          pred_trees,
                                          gold_edu_tokens_lists,
                                          gold_trees)[args.metric_name]
    # Sanity check: the score function used inside the bootstrap must
    # reproduce the directly computed score on the full data set.
    tmp_score = make_score_func(args.metric_name)(data)
    assert tmp_score == orig_score

    boot_ci_lower, boot_ci_upper = \
        boot.ci(data, make_score_func(args.metric_name),
                n_samples=args.n_samples, method='bca', alpha=args.alpha)

    # Print the file's path (.name); formatting the argparse.FileType object
    # itself would print its repr (e.g. "<_io.TextIOWrapper ...>").
    print("evaluation_set: {}".format(args.evaluation_set.name))
    print("alpha: {}".format(args.alpha))
    print("n_samples: {}".format(args.n_samples))
    print("metric: {}".format(args.metric_name))
    print("original score: {}".format(orig_score))
    print("CI: ({}, {})".format(boot_ci_lower, boot_ci_upper))