Example #1
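The two functions below are shown without their module header. A minimal sketch of the imports they would need, assuming the project-internal helpers (Parser, collapse_rst_labels, extract_edus_tokens, extract_parse_actions, segment_and_parse, train_and_eval_model) are importable from the surrounding package; their exact module paths are not shown and depend on the project layout:

import json
import logging
import os
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from functools import partial

from nltk.tree import ParentedTree

# Project-internal helpers used below; import them from wherever the
# surrounding package defines them (paths omitted, as they depend on the
# project layout): Parser, collapse_rst_labels, extract_edus_tokens,
# extract_parse_actions, segment_and_parse, train_and_eval_model.
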
def predict_rst_trees_for_eval(syntax_parser,
                               segmenter,
                               rst_parser,
                               eval_data,
                               use_gold_syntax=True):
    pred_edu_tokens_lists = []
    pred_trees = []
    gold_edu_tokens_lists = []
    gold_trees = []

    for doc_dict in eval_data:
        logging.info('processing {}...'.format(doc_dict['path_basename']))
        gold_edu_tokens_lists.append(
            extract_edus_tokens(doc_dict['edu_start_indices'],
                                doc_dict['tokens']))

        # Collapse the RST labels to use the coarse relations that the parser
        # produces.
        gold_tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        collapse_rst_labels(gold_tree)
        gold_trees.append(gold_tree)

        # TODO when not using gold syntax, should the script still use gold
        # standard tokens?

        # remove gold standard trees or EDU boundaries if evaluating
        # using automatic preprocessing
        if not use_gold_syntax:
            # TODO will merging the EDU strings here to make the raw_text
            # variable produce the appropriate eval result when not using gold
            # standard trees?
            doc_dict['raw_text'] = ' '.join(doc_dict['edu_strings'])
            del doc_dict['syntax_trees']
            del doc_dict['token_tree_positions']
            del doc_dict['tokens']
            del doc_dict['pos_tags']
        if segmenter is not None:
            del doc_dict['edu_start_indices']

        # predict the RST tree
        tokens, trees = segment_and_parse(doc_dict, syntax_parser, segmenter,
                                          rst_parser)
        pred_trees.append(next(trees)['tree'])
        pred_edu_tokens_lists.append(tokens)
    return (pred_edu_tokens_lists, pred_trees, gold_edu_tokens_lists,
            gold_trees)
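
A hedged usage sketch of the function above: syntax_parser, segmenter, and rst_parser are assumed to be already-initialized model objects loaded elsewhere, and 'eval.json' is a placeholder path in the same JSON format that main() reads for its eval_file argument.

with open('eval.json') as eval_file:
    eval_data = json.load(eval_file)

(pred_edu_tokens_lists, pred_trees,
 gold_edu_tokens_lists, gold_trees) = predict_rst_trees_for_eval(syntax_parser,
                                                                 segmenter,
                                                                 rst_parser,
                                                                 eval_data,
                                                                 use_gold_syntax=True)

# The four lists are aligned index for index, so a downstream scorer can
# zip() predicted and gold trees document by document.
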
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('train_file',
                        help='Path to JSON training file.',
                        type=argparse.FileType('r'))
    parser.add_argument('eval_file',
                        help='Path to JSON dev or test file for ' +
                        'tuning/evaluation.',
                        type=argparse.FileType('r'))
    parser.add_argument('model_path',
                        help='Prefix for the path to where the model should be'
                        ' stored.  A suffix with the C value will be added.')
    parser.add_argument('-w',
                        '--working_path',
                        help='Path to where intermediate files should be ' +
                        'stored',
                        default='working')
    parser.add_argument('-C',
                        '--C_values',
                        help='comma-separated list of model complexity ' +
                        'parameter settings to evaluate.',
                        default=','.join([str(2.0**x) for x in range(-4, 5)]))
    parser.add_argument('-v',
                        '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0,
                        action='count')
    parser.add_argument('-s',
                        '--single_process',
                        action='store_true',
                        help='Run in a single process for all hyperparameter' +
                        ' grid points, to simplify debugging.')
    args = parser.parse_args()

    if os.path.exists(args.working_path):
        raise IOError("{} already exists.  Stopping here to avoid the "
                      "possibility of overwriting files that are currently "
                      "being used.".format(args.working_path))
    os.makedirs(args.working_path)

    # Use a name that does not shadow the argparse parser defined above.
    rst_parser = Parser(1, 1, 1)

    # Convert the verbose flag count to an actual logging level
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Route warnings from the built-in warnings module through logging
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=log_level)
    logger = logging.getLogger(__name__)

    logger.info('Extracting examples')
    train_data = json.load(args.train_file)
    eval_data = json.load(args.eval_file)

    train_examples = []

    for doc_dict in train_data:
        path_basename = doc_dict['path_basename']
        logger.info('Extracting examples for {}'.format(path_basename))
        tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        collapse_rst_labels(tree)
        actions = extract_parse_actions(tree)

        for i, (action_str, feats) in \
                enumerate(rst_parser.parse(doc_dict, gold_actions=actions)):
            example_id = "{}_{}".format(path_basename, i)
            example = {"x": Counter(feats), "y": action_str, "id": example_id}
            train_examples.append(example)
            # print("{} {}".format(action_str, " ".join(feats)))

    # train and evaluate a model for each value of C
    best_labeled_f1 = -1.0
    best_C = None

    # train and evaluate models with different C values in parallel
    C_values = [float(x) for x in args.C_values.split(',')]
    partial_train_and_eval_model = partial(train_and_eval_model,
                                           args.working_path, args.model_path,
                                           eval_data)

    # Make the SKLL jsonlines feature file
    train_path = os.path.join(args.working_path, 'rst_parsing.jsonlines')
    with open(train_path, 'w') as train_file:
        for example in train_examples:
            train_file.write('{}\n'.format(json.dumps(example)))

    if args.single_process:
        all_results = [
            partial_train_and_eval_model(C_value) for C_value in C_values
        ]
    else:
        n_workers = len(C_values)
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            all_results = executor.map(partial_train_and_eval_model, C_values)

    for C_value, results in zip(C_values, all_results):
        results["C"] = C_value
        print(json.dumps(sorted(results.items())))
        if results["labeled_f1"] > best_labeled_f1:
            best_labeled_f1 = results["labeled_f1"]
            best_C = C_value

    print("best labeled F1 = {}, with C = {}".format(best_labeled_f1, best_C))