def test_extract_parse_actions():
    tree = ParentedTree.fromstring(
        '(ROOT (satellite:attribution (text 0)) '
        '(nucleus:span (satellite:condition (text 1)) '
        '(nucleus:span (nucleus:span (nucleus:same-unit (text 2)) '
        '(nucleus:same-unit (satellite:temporal (text 3)) '
        '(nucleus:span (text 4)))) '
        '(satellite:conclusion (text 5)))))')
    # I think the tree above would be for something
    # like this silly little example:
    # "John said that if Bob bought this excellent book,
    # then before the end of next week Bob would finish it,
    # and therefore he would be happy."

    actions = extract_parse_actions(tree)
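    # A shift-reduce derivation over the 6 EDUs (text 0-5) should contain
    # exactly 6 shift ('S') actions; reduce actions (e.g. the unary 'U'
    # checked below) carry "nuclearity:relation" labels.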

    num_shifts = len([x for x in actions if x.type == 'S'])
    assert num_shifts == 6
    assert actions[0].type == 'S'
    assert actions[1].type == 'U'
    assert actions[1].label == 'satellite:attribution'
    assert actions[2].type == 'S'


def test_reconstruct_training_examples():
    '''
    This code goes through the training data and makes sure
    that the actions extracted from the trees can be used to
    reconstruct those trees from a list of EDUs.
    '''

    train_path = 'rst_discourse_tb_edus_TRAINING_TRAIN.json'
    with open(train_path) as f:
        data = json.load(f)

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
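    # With max_acts=max_states=n_best=1 the parser is effectively greedy;
    # passing gold_actions below makes it replay the extracted sequence.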
    for doc_dict in data:
        tree_orig = ParentedTree.fromstring(doc_dict['rst_tree'])
        actions = extract_parse_actions(tree_orig)
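        # Re-parsing with gold_actions below replays this action sequence;
        # make_features=False since only the resulting tree is compared.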

        tree2 = next(rst_parser.parse(doc_dict,
                                      gold_actions=actions,
                                      make_features=False))['tree']

        assert tree2 == tree_orig
        logging.info('test_reconstruct_training_examples verified tree '
                     'for {}'.format(doc_dict['path_basename']))


def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('train_file',
                        help='Path to JSON training file.',
                        type=argparse.FileType('r'))
    parser.add_argument('eval_file',
                        help='Path to JSON dev or test file for ' +
                        'tuning/evaluation.',
                        type=argparse.FileType('r'))
    parser.add_argument('model_path',
                        help='Prefix for the path to where the model should be'
                        ' stored.  A suffix with the C value will be added.')
    parser.add_argument('-w',
                        '--working_path',
                        help='Path to where intermediate files should be ' +
                        'stored',
                        default='working')
    parser.add_argument('-C',
                        '--C_values',
                        help='comma-separated list of model complexity ' +
                        'parameter settings to evaluate.',
                        default=','.join([str(2.0**x) for x in range(-4, 5)]))
    parser.add_argument('-v',
                        '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0,
                        action='count')
    parser.add_argument('-s',
                        '--single_process',
                        action='store_true',
                        help='Run in a single process for all hyperparameter' +
                        ' grid points, to simplify debugging.')
    args = parser.parse_args()

    if os.path.exists(args.working_path):
        raise IOError("{} already exists.  Stopping here to avoid the "
                      "possibility of overwriting files that are currently "
                      "being used.".format(args.working_path))
    os.makedirs(args.working_path)

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)

    # Convert the verbosity flag count to an actual logging level
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
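    # verbosity 0 -> WARNING, 1 -> INFO, 2 or more -> DEBUG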
    # Make warnings from the built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=log_level)
    logger = logging.getLogger(__name__)

    logger.info('Extracting examples')
    train_data = json.load(args.train_file)
    eval_data = json.load(args.eval_file)

    train_examples = []

    for doc_dict in train_data:
        path_basename = doc_dict['path_basename']
        logger.info('Extracting examples for {}'.format(path_basename))
        tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        collapse_rst_labels(tree)
        actions = extract_parse_actions(tree)
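        # Replaying the gold actions below yields one (action, features) pair
        # per parser step; each becomes a classifier training example in the
        # SKLL jsonlines schema ("x" = feature dict, "y" = label, "id").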

        for i, (action_str, feats) in \
                enumerate(rst_parser.parse(doc_dict, gold_actions=actions)):
            example_id = "{}_{}".format(path_basename, i)
            example = {"x": Counter(feats), "y": action_str, "id": example_id}
            train_examples.append(example)
            # print("{} {}".format(action_str, " ".join(feats)))

    # Train and evaluate a model for each C value (in parallel unless
    # --single_process is given), keeping track of the best labeled F1.
    best_labeled_f1 = -1.0
    best_C = None

    C_values = [float(x) for x in args.C_values.split(',')]
    partial_train_and_eval_model = partial(train_and_eval_model,
                                           args.working_path, args.model_path,
                                           eval_data)
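    # (The partial above fixes everything except C, so the same callable can
    # be mapped over C_values below, serially or in a process pool.)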

    # Make the SKLL jsonlines feature file
    train_path = os.path.join(args.working_path, 'rst_parsing.jsonlines')
    with open(train_path, 'w') as train_file:
        for example in train_examples:
            train_file.write('{}\n'.format(json.dumps(example)))

    if args.single_process:
        all_results = [
            partial_train_and_eval_model(C_value) for C_value in C_values
        ]
    else:
        n_workers = len(C_values)
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            all_results = executor.map(partial_train_and_eval_model, C_values)

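    # Executor.map yields results in input order, so zipping with C_values
    # pairs each result with the C value that produced it.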
    for C_value, results in zip(C_values, all_results):
        results["C"] = C_value
        print(json.dumps(sorted(results.items())))
        if results["labeled_f1"] > best_labeled_f1:
            best_labeled_f1 = results["labeled_f1"]
            best_C = C_value

    print("best labeled F1 = {}, with C = {}".format(best_labeled_f1, best_C))