# Example #1 (score: 0)
def main():
    """Load a saved model and dump salience scores for the train/valid inputs."""

    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--train-inputs", type=str, required=True)
    arg_parser.add_argument("--valid-inputs", type=str, required=True)
    arg_parser.add_argument("--batch-size", default=16, type=int)
    arg_parser.add_argument("--gpu", default=-1, type=int)
    arg_parser.add_argument("--train-salience", type=str, default=None)
    arg_parser.add_argument("--valid-salience", type=str, default=None)
    arg_parser.add_argument("--model-path", type=str, default=None)
    arg_parser.add_argument("--seed", default=48929234, type=int)

    args = arg_parser.parse_args()

    ntp.set_random_seed(args.seed)

    print("Reading model from {} ...".format(args.model_path))
    model = torch.load(args.model_path)

    # Place the model on the requested device (-1 means CPU).
    if args.gpu > -1:
        model.cuda(args.gpu)
    else:
        model.cpu()

    # Make sure the output directories exist before writing anything.
    for salience_path in (args.train_salience, args.valid_salience):
        if salience_path is None:
            continue
        parent_dir = os.path.dirname(salience_path)
        if not os.path.exists(parent_dir) and parent_dir != "":
            os.makedirs(parent_dir)

    # Score each split in turn: read inputs, batch them, write salience.
    splits = (
        ("training", args.train_inputs, args.train_salience),
        ("validation", args.valid_inputs, args.valid_salience),
    )
    for split_name, inputs_path, salience_path in splits:
        print("Reading {} inputs data from {} ...".format(
            split_name, inputs_path))
        dataset = make_dataset(read_data(inputs_path), args.batch_size,
                               args.gpu)
        print("Writing {} salience data to {} ...".format(
            split_name, salience_path))
        write_salience(model, dataset, salience_path)
# Example #2 (score: 0)
def main(args=None):
    """Train a CNN sentence extractor and track validation ROUGE per epoch.

    Args:
        args: optional list of argument strings; when None, argparse falls
            back to sys.argv.

    Returns:
        A pandas DataFrame of per-epoch validation ROUGE results
        (one row block per epoch, concatenated along axis 0).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--train-labels", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--valid-labels", type=str, required=True)

    parser.add_argument("--train-summary-dir", type=str, required=True)
    parser.add_argument("--valid-summary-dir", type=str, required=True)

    parser.add_argument("--gpu", default=-1, type=int, required=False)
    parser.add_argument("--epochs", default=20, type=int, required=False)
    parser.add_argument("--seed", default=83432534, type=int, required=False)

    parser.add_argument("--lr", required=False, default=.0001, type=float)
    parser.add_argument("--batch-size", default=8, type=int, required=False)

    parser.add_argument("--embedding-size",
                        type=int,
                        required=False,
                        default=300)
    parser.add_argument("--rnn-hidden-size",
                        type=int,
                        required=False,
                        default=512)
    parser.add_argument("--rnn-layers", type=int, required=False, default=1)

    parser.add_argument("--hidden-layer-sizes",
                        nargs="+",
                        default=[100],
                        type=int,
                        required=False)
    parser.add_argument("--hidden-layer-activations",
                        nargs="+",
                        default="relu",
                        type=str,
                        required=False)
    parser.add_argument("--hidden-layer-dropout",
                        default=.0,
                        type=float,
                        required=False)
    parser.add_argument("--input-layer-norm",
                        default=False,
                        action="store_true")

    parser.add_argument("--save-model", required=False, type=str)

    args = parser.parse_args(args)

    ntp.set_random_seed(args.seed)

    # Readers for the DUC single-document-summarization inputs and labels.
    input_reader = spensum.dataio.init_duc_sds_input_reader(
        args.embedding_size)
    label_reader = spensum.dataio.init_duc_sds_label_reader()

    train_dataset = spensum.dataio.read_input_label_dataset(
        args.train_inputs,
        args.train_labels,
        input_reader,
        label_reader,
        batch_size=args.batch_size,
        gpu=args.gpu)

    valid_dataset = spensum.dataio.read_input_label_dataset(
        args.valid_inputs,
        args.valid_labels,
        input_reader,
        label_reader,
        batch_size=args.batch_size,
        gpu=args.gpu)

    model = spensum.model.CNNExtractor2(args.embedding_size)
    if args.gpu > -1:
        model.cuda(args.gpu)

    # Inverse-frequency class weights to offset the salient/non-salient
    # label imbalance in the training targets.
    non_salient_count = train_dataset.targets.eq(0).sum()
    salient_count = train_dataset.targets.eq(1).sum()
    weight = torch.FloatTensor([1 / non_salient_count, 1 / salient_count])

    opt = ntp.optimizer.Adam(model.parameters(), lr=args.lr)
    crit = ntp.criterion.BinaryCrossEntropy(mode="prob",
                                            weight=weight,
                                            mask_value=-1)
    crit.add_reporter(ntp.criterion.BinaryFMeasureReporter(mode="prob"))
    crit.set_selection_criterion("BinaryFMeasureReporter")

    valid_rouge_results = []
    best_rouge = 0

    for epoch in range(1, args.epochs + 1):

        # Both progress callbacks redraw a single status line on stdout and
        # clear it after the last step; they differ only in their label, so
        # build them from one factory (closes over the current epoch).
        def make_step_callback(label):
            def step_callback(step, max_steps, batch_loss, criterion):
                sys.stdout.write("\r")
                sys.stdout.write(" " * 79)
                sys.stdout.write("\r")
                sys.stdout.write("\t{} {}: {} / {} | obj: {:0.9f}".format(
                    label, epoch, step, max_steps, criterion.avg_loss))
                sys.stdout.flush()
                if step == max_steps:
                    sys.stdout.write("\r" + " " * 79 + "\r")
                    sys.stdout.flush()
            return step_callback

        ntp.trainer.train_epoch(crit,
                                model,
                                opt,
                                train_dataset,
                                step_callback=make_step_callback("train"))
        crit.checkpoint("training")

        ntp.trainer.eval(crit,
                         model,
                         valid_dataset,
                         step_callback=make_step_callback("valid"))
        crit.checkpoint("validation")

        valid_rouge = compute_rouge(model, valid_dataset,
                                    args.valid_summary_dir)
        valid_rouge_results.append(valid_rouge)

        # Keep the checkpoint that achieves the best validation ROUGE-2.
        rouge_score = valid_rouge["rouge-2"].values[0]
        if rouge_score > best_rouge:
            best_rouge = rouge_score
            if args.save_model is not None:
                print("Saving model!")
                torch.save(model, args.save_model)

    return pd.concat(valid_rouge_results, axis=0)
def main():
    """Train a pointer-network extractor on t-SNE features and rank targets."""

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--train-tsne", type=str, required=True)
    parser.add_argument("--train-ranks", type=str, required=True)
    parser.add_argument("--valid-tsne", type=str, required=True)
    parser.add_argument("--valid-ranks", type=str, required=True)
    parser.add_argument("--batch-size", default=16, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument("--lr", default=.001, type=float)
    parser.add_argument("--remove-stopwords",
                        action="store_true",
                        default=False)
    parser.add_argument("--results-path", type=str, default=None)
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--context-dropout", default=.5, type=float)
    parser.add_argument("--context-size", default=300, type=int)
    parser.add_argument("--validation-summary-dir", required=True, type=str)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--seed", default=48929234, type=int)

    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    # Per-sentence input feature dimensionality (presumably fixed by the
    # upstream t-SNE export — confirm against the data files).
    embedding_size = 4

    # Create output directories up front so later writes cannot fail on a
    # missing parent.
    if args.results_path is not None:
        results_dir = os.path.dirname(args.results_path)
        if not os.path.exists(results_dir) and results_dir != "":
            os.makedirs(results_dir)

    if args.model_path is not None:
        model_dir = os.path.dirname(args.model_path)
        if not os.path.exists(model_dir) and model_dir != "":
            os.makedirs(model_dir)

    print("Reading training tsne and salience data from {} ...".format(
        args.train_tsne))
    training_tsne_data = read_data(args.train_tsne)

    print("Reading training rank data from {} ...".format(args.train_ranks))
    training_ranks_data = read_data(args.train_ranks)
    # The tsne and rank files must be aligned example-for-example.
    for a, b in zip(training_tsne_data, training_ranks_data):
        assert a["id"] == b["id"]

    training_data = make_dataset(training_tsne_data, training_ranks_data,
                                 args.batch_size, args.gpu)

    print("Reading validation tsne and salience data from {} ...".format(
        args.valid_tsne))
    validation_tsne_data = read_data(args.valid_tsne)

    print("Reading validation rank data from {} ...".format(args.valid_ranks))
    validation_ranks_data = read_data(args.valid_ranks)
    for a, b in zip(validation_tsne_data, validation_ranks_data):
        assert a["id"] == b["id"]

    validation_data = make_dataset(validation_tsne_data, validation_ranks_data,
                                   args.batch_size, args.gpu)

    input_module = SequenceStandardizer(embedding_size)

    pn_model = PointerNetwork(input_module,
                              args.context_size,
                              attention_hidden_size=150,
                              layers=2,
                              context_dropout=args.context_dropout)

    # Initialize parameters with the in-place nn.init API; the bare
    # xavier_normal/constant/normal aliases are deprecated in PyTorch.
    for name, param in pn_model.named_parameters():
        if "weight" in name or name.startswith("W") or name == "v":
            nn.init.xavier_normal_(param)
        elif "bias" in name:
            nn.init.constant_(param, 0)
        else:
            nn.init.normal_(param)

    if args.gpu > -1:
        pn_model.cuda(args.gpu)

    optim = torch.optim.Adam(pn_model.parameters(), lr=args.lr)

    train_xents = []
    valid_results = []  # each entry unpacks as (valid xent, ROUGE-1, ROUGE-2)

    best_rouge_2 = 0
    best_epoch = None

    for epoch in range(1, args.epochs + 1):

        train_xent = train(optim, pn_model, training_data, epoch)
        train_xents.append(train_xent)

        valid_result = validate(pn_model,
                                validation_data,
                                epoch,
                                args.validation_summary_dir,
                                remove_stopwords=args.remove_stopwords)
        valid_results.append(valid_result)
        print(
            "Epoch {} :: Train xent: {:0.3f} | Valid xent: {:0.3f} | R1: {:0.3f} | R2: {:0.3f}"
            .format(epoch, train_xents[-1], *valid_results[-1]))
        # Checkpoint whenever validation ROUGE-2 improves.
        if valid_results[-1][-1] > best_rouge_2:
            best_rouge_2 = valid_results[-1][-1]
            best_epoch = epoch
            if args.model_path is not None:
                print("Saving model ...")
                torch.save(pn_model, args.model_path)

    # Guard the degenerate case where no epoch ever beat the initial
    # best_rouge_2 of 0 (previously this crashed indexing with None).
    if best_epoch is not None:
        print("Best epoch: {}  ROUGE-1 {:0.3f}  ROUGE-2 {:0.3f}".format(
            best_epoch, *valid_results[best_epoch - 1][1:]))

    if args.results_path is not None:
        results = {
            "training": {
                "cross-entropy": train_xents
            },
            "validation": {
                "cross-entropy": [x[0] for x in valid_results],
                "rouge-1": [x[1] for x in valid_results],
                "rouge-2": [x[2] for x in valid_results]
            }
        }
        print("Writing results to {} ...".format(args.results_path))
        with open(args.results_path, "w") as fp:
            fp.write(json.dumps(results))
# Example #4 (score: 0)
def main():
    """Evaluate a saved extractive model with ROUGE on train/valid/test splits."""

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--test-inputs", type=str, required=True)

    parser.add_argument("--batch-size", default=16, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument(
        "--remove-stopwords", action="store_true", default=False)
    parser.add_argument(
        "--summary-length", default=100, type=int)
    parser.add_argument("--results-path", type=str, default=None)
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--train-summary-dir", required=True, type=str)
    parser.add_argument("--valid-summary-dir", required=True, type=str)
    parser.add_argument("--test-summary-dir", required=True, type=str)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--seed", default=48929234, type=int)

    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    if args.results_path is not None:
        results_dir = os.path.dirname(args.results_path)
        if not os.path.exists(results_dir) and results_dir != "":
            os.makedirs(results_dir)

    print("Loading model from {} ...".format(args.model_path))
    model = torch.load(args.model_path)
    if args.gpu > -1:
        model.cuda(args.gpu)
    else:
        model.cpu()

    def _evaluate_split(split_name, report_label, inputs_path, summary_dir):
        # Read one data split, batch it, and return its (ROUGE-1, ROUGE-2).
        print("Reading {} input data from {} ...".format(
            split_name, inputs_path))
        input_data = read_data(inputs_path)
        dataset = make_dataset(input_data, args.batch_size, args.gpu)
        rouge_df = compute_rouge(
            model, dataset, summary_dir,
            remove_stopwords=args.remove_stopwords,
            summary_length=args.summary_length)
        r1, r2 = rouge_df.values[0].tolist()
        print("{} R1 {:0.3f}  R2 {:0.3f}".format(report_label, r1, r2))
        return r1, r2

    train_r1, train_r2 = _evaluate_split(
        "training", "TRAIN", args.train_inputs, args.train_summary_dir)
    valid_r1, valid_r2 = _evaluate_split(
        "validation", "VALID", args.valid_inputs, args.valid_summary_dir)
    # The trailing space in "TEST " keeps the report columns aligned.
    test_r1, test_r2 = _evaluate_split(
        "testing", "TEST ", args.test_inputs, args.test_summary_dir)

    results = {"training": {"rouge-1": train_r1, "rouge-2": train_r2},
               "validation": {"rouge-1": valid_r1, "rouge-2": valid_r2},
               "testing": {"rouge-1": test_r1, "rouge-2": test_r2}}

    # Previously this wrote unconditionally and raised a TypeError when
    # --results-path was omitted; only write when a path was given, to
    # match the directory-creation guard above.
    if args.results_path is not None:
        with open(args.results_path, "w") as fp:
            fp.write(json.dumps(results))
# Example #5 (score: 0)
def main():
    """Preprocess the NYT corpus into the spensum SDS dataset.

    Writes reference summaries, extractive labels/ranks, a fitted SIF
    embedding model, and SIF-based model inputs under
    ``<spensum_data_path>/nyt-sds``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--spensum-data-path", type=str, default=None)
    parser.add_argument("--nyt-train-inputs-path", type=str, default=None)
    parser.add_argument("--nyt-train-abstracts-path", type=str, default=None)
    parser.add_argument("--nyt-test-inputs-path", type=str, default=None)
    parser.add_argument("--nyt-test-abstracts-path", type=str, default=None)
    parser.add_argument("--seed", type=int, default=43929524)
    parser.add_argument("--nprocs", type=int, default=None)

    args = parser.parse_args()
    if args.nprocs is None:
        # Default to half the machine's cores, but at least one worker.
        args.nprocs = max(1, mp.cpu_count() // 2)

    ntp.set_random_seed(args.seed)

    if args.spensum_data_path is None:
        args.spensum_data_path = os.getenv("SPENSUM_DATA", None)
        if args.spensum_data_path is None:
            sys.stderr.write(
                "Set SPENSUM_DATA to set location to write data.\n")
            sys.exit(1)

    def _nyt_path(current, env_var, description):
        # Resolve an NYT data path from its CLI flag or environment
        # variable; exit with an explanatory message if neither is set.
        if current is not None:
            return current
        value = os.getenv(env_var, None)
        if value is None:
            sys.stderr.write(
                "Set " + env_var + " to location of " \
                "NYT preprocessed " + description + " directory (see " \
                "(https://github.com/gregdurrett/berkeley-doc-summarizer " \
                ").\n")
            sys.exit(1)
        return value

    args.nyt_train_inputs_path = _nyt_path(
        args.nyt_train_inputs_path, "NYT_TRAIN_INPUTS_ORIGINAL",
        "training inputs")
    args.nyt_train_abstracts_path = _nyt_path(
        args.nyt_train_abstracts_path, "NYT_TRAIN_ABS_ORIGINAL",
        "training abstracts")
    args.nyt_test_inputs_path = _nyt_path(
        args.nyt_test_inputs_path, "NYT_TEST_INPUTS_ORIGINAL",
        "testing inputs")
    args.nyt_test_abstracts_path = _nyt_path(
        args.nyt_test_abstracts_path, "NYT_TEST_ABS_ORIGINAL",
        "testing abstracts")

    nyt_sds_data_root = os.path.join(args.spensum_data_path, "nyt-sds")

    def _drop_short(examples):
        # Keep only documents whose abstract sentences total > 50 words.
        return [ex for ex in examples
                if sum(s["word_count"] for s in ex["inputs"]) > 50]

    train_ids, valid_ids = make_train_valid_list(args.nyt_train_inputs_path)
    print("Reading training abstracts...")
    train_abstracts = read_inputs(args.nyt_train_abstracts_path, train_ids)

    train_abstracts = _drop_short(train_abstracts)[:25000]
    train_ids = [ex["id"] for ex in train_abstracts]
    print(len(train_abstracts))

    print("Writing training reference abstracts...")
    summaries_train_path = os.path.join(nyt_sds_data_root, "summaries",
                                        "train", "human_abstracts")
    write_summaries(train_abstracts, summaries_train_path)

    print("Reading validation abstracts...")
    valid_abstracts = read_inputs(args.nyt_train_abstracts_path, valid_ids)

    valid_abstracts = _drop_short(valid_abstracts)[:2500]
    valid_ids = [ex["id"] for ex in valid_abstracts]
    print(len(valid_abstracts))

    print("Writing validation reference abstracts...")
    summaries_valid_path = os.path.join(nyt_sds_data_root, "summaries",
                                        "valid", "human_abstracts")
    write_summaries(valid_abstracts, summaries_valid_path)

    print("Reading test abstracts...")
    test_abstracts = read_inputs(args.nyt_test_abstracts_path)

    test_abstracts = _drop_short(test_abstracts)
    test_ids = [ex["id"] for ex in test_abstracts]

    print(len(test_abstracts))

    print("Writing test reference abstracts...")
    summaries_test_path = os.path.join(nyt_sds_data_root, "summaries", "test",
                                       "human_abstracts")
    write_summaries(test_abstracts, summaries_test_path)

    print("Reading training inputs...")
    train_inputs_data = read_inputs(args.nyt_train_inputs_path, train_ids)
    print("Reading validation inputs...")
    valid_inputs_data = read_inputs(args.nyt_train_inputs_path, valid_ids)
    print("Reading test inputs...")
    test_inputs_data = read_inputs(args.nyt_test_inputs_path, test_ids)

    print("Writing train labels...")
    train_labels_path = os.path.join(
        nyt_sds_data_root, "labels",
        "nyt.sds.labels.seq.rouge-1.sw.train.json")
    train_ranks_path = os.path.join(nyt_sds_data_root, "ranks",
                                    "nyt.sds.ranks.seq.rouge-1.sw.train.json")
    generate_extracts(train_inputs_data, train_abstracts, "sequential", 1,
                      train_labels_path, train_ranks_path, args.nprocs)

    print("Writing valid labels...")
    valid_labels_path = os.path.join(
        nyt_sds_data_root, "labels",
        "nyt.sds.labels.seq.rouge-1.sw.valid.json")
    valid_ranks_path = os.path.join(nyt_sds_data_root, "ranks",
                                    "nyt.sds.ranks.seq.rouge-1.sw.valid.json")
    generate_extracts(valid_inputs_data, valid_abstracts, "sequential", 1,
                      valid_labels_path, valid_ranks_path, args.nprocs)

    print("Writing test labels...")
    test_labels_path = os.path.join(nyt_sds_data_root, "labels",
                                    "nyt.sds.labels.seq.rouge-1.sw.test.json")
    test_ranks_path = os.path.join(nyt_sds_data_root, "ranks",
                                   "nyt.sds.ranks.seq.rouge-1.sw.test.json")
    generate_extracts(test_inputs_data, test_abstracts, "sequential", 1,
                      test_labels_path, test_ranks_path, args.nprocs)

    print("Collecting sentence tokens...")
    all_training_sents = [[token.lower() for token in sent["tokens"]]
                          for ex in train_inputs_data for sent in ex['inputs']]

    print(len(all_training_sents))
    print("Loading sif embedding model...")
    sif_emb = ntp.models.sentence_embedding.SIFEmbedding.from_pretrained()

    print("Fitting principal component...")
    sif_emb.fit_principle_component(all_training_sents)

    # BUG FIX: nyt_sds_data_root already contains spensum_data_path, so the
    # old join duplicated the prefix for relative paths (and was redundant
    # for absolute ones). Save the SIF model under the dataset root.
    sif_path = os.path.join(nyt_sds_data_root, "sif.bin")
    torch.save(sif_emb, sif_path)

    print("Writing training inputs...")
    inputs_train_path = os.path.join(nyt_sds_data_root, "inputs",
                                     "nyt.sds.inputs.train.json")
    generate_inputs(train_inputs_data, sif_emb, inputs_train_path)

    print("Writing validation inputs...")
    inputs_valid_path = os.path.join(nyt_sds_data_root, "inputs",
                                     "nyt.sds.inputs.valid.json")
    generate_inputs(valid_inputs_data, sif_emb, inputs_valid_path)

    print("Writing test inputs...")
    inputs_test_path = os.path.join(nyt_sds_data_root, "inputs",
                                    "nyt.sds.inputs.test.json")
    generate_inputs(test_inputs_data, sif_emb, inputs_test_path)
# Example #6 (score: 0)
def main(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--train-labels", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--valid-labels", type=str, required=True)
    parser.add_argument("--valid-summary-dir", type=str, required=True)

    parser.add_argument("--gpu", default=-1, type=int, required=False)
    parser.add_argument("--epochs", default=100, type=int, required=False)
    parser.add_argument("--seed", default=83432534, type=int, required=False)

    parser.add_argument("--lr", required=False, default=.01, type=float)
    parser.add_argument("--batch-size", default=16, type=int, required=False)
    parser.add_argument("--embedding-size",
                        type=int,
                        required=False,
                        default=300)

    parser.add_argument("--rnn-salience", action="store_true", default=False)
    parser.add_argument("--rs-xent", type=float, required=False, default=0)
    parser.add_argument("--rs-burn-in", type=int, default=0, required=False)
    parser.add_argument("--rs-hidden-size",
                        default=150,
                        type=int,
                        required=False)

    parser.add_argument("--position", action="store_true", default=False)
    parser.add_argument("--p-xent", type=float, default=0, required=False)
    parser.add_argument("--p-num-positions",
                        type=int,
                        default=50,
                        required=False)
    parser.add_argument("--p-burn-in", type=int, default=0, required=False)

    parser.add_argument("--word-count", action="store_true", default=False)
    parser.add_argument("--wc-xent", type=float, default=0, required=False)
    parser.add_argument("--wc-burn-in", type=int, default=0, required=False)

    parser.add_argument("--neighbor-clique",
                        action="store_true",
                        default=False)

    parser.add_argument("--report-every",
                        type=int,
                        required=False,
                        default=1000)
    parser.add_argument("--validate-every",
                        type=int,
                        required=False,
                        default=1)
    parser.add_argument("--burn-in-report-every",
                        type=int,
                        required=False,
                        default=500)

    parser.add_argument("--pc-coverage", action="store_true", default=False)
    parser.add_argument("--pcc-xent", type=float, required=False, default=0)
    parser.add_argument("--pcc-burn-in", type=int, default=0, required=False)

    parser.add_argument("--hidden-layer-sizes",
                        nargs="+",
                        default=[100],
                        type=int,
                        required=False)
    parser.add_argument("--hidden-layer-activations",
                        nargs="+",
                        default="relu",
                        type=str,
                        required=False)
    parser.add_argument("--hidden-layer-dropout",
                        default=.0,
                        type=float,
                        required=False)
    parser.add_argument("--input-layer-norm",
                        default=False,
                        action="store_true")

    #    parser.add_argument(
    #        "--save-module", required=True, type=str)
    #    parser.add_argument(
    #        "--save-predictor", default=None, required=False, type=str)

    args = parser.parse_args(args)

    colorama.init()
    print("")
    print(
        "   ++=============================================================++")
    print("   || Summary Energy Network Sentence Extractor " + \
        colorama.Fore.GREEN + colorama.Style.BRIGHT + "(SENSEi)" + \
        colorama.Fore.RESET + colorama.Style.NORMAL + " trainer. ||" )
    print(
        "   ++=============================================================++")
    print("")


    print("Setting random seed: " + colorama.Style.BRIGHT + \
            colorama.Fore.WHITE + str(args.seed) + \
            colorama.Style.NORMAL + colorama.Fore.RESET + "\n")
    ntp.set_random_seed(args.seed)

    input_reader = spensum.dataio.init_duc_sds_input_reader(
        args.embedding_size)
    label_reader = spensum.dataio.init_duc_sds_label_reader()

    train_dataset = spensum.dataio.read_input_label_dataset(
        args.train_inputs,
        args.train_labels,
        input_reader,
        label_reader,
        batch_size=args.batch_size,
        gpu=args.gpu)

    valid_dataset = spensum.dataio.read_input_label_dataset(
        args.valid_inputs,
        args.valid_labels,
        input_reader,
        label_reader,
        batch_size=args.batch_size,
        gpu=args.gpu)

    non_salient_count = train_dataset.targets.eq(0).sum()
    salient_count = train_dataset.targets.eq(1).sum()
    weight = torch.FloatTensor([1 / non_salient_count, 1 / salient_count])

    crit = spensum.criterion.SPENLoss(weight=weight)
    crit.add_reporter(ntp.criterion.BinaryFMeasureReporter(mode="prob"))
    crit.set_selection_criterion("BinaryFMeasureReporter")

    print(colorama.Style.BRIGHT + "Beginning preflight check...\n" \
            + colorama.Style.NORMAL)

    print("Initializing submodules...")

    submodules = []

    if args.rnn_salience:
        module = spensum.module.RNNSalience(args.embedding_size,
                                            hidden_size=args.rs_hidden_size,
                                            burn_in=args.rs_burn_in)
        submodules.append(module)
        msg = "    {:>15} ... {:>8}".format(
            "rnn salience", colorama.Fore.GREEN + 'OK' + colorama.Fore.RESET)
        if module.burn_in > 0:
            msg += "   burnin={} iters".format(module.burn_in)
        if args.rs_xent > 0:
            aux_crit = ntp.criterion.BinaryCrossEntropy(name="RNNSalienceXEnt",
                                                        mode="logit",
                                                        weight=weight,
                                                        mask_value=-1)
            aux_crit.add_reporter(
                ntp.criterion.BinaryFMeasureReporter(mode="logit"))
            crit.add_aux_criterion(module, aux_crit, args.rs_xent)
            msg += colorama.Fore.GREEN + "   xent obj" + colorama.Fore.RESET
        print(msg)
    else:
        print("    {:>15} ... {:>8}".format(
            "rnn salience",
            colorama.Fore.YELLOW + 'SKIP' + colorama.Fore.RESET))

    if args.position:
        module = spensum.module.Position(args.p_num_positions,
                                         burn_in=args.p_burn_in)
        submodules.append(module)
        msg = "    {:>15} ... {:>8}".format(
            "position", colorama.Fore.GREEN + 'OK' + colorama.Fore.RESET)
        if module.burn_in > 0:
            msg += "   burnin={} iters".format(module.burn_in)
        if args.p_xent > 0:
            aux_crit = ntp.criterion.BinaryCrossEntropy(name="PositionXEnt",
                                                        mode="logit",
                                                        weight=weight,
                                                        mask_value=-1)
            aux_crit.add_reporter(
                ntp.criterion.BinaryFMeasureReporter(mode="logit"))
            crit.add_aux_criterion(module, aux_crit, weight=args.p_xent)
            msg += colorama.Fore.GREEN + "   xent obj" + colorama.Fore.RESET
        print(msg)
    else:
        print("    {:>15} ... {:>8}".format(
            "position", colorama.Fore.YELLOW + 'SKIP' + colorama.Fore.RESET))

    if args.word_count:
        module = spensum.module.WordCount(burn_in=args.wc_burn_in)
        submodules.append(module)
        msg = "    {:>15} ... {:>8}".format(
            "word_count", colorama.Fore.GREEN + 'OK' + colorama.Fore.RESET)
        if module.burn_in > 0:
            msg += "   burnin={} iters".format(module.burn_in)
        if args.wc_xent > 0:
            aux_crit = ntp.criterion.BinaryCrossEntropy(name="WordCountXEnt",
                                                        mode="logit",
                                                        weight=weight,
                                                        mask_value=-1)
            aux_crit.add_reporter(
                ntp.criterion.BinaryFMeasureReporter(mode="logit"))
            crit.add_aux_criterion(module, aux_crit, weight=args.wc_xent)
            msg += colorama.Fore.GREEN + "   xent obj" + colorama.Fore.RESET
        print(msg)
    else:
        print("    {:>15} ... {:>8}".format(
            "word_count", colorama.Fore.YELLOW + 'SKIP' + colorama.Fore.RESET))

    if args.pc_coverage:
        module = spensum.module.PCCoverage(args.embedding_size,
                                           burn_in=args.pcc_burn_in)
        submodules.append(module)
        msg = "    {:>15} ... {:>8}".format(
            "pc_coverage", colorama.Fore.GREEN + 'READY' + colorama.Fore.RESET)
        if module.burn_in > 0:
            msg += "   burnin={} iters".format(module.burn_in)
        if args.pcc_xent > 0:
            aux_crit = ntp.criterion.BinaryCrossEntropy(name="PCCoverageXEnt",
                                                        mode="logit",
                                                        weight=weight,
                                                        mask_value=-1)
            aux_crit.add_reporter(
                ntp.criterion.BinaryFMeasureReporter(mode="logit"))
            crit.add_aux_criterion(module, aux_crit, weight=args.pcc_xent)
            msg += colorama.Fore.GREEN + "   xent obj" + colorama.Fore.RESET
        print(msg)
    else:
        print("    {:>15} ... {:>8}".format(
            "pc_coverage",
            colorama.Fore.YELLOW + 'SKIP' + colorama.Fore.RESET))

    if args.neighbor_clique:
        module = spensum.module.NeighborClique()
        submodules.append(module)
        msg = "    {:>15} ... {:>8}".format(
            "neighbor_clique",
            colorama.Fore.GREEN + 'READY' + colorama.Fore.RESET)
        if module.burn_in > 0:
            msg += "   burnin={} iters".format(module.burn_in)


#        if args.nc_xent > 0:
#            aux_crit = ntp.criterion.BinaryCrossEntropy(
#                name="NCliqueXEnt",
#                mode="logit", weight=weight, mask_value=-1)
#            aux_crit.add_reporter(
#                ntp.criterion.BinaryFMeasureReporter(mode="logit"))
#            crit.add_aux_criterion(module, aux_crit, weight=args.pcc_xent)
#            msg += colorama.Fore.GREEN + "   xent obj" + colorama.Fore.RESET
        print(msg)
    else:
        print("    {:>15} ... {:>8}".format(
            "neighbor_clique",
            colorama.Fore.YELLOW + 'SKIP' + colorama.Fore.RESET))

    print("\nInitializing energy model...")
    model = spensum.model.EnergyModel(submodules)
    if args.gpu > -1:
        print("Placing model on gpu device: " + \
            colorama.Style.BRIGHT + colorama.Fore.WHITE + str(args.gpu) \
            + colorama.Style.NORMAL + colorama.Fore.RESET)
        model.cuda(args.gpu)

    if not model.ready:
        print("Running burn in for {} iters...".format(model.burn_in_iters))
        burn_in(model,
                train_dataset,
                weight=weight,
                report_every=args.burn_in_report_every)

    opt = ntp.optimizer.Adam(model.parameters(), lr=args.lr)

    max_iters = 1000000
    fit_model(model,
              crit,
              opt,
              train_dataset,
              max_iters,
              validation_dataset=valid_dataset,
              report_every=args.report_every,
              validate_every=args.validate_every,
              validation_summary_dir=args.valid_summary_dir)
Beispiel #7
0
def main():
    """Train a RougePredictor salience model from the command line.

    Reads paired inputs/salience JSON data for the training and validation
    splits (aligned by example ``id``), trains with Adam for ``--epochs``
    epochs, checkpoints the model with the lowest validation error to
    ``--model-path`` (if given), and optionally writes per-epoch losses as
    JSON to ``--results-path``.
    """

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--train-salience", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--valid-salience", type=str, required=True)
    parser.add_argument("--batch-size", default=300, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument("--lr", default=.00005, type=float)
    parser.add_argument(
        "--remove-stopwords", action="store_true", default=False)
    parser.add_argument("--results-path", type=str, default=None)
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--dropout", default=.5, type=float)
    parser.add_argument("--context-size", default=300, type=int)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--seed", default=48929234, type=int)

    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    # Create output directories up front so saving cannot fail at the end
    # of a long training run.
    if args.results_path is not None:
        results_dir = os.path.dirname(args.results_path)
        if not os.path.exists(results_dir) and results_dir != "":
            os.makedirs(results_dir)

    if args.model_path is not None:
        model_dir = os.path.dirname(args.model_path)
        if not os.path.exists(model_dir) and model_dir != "":
            os.makedirs(model_dir)

    print("Reading training salience data from {} ...".format(
        args.train_salience))
    training_salience_data = read_data(args.train_salience)

    print("Reading training inputs data from {} ...".format(args.train_inputs))
    training_inputs_data = read_data(args.train_inputs)
    # Inputs and salience targets must be aligned example-for-example.
    for a, b in zip(training_inputs_data, training_salience_data):
        assert a["id"] == b["id"]

    training_data = make_dataset(
        training_inputs_data, training_salience_data, args.batch_size,
        args.gpu)

    print("Reading validation salience data from {} ...".format(
        args.valid_salience))
    validation_salience_data = read_data(args.valid_salience)

    print("Reading validation inputs data from {} ...".format(args.valid_inputs))
    validation_inputs_data = read_data(args.valid_inputs)
    for a, b in zip(validation_salience_data, validation_inputs_data):
        assert a["id"] == b["id"]

    validation_data = make_dataset(
        validation_inputs_data, validation_salience_data, args.batch_size,
        args.gpu)

    model = RougePredictor(dropout=args.dropout)

    # Initialize parameters: normal for embeddings (and anything unmatched),
    # Xavier for weight matrices, zeros for biases.
    # FIX: use the in-place (trailing-underscore) init functions; the
    # un-suffixed variants are deprecated and removed in modern PyTorch.
    for name, param in model.named_parameters():
        if "emb" in name:
            nn.init.normal_(param)
        elif "weight" in name:
            nn.init.xavier_normal_(param)
        elif "bias" in name:
            nn.init.constant_(param, 0)
        else:
            nn.init.normal_(param)

    if args.gpu > -1:
        model.cuda(args.gpu)

    optim = torch.optim.Adam(model.parameters(), lr=args.lr)

    train_loss = []
    valid_loss = []

    best_loss = float("inf")
    best_epoch = None

    for epoch in range(1, args.epochs + 1):

        train_loss.append(train(optim, model, training_data, epoch))

        valid_loss.append(validate(model, validation_data, epoch))
        print("Epoch {} :: Train err: {:0.5f} | Valid err: {:0.5f} | ".format(
            epoch, train_loss[-1], valid_loss[-1]))
        # Checkpoint whenever validation error improves.
        if valid_loss[-1] < best_loss:
            best_loss = valid_loss[-1]
            best_epoch = epoch
            if args.model_path is not None:
                print("Saving model ...")
                torch.save(model, args.model_path)

    # best_epoch is only None if every validation loss was inf/nan; guard
    # so the summary line cannot raise TypeError on best_epoch - 1.
    if best_epoch is not None:
        print("Best epoch: {} Error={:0.5f}".format(
            best_epoch, valid_loss[best_epoch - 1]))

    if args.results_path is not None:
        results = {"training": train_loss,
                   "validation": valid_loss}
        print("Writing results to {} ...".format(args.results_path))
        with open(args.results_path, "w") as fp:
            fp.write(json.dumps(results))
Beispiel #8
0
def main():
    """Preprocess DUC 2001 single-document summarization (SDS) data.

    Fits a SIF sentence-embedding model's principal component on the
    training split, saves it, writes embedded inputs for the train/valid
    splits, and generates ROUGE-based extractive ranks/labels per split.
    Data locations come from the CLI flags or, failing that, the
    SPENSUM_DATA / DUC2001_ORIGINAL environment variables.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--spensum-data-path", type=str, default=None)
    parser.add_argument("--duc-2001-data-path", type=str, default=None)
    parser.add_argument("--seed", type=int, default=43929524)

    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    # Fall back to environment variables when paths are not supplied.
    if args.spensum_data_path is None:
        args.spensum_data_path = os.getenv("SPENSUM_DATA", None)
        if args.spensum_data_path is None:
            sys.stderr.write(
                "Set SPENSUM_DATA to set location to write data.\n")
            sys.exit(1)

    if args.duc_2001_data_path is None:
        args.duc_2001_data_path = os.getenv("DUC2001_ORIGINAL", None)
        if args.duc_2001_data_path is None:
            sys.stderr.write(
                "Set DUC2001_ORIGINAL to location of nist duc 2001 data.\n")
            sys.exit(1)

    pp_duc_2001_sds_path = os.path.join(args.spensum_data_path, "duc-sds",
                                        "preprocessed-data", "duc2001")

    #    print("Preprocessing raw duc 2001 sds data ...")
    #    duc2001.preprocess_sds(
    #        pp_duc_2001_sds_path, nist_data_path=args.duc_2001_data_path,
    #        cnlp_port=9000)

    print("Loading sif embedding model ...")
    sif_emb = ntp.models.sentence_embedding.SIFEmbedding.from_pretrained()

    duc_sds_data_root = os.path.join(args.spensum_data_path, "duc-sds")

    train_data, valid_data = generate_train_valid_splits(
        pp_duc_2001_sds_path, duc_sds_data_root)

    # Lowercased token lists for every sentence of every training example.
    all_training_sents = [[token.lower() for token in sent["tokens"]]
                          for ex in train_data for sent in ex[0]]

    sif_emb.fit_principle_component(all_training_sents)

    # FIX: duc_sds_data_root already includes spensum_data_path, so the
    # previous join(args.spensum_data_path, duc_sds_data_root, ...) either
    # duplicated the prefix (relative paths) or silently dropped the first
    # component (absolute paths).
    sif_path = os.path.join(duc_sds_data_root, "sif.bin")
    torch.save(sif_emb, sif_path)

    print("Writing training inputs...")
    inputs_train_path = os.path.join(duc_sds_data_root, "inputs",
                                     "duc.sds.inputs.train.json")
    generate_inputs(train_data, sif_emb, inputs_train_path)

    print("Writing validation inputs...")
    inputs_valid_path = os.path.join(duc_sds_data_root, "inputs",
                                     "duc.sds.inputs.valid.json")
    generate_inputs(valid_data, sif_emb, inputs_valid_path)

    #    print("Writing training summaries...")
    #    summaries_train_path = os.path.join(
    #        duc_sds_data_root, "summaries", "train", "human_abstracts")
    #    write_summaries(train_data, summaries_train_path)

    #    print("Writing validation summaries...")
    #    summaries_valid_path = os.path.join(
    #        duc_sds_data_root, "summaries", "valid", "human_abstracts")
    #    write_summaries(valid_data, summaries_valid_path)

    # Only sequential rouge-1 extracts are generated at present; the
    # commented alternatives ("independent", rouge 2-4) were disabled.
    #for mode in ["independent", "sequential"]:
    for mode in ["sequential"]:
        for part, data in [["valid", valid_data], ["train", train_data]]:
            for rouge in [1]:  #2, 3, 4]:

                labels_path = os.path.join(
                    duc_sds_data_root, "labels",
                    "duc.sds.labels.{}.rouge-{}.sw.{}.json".format(
                        "indie" if mode == "independent" else "seq", rouge,
                        part))
                ranks_path = os.path.join(
                    duc_sds_data_root, "ranks",
                    "duc.sds.ranks.{}.rouge-{}.sw.{}.json".format(
                        "indie" if mode == "independent" else "seq", rouge,
                        part))

                print("Generating {} rouge-{} ranks/labels " \
                      "for {} data".format(mode, rouge, part))

                generate_extracts(data, mode, rouge, labels_path, ranks_path)
Beispiel #9
0
def main():
    """Train a SummaRunner extractive summarizer from the command line.

    Reads paired inputs/labels JSON data for the training and validation
    splits (aligned by example ``id``), trains with Adam for ``--epochs``
    epochs, evaluates ROUGE on generated validation summaries each epoch,
    checkpoints the best ROUGE-2 model to ``--model-path`` (if given), and
    optionally writes per-epoch metrics as JSON to ``--results-path``.
    """

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--train-labels", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--valid-labels", type=str, required=True)
    parser.add_argument("--batch-size", default=16, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument(
        "--remove-stopwords", action="store_true", default=False)
    parser.add_argument(
        "--summary-length", default=100, type=int)
    parser.add_argument("--results-path", type=str, default=None)
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--context-dropout", default=.5, type=float)
    parser.add_argument("--context-size", default=200, type=int)
    parser.add_argument("--validation-summary-dir", required=True, type=str)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--seed", default=48929234, type=int)

    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    # Create output directories up front so saving cannot fail at the end
    # of a long training run.
    if args.results_path is not None:
        results_dir = os.path.dirname(args.results_path)
        if not os.path.exists(results_dir) and results_dir != "":
            os.makedirs(results_dir)

    if args.model_path is not None:
        model_dir = os.path.dirname(args.model_path)
        if not os.path.exists(model_dir) and model_dir != "":
            os.makedirs(model_dir)

    print("Reading training input data from {} ...".format(
        args.train_inputs))
    training_input_data = read_data(args.train_inputs)

    print("Reading training label data from {} ...".format(args.train_labels))
    training_label_data = read_data(args.train_labels)
    # Inputs and labels must be aligned example-for-example.
    for a, b in zip(training_input_data, training_label_data):
        assert a["id"] == b["id"]

    training_data = make_dataset(
        training_input_data, training_label_data, args.batch_size, args.gpu)

    print("Reading validation input data from {} ...".format(
        args.valid_inputs))
    validation_input_data = read_data(args.valid_inputs)

    print("Reading validation label data from {} ...".format(
        args.valid_labels))
    validation_label_data = read_data(args.valid_labels)
    for a, b in zip(validation_input_data, validation_label_data):
        assert a["id"] == b["id"]

    validation_data = make_dataset(
        validation_input_data, validation_label_data, args.batch_size,
        args.gpu)

    model = SummaRunner(hidden_size=args.context_size,
        dropout=args.context_dropout)

    # Initialize non-embedding parameters: Xavier for weight matrices,
    # zeros for biases.
    # FIX: use the in-place (trailing-underscore) init functions; the
    # un-suffixed variants are deprecated and removed in modern PyTorch.
    for name, param in model.named_parameters():
        if "emb" not in name and "weight" in name:
            nn.init.xavier_normal_(param)
        elif "emb" not in name and "bias" in name:
            nn.init.constant_(param, 0)

    if args.gpu > -1:
        model.cuda(args.gpu)

    optim = torch.optim.Adam(model.parameters(), lr=.001)

    train_xents = []
    valid_results = []

    best_rouge_2 = 0
    best_epoch = None

    for epoch in range(1, args.epochs + 1):

        train_xent = train(optim, model, training_data, epoch)
        train_xents.append(train_xent)

        # validate returns (xent, rouge-1, rouge-2) for the epoch.
        valid_result = validate(
            model, validation_data, epoch, args.validation_summary_dir,
            remove_stopwords=args.remove_stopwords,
            summary_length=args.summary_length)
        valid_results.append(valid_result)
        print(("Epoch {} :: Train xent: {:0.3f} | Valid xent: {:0.3f} | " \
               "R1: {:0.3f} | R2: {:0.3f}").format(
                  epoch, train_xents[-1], *valid_results[-1]))

        # Checkpoint whenever validation ROUGE-2 improves.
        if valid_results[-1][-1] > best_rouge_2:
            best_rouge_2 = valid_results[-1][-1]
            best_epoch = epoch
            if args.model_path is not None:
                print("Saving model ...")
                torch.save(model, args.model_path)

    # FIX: if no epoch ever beat a ROUGE-2 of 0, best_epoch is still None
    # and indexing with best_epoch - 1 would raise TypeError.
    if best_epoch is not None:
        print("Best epoch: {}  ROUGE-1 {:0.3f}  ROUGE-2 {:0.3f}".format(
            best_epoch, *valid_results[best_epoch - 1][1:]))
    else:
        print("No epoch improved on a ROUGE-2 of 0; no model saved.")

    if args.results_path is not None:
        results = {"training": {"cross-entropy": train_xents},
                   "validation": {
                       "cross-entropy": [x[0] for x in valid_results],
                       "rouge-1": [x[1] for x in valid_results],
                       "rouge-2": [x[2] for x in valid_results]}}
        print("Writing results to {} ...".format(args.results_path))
        with open(args.results_path, "w") as fp:
            fp.write(json.dumps(results))