Example #1
def compute_rouge(model, dataset, reference_dir):

    model.eval()

    ids2refs = collect_reference_paths(reference_dir)

    with rouge_papier.util.TempFileManager() as manager:

        path_data = []
        for batch in dataset.iter_batch():
            batch_size = batch.inputs.sequence.size(0)
            predictions = model.greedy_predict(batch.inputs)

            for b in range(batch_size):
                id = batch.metadata.id[b]
                # Drop the -1 entries, which pad variable-length predictions.
                preds = [
                    p for p in predictions.data[b].cpu().tolist() if p > -1
                ]
                summary = "\n".join([batch.metadata.text[b][p] for p in preds])

                summary_path = manager.create_temp_file(summary)
                ref_paths = ids2refs[id]
                path_data.append([summary_path, ref_paths])

        config_text = rouge_papier.util.make_simple_config_text(path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(config_path,
                                        max_ngram=2,
                                        lcs=False,
                                        remove_stopwords=True)
        return df[-1:]
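
All of the examples on this page share the same rouge_papier pattern: write each system summary to a file, pair it with its reference paths, build a config file, and call compute_rouge. The sketch below strips that pattern down to plain strings; the top-level import and the reference path are assumptions, not part of Example #1.

import rouge_papier

# Minimal sketch of the shared pattern, assuming "refs/example-1.a.txt"
# already exists on disk; only the API calls come from the examples above.
with rouge_papier.util.TempFileManager() as manager:
    summary = "First system sentence.\nSecond system sentence."
    summary_path = manager.create_temp_file(summary)
    path_data = [[summary_path, ["refs/example-1.a.txt"]]]
    config_text = rouge_papier.util.make_simple_config_text(path_data)
    config_path = manager.create_temp_file(config_text)
    df = rouge_papier.compute_rouge(config_path, max_ngram=2, lcs=False)
    print(df[-1:])  # the last row holds the average over all entries
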
Example #2
def compute_rouge(model, dataset, reference_dir, remove_stopwords=True,
                  summary_length=100):

    model.eval()

    ids2refs = collect_reference_paths(reference_dir)

    with rouge_papier.util.TempFileManager() as manager:

        path_data = []
        for batch in dataset.iter_batch():
            texts = model.predict(batch.inputs, batch.metadata)
            
            for b, text in enumerate(texts):
                id = batch.metadata.id[b]
                summary = "\n".join(text)                
                summary_path = manager.create_temp_file(summary)
                ref_paths = ids2refs[id]
                path_data.append([summary_path, ref_paths])

        config_text = rouge_papier.util.make_simple_config_text(path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(
            config_path, max_ngram=2, lcs=False, 
            remove_stopwords=remove_stopwords,
            length=summary_length)
        return df[-1:]
Example #3
def compute_rouge(model, dataset, reference_dir):

    model.eval()

    ids2refs = collect_reference_paths(reference_dir)

    with rouge_papier.util.TempFileManager() as manager:

        path_data = []
        for batch in dataset.iter_batch():
            texts = model.extract(batch.inputs,
                                  batch.metadata,
                                  strategy="rank")

            for id, summary in zip(batch.metadata.id, texts):
                summary_path = manager.create_temp_file(summary)
                ref_paths = ids2refs[id]
                path_data.append([summary_path, ref_paths])

        config_text = rouge_papier.util.make_simple_config_text(path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(config_path,
                                        max_ngram=2,
                                        lcs=False,
                                        remove_stopwords=True)
        return df[-1:]
Example #4
def compute_rouge(model,
                  dataloader,
                  remove_stopwords=True,
                  summary_length=100):

    model.eval()

    hist = {}
    with rouge_papier.util.TempFileManager() as manager:

        path_data = []
        for batch in dataloader:
            texts, positions = model.predict(batch,
                                             return_indices=True,
                                             max_length=summary_length)
            for pos_b in positions:
                for p in pos_b:
                    hist[p] = hist.get(p, 0) + 1
            for b, text in enumerate(texts):
                id = batch.id[b]
                summary = "\n".join(text)
                summary_path = manager.create_temp_file(summary)
                path_data.append(
                    [summary_path, [str(x) for x in batch.reference_paths[b]]])

        config_text = rouge_papier.util.make_simple_config_text(path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(config_path,
                                        max_ngram=2,
                                        lcs=False,
                                        remove_stopwords=remove_stopwords,
                                        length=summary_length)
        return df[-1:], hist
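
Besides the ROUGE table, Example #4 returns a histogram mapping each selected sentence position to how often the model picked it. A hedged sketch of inspecting that histogram, with `model` and `validation_loader` as placeholder objects:

rouge_scores, hist = compute_rouge(model, validation_loader)

# Normalize counts to see which document positions the extractor favors.
total = sum(hist.values())
for position, count in sorted(hist.items()):
    print("sentence {:>3}: {:.1%}".format(position, count / total))
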
Example #5
def evaluate_method(inputs_path, abs_dir, output_dir, 
                    method="lead", summary_length=100):

    ids = []
    rouge_config_paths = []
    for f in os.listdir(inputs_path):
        with open("%s/%s"%(inputs_path,f), "r") as inp_fp:
            example = json.load(inp_fp)
            if method == "lead":
                sys_summary_text = make_lead(example, limit=summary_length)
            elif method == "tail":
                sys_summary_text = make_tail(example, limit=summary_length)
            elif method == "random":
                sys_summary_text = make_random(example, limit=summary_length)
            else:
                raise Exception("method not implemented: " + method)
            
            ref_paths = find_references(abs_dir, example["id"])
            sys_path = os.path.join(
                output_dir, "{}.summary".format(example["id"]))
            with open(sys_path, "w") as out_fp:
                out_fp.write(sys_summary_text)
            rouge_config_paths.append([sys_path, ref_paths])
            ids.append(example["id"])
    with rouge_papier.util.TempFileManager() as manager:
        config_text = rouge_papier.util.make_simple_config_text(
                rouge_config_paths)
        config_path = manager.create_temp_file(config_text)
        df, conf = rouge_papier.compute_rouge(
                config_path, max_ngram=2, lcs=True, 
                remove_stopwords=False,
                length=summary_length, return_conf=True)
        df.index = ids + ["average"]
        return df, conf
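
Because Example #5 passes return_conf=True, it returns both a per-document score table (indexed by example id, plus an "average" row) and the confidence output from rouge_papier. A possible call, with placeholder paths:

df, conf = evaluate_method("data/test/inputs", "data/test/abstracts",
                           "output/lead", method="lead")
print(df.loc["average"])  # mean ROUGE scores for the lead baseline
print(conf)               # confidence information from return_conf=True
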
Example #6
def partition_inputs(inputs_path,
                     abstracts,
                     inputs_out,
                     output_dir,
                     summary_length=100):

    ids = []
    rouge_config_paths = []
    for f in os.listdir(inputs_path):
        with open("%s/%s" % (inputs_path, f), "r") as inp_fp:
            example = json.load(inp_fp)
            sys_summary_text = make_lead(example, limit=summary_length)
            ref_paths = find_references(abstracts, example["id"])
            sys_path = os.path.join(output_dir,
                                    "{}.summary".format(example["id"]))
            with open(sys_path, "w") as out_fp:
                out_fp.write(sys_summary_text)
            rouge_config_paths.append([sys_path, ref_paths])
            ids.append(example["id"])
    with rouge_papier.util.TempFileManager() as manager:
        config_text = rouge_papier.util.make_simple_config_text(
            rouge_config_paths)
        config_path = manager.create_temp_file(config_text)
        df, conf = rouge_papier.compute_rouge(config_path,
                                              max_ngram=2,
                                              lcs=True,
                                              remove_stopwords=False,
                                              length=summary_length,
                                              return_conf=True)
        df.index = ids + ["average"]
        scored_ids = sorted(df.to_dict()["rouge-1"].items(),
                            key=lambda x: x[1])

    os.system("mkdir -p %s" % inputs_out)
    os.system("mkdir -p %s/%s" % (inputs_out, "inputs1"))
    os.system("mkdir -p %s/%s" % (inputs_out, "inputs2"))
    os.system("mkdir -p %s/%s" % (inputs_out, "inputs3"))
    os.system("mkdir -p %s/%s" % (inputs_out, "inputs4"))

    idx = int(len(scored_ids) / 4)
    for id, score in scored_ids[:idx]:
        if id != "average":
            os.system("cp %s/%s.json %s/inputs1/" %
                      (inputs_path, id, inputs_out))
    for id, score in scored_ids[idx:2 * idx]:
        if id != "average":
            os.system("cp %s/%s.json %s/inputs2/" %
                      (inputs_path, id, inputs_out))
    for id, score in scored_ids[2 * idx:3 * idx]:
        if id != "average":
            os.system("cp %s/%s.json %s/inputs3/" %
                      (inputs_path, id, inputs_out))
    for id, score in scored_ids[3 * idx:4 * idx]:
        if id != "average":
            os.system("cp %s/%s.json %s/inputs4/" %
                      (inputs_path, id, inputs_out))
Example #7
def compute_rouge(model,
                  dataset,
                  reference_dir,
                  output_dir,
                  remove_stopwords=True,
                  summary_length=100):

    model.eval()

    hist = {}
    ids2refs = collect_reference_paths(reference_dir)
    max_iters = int(np.ceil(dataset.size / dataset.batch_size))

    ordered_ids = []

    with rouge_papier.util.TempFileManager() as manager:

        path_data = []
        for i, batch in enumerate(dataset.iter_batch(), 1):
            sys.stdout.write("{}/{}\r".format(i, max_iters))
            sys.stdout.flush()

            texts, positions = model.predict(batch.inputs,
                                             batch.metadata,
                                             return_indices=True,
                                             max_length=summary_length + 25)
            for pos_b in positions:
                for p in pos_b:
                    p = int(p)
                    hist[p] = hist.get(p, 0) + 1
            for b, text in enumerate(texts):
                id = batch.metadata.id[b]
                summary = "\n".join(text)
                summary_path = os.path.join(output_dir,
                                            "{}.summary".format(id))
                with open(summary_path, "w") as sfp:
                    sfp.write(summary)
                ref_paths = ids2refs[id]
                path_data.append([summary_path, ref_paths])
                ordered_ids.append(id)

        print("")
        config_text = rouge_papier.util.make_simple_config_text(path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(config_path,
                                        max_ngram=2,
                                        lcs=True,
                                        remove_stopwords=remove_stopwords,
                                        length=summary_length)
        df.index = ordered_ids + ["average"]
        df = pd.concat([df[:-1].sort_index(), df[-1:]], axis=0)
        return df, hist
Example #8
def main(args=None):

    parser = argparse.ArgumentParser()
    parser.add_argument("--inputs", type=str, required=True)
    parser.add_argument("--remove-stopwords",
                        action="store_true",
                        required=False,
                        default=False)
    parser.add_argument("--reference-summary-dir", type=str, required=True)

    args = parser.parse_args(args)
    ids2refs = collect_reference_paths(args.reference_summary_dir)

    with rouge_papier.util.TempFileManager() as manager:
        data_paths = []
        with open(args.inputs, "r") as fp:
            for line in fp:
                example = json.loads(line)

                lines = []
                word_count = 0
                for sent in example["inputs"]:
                    lines.append(sent["text"])
                    word_count += sent["word_count"]
                    if word_count > 100:
                        break
                summary = "\n".join(lines)

                summary_path = manager.create_temp_file(summary)
                data_paths.append([summary_path, ids2refs[example["id"]]])
        config_text = rouge_papier.util.make_simple_config_text(data_paths)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(config_path,
                                        max_ngram=2,
                                        lcs=False,
                                        remove_stopwords=args.remove_stopwords)

    result = df[-1:]
    result.index = ["lead"]
    print(result)
    return result
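
Example #8 expects --inputs to be a JSON-lines file in which each record has an "id" and an "inputs" list of sentences with "text" and "word_count" fields. An illustrative record (field names from the snippet, values made up):

import json

example_line = json.dumps({
    "id": "example-1",
    "inputs": [
        {"text": "First sentence of the document.", "word_count": 6},
        {"text": "Second sentence of the document.", "word_count": 6},
    ],
})
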
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--reference-summaries", type=str, required=True)
    parser.add_argument("--system-summaries",
                        nargs="+",
                        type=str,
                        required=True)
    parser.add_argument("--system-names",
                        type=str,
                        nargs="+",
                        required=False,
                        default=None)
    args = parser.parse_args()

    if args.system_names is None:
        args.system_names = [path[-20:] for path in args.system_summaries]

    if len(args.system_names) != len(args.system_summaries):
        raise Exception("--system-names must have the same number of " \
                        "arguments as --system-summaries")

    data = []
    systems = []
    id2paths = read_reference_summary_manifest(args.reference_summaries)
    for sys_dir, sys_name in zip(args.system_summaries, args.system_names):
        sys_ids, sys_paths = read_system_summary_manifest(sys_dir, id2paths)
        sys_and_sum_paths = [[spth, id2paths[sid]]
                             for sid, spth in zip(sys_ids, sys_paths)]
        config_text = rouge_papier.util.make_simple_config_text(
            sys_and_sum_paths)
        with rouge_papier.util.TempFileManager() as manager:
            config_path = manager.create_temp_file(config_text)
            df = rouge_papier.compute_rouge(config_path, max_ngram=4, lcs=True)
            data.append(df[-1:])
            systems.append(sys_name)
    df = pd.concat(data, axis=0)
    df.index = systems
    print(df)
Example #10
    def compute(self):
        if len(self._path_data) == 0:
            raise NotComputableError(
                'PerlRouge must have at least one example before ' \
                'it can be computed')

        with rouge_papier.util.TempFileManager() as manager:

            config_text = rouge_papier.util.make_simple_config_text(
                self._path_data)
            config_path = manager.create_temp_file(config_text)
            df = rouge_papier.compute_rouge(
                config_path,
                max_ngram=2,
                lcs=False,
                remove_stopwords=self.remove_stopwords,
                length=self.summary_length)

        if self.delete_temp_files:
            for paths in self._path_data:
                pathlib.Path(paths[0]).unlink()

        return df.iloc[-1:].to_dict("records")[0]
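
Example #10 only shows the compute() method; _path_data, remove_stopwords, summary_length, and delete_temp_files are attributes of the surrounding metric class. A hedged sketch of how the rest of such a class might accumulate path data before compute() is called (the update and reset methods here are assumptions, not the original implementation):

import tempfile

class PerlRouge(object):
    def __init__(self, remove_stopwords=False, summary_length=100,
                 delete_temp_files=True):
        self.remove_stopwords = remove_stopwords
        self.summary_length = summary_length
        self.delete_temp_files = delete_temp_files
        self.reset()

    def reset(self):
        self._path_data = []

    def update(self, summary_text, reference_paths):
        # Write the system summary to its own temp file and remember the
        # pairing with its reference paths for the final ROUGE config.
        fd, path = tempfile.mkstemp(suffix=".summary", text=True)
        with open(fd, "w") as fp:
            fp.write(summary_text)
        self._path_data.append([path, [str(p) for p in reference_paths]])

    # compute() from Example #10 would complete the class.
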
Example #11
    ids2refs = collect_reference_paths(refs_path)

    rouge_paths = []
    with rouge_papier.util.TempFileManager() as manager:
        with open(dps_path, "r") as dps_fp:
            for line in dps_fp:
                dp = json.loads(line)
                id = dp["id"]
                query = dp["query"]
                qry_embds = dp["qembedding"]
                sentences = []
                tokens = []
                sen_embds = []
                for input in dp["inputs"]:
                    sen_id = input["sentence_id"]
                    sen_embds.append(input["embedding"])
                    sentences.append(input["text"])
                    tokens.append(input["text"].split(" "))
                ref_paths = ids2refs[id]
                inputs, metadata = get_inputs_metadata(
                    tokens, sentences, sen_embds, qry_embds)
                if inputs is not None:
                    summaries, _ = predictor.extract(
                        inputs, metadata, strategy=strategy,
                        word_limit=100, rescore=rescore)
                    summary_path = "%s/%s.pred" % (out_path, id)
                    write2file("%s\n" % summaries[0], summary_path)
                    rouge_paths.append([summary_path, ref_paths])

        # Compute ROUGE over all written predictions.
        config_text = rouge_papier.util.make_simple_config_text(rouge_paths)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(config_path, max_ngram=2, lcs=False)
        print(df[-1:])
Example #12
def main():
    parser = argparse.ArgumentParser(
        description="Evaluate nnsum models using the original Perl ROUGE script.")
    parser.add_argument("--batch-size", default=32, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument("--sentence-limit", default=None, type=int)
    parser.add_argument("--summary-length", type=int, default=100)
    parser.add_argument("--loader-workers", type=int, default=None)
    parser.add_argument("--remove-stopwords",
                        action="store_true",
                        default=False)
    parser.add_argument("--inputs", type=pathlib.Path, required=True)
    parser.add_argument("--refs", type=pathlib.Path, required=True)
    parser.add_argument("--model", type=pathlib.Path, required=True)
    parser.add_argument("--results",
                        type=pathlib.Path,
                        required=False,
                        default=None)

    args = parser.parse_args()

    if args.loader_workers is None:
        args.loader_workers = min(16, cpu_count())

    print("Loading model...", end="", flush=True)
    model = torch.load(args.model, map_location=lambda storage, loc: storage)
    if args.gpu > -1:
        model.cuda(args.gpu)
    vocab = model.embeddings.vocab
    print(" OK!")

    data = nnsum.data.SummarizationDataset(vocab,
                                           args.inputs,
                                           references_dir=args.refs,
                                           sentence_limit=args.sentence_limit)
    loader = nnsum.data.SummarizationDataLoader(
        data, batch_size=args.batch_size, num_workers=args.loader_workers)

    ids = []
    path_data = []
    model.eval()
    with rouge_papier.util.TempFileManager() as manager:
        with torch.no_grad():
            for step, batch in enumerate(loader, 1):
                batch = batch.to(args.gpu)
                print("generating summaries {} / {} ...".format(
                    step, len(loader)),
                      end="\r" if step < len(loader) else "\n",
                      flush=True)
                texts = model.predict(batch, max_length=args.summary_length)

                for text, ref_paths in zip(texts, batch.reference_paths):
                    summary = "\n".join(text)
                    summary_path = manager.create_temp_file(summary)
                    path_data.append(
                        [summary_path, [str(x) for x in ref_paths]])
                ids.extend(batch.id)

        config_text = rouge_papier.util.make_simple_config_text(path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(config_path,
                                        max_ngram=2,
                                        lcs=True,
                                        remove_stopwords=args.remove_stopwords,
                                        length=args.summary_length)
        df.index = ids + ["average"]
        df = pd.concat([df[:-1].sort_index(), df[-1:]], axis=0)
        print(df[-1:])

        if args.results:
            records = df[:-1].to_dict("records")

            results = {
                "idividual": {id: record
                              for id, record in zip(ids, records)},
                "average": df[-1:].to_dict("records")[0]
            }
            args.results.parent.mkdir(parents=True, exist_ok=True)
            with args.results.open("w") as fp:
                fp.write(json.dumps(results))
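
If --results is given, the script in Example #12 writes a JSON file with per-document scores under "individual" and the corpus mean under "average". A quick way to read it back (the path is a placeholder):

import json

with open("results/test.rouge.json", "r") as fp:
    results = json.load(fp)
print(results["average"])                       # mean ROUGE scores
print(len(results["individual"]), "documents scored")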