import argparse
import json
import os
import random

import numpy as np

# load_corpus and GloballyClusteredSentenceCompressionTimelineGenerator are assumed to be
# provided by the surrounding project (see the tleval import in a later snippet).


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("corpora", nargs="+")
    parser.add_argument("outfile")

    args = parser.parse_args()

    if os.path.isfile(args.outfile):
        print("WARNING: file {} exists. Continue? (Y/n)".format(args.outfile))
        if input().strip().lower() == "n":
            return

    with open(args.config) as f:
        config = json.load(f)

    flat_candidates = []
    all_sentences = []
    for corpus_name in args.corpora:
        corpus = load_corpus(corpus_name)

        tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)
        tl_gen.generator.prepare(corpus)

        clusters = tl_gen.create_clusters(corpus)
        cluster_candidates = tl_gen.generate_candidates_for_clusters(
            corpus, clusters)

        for candidates in cluster_candidates:
            for candidate, _ in candidates:
                flat_candidates.append(candidate)

        all_sentences.extend(corpus.sentences)

    num_samples = min(len(all_sentences), len(flat_candidates))

    sentence_samples = random.sample(all_sentences, num_samples)
    candidate_samples = random.sample(flat_candidates, num_samples)
    sentence_sample_lens = []
    candidate_sample_lens = []

    with open(args.outfile, "w") as f_out:
        for sent_sam in sentence_samples:
            f_out.write(":S\t")
            f_out.write(sent_sam.as_tokenized_string_with_attribute("pos"))
            f_out.write("\n")
            sentence_sample_lens.append(len(sent_sam))
        for cand_sam in candidate_samples:
            f_out.write(":C\t")
            f_out.write(" ".join(map(lambda x: x[0] + "/" + x[1], cand_sam)))
            f_out.write("\n")
            candidate_sample_lens.append(len(cand_sam))

    print(
        np.array(sentence_sample_lens).mean(),
        np.array(sentence_sample_lens).std())
    print(
        np.array(candidate_sample_lens).mean(),
        np.array(candidate_sample_lens).std())
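

# The sample file written above holds one sample per line: ":S\t" lines carry original
# sentences and ":C\t" lines carry compression candidates, each as space-separated
# token/POS pairs (assuming as_tokenized_string_with_attribute uses the same token/POS
# convention as the candidate lines). read_samples is an illustrative helper, not part
# of the original module:
def read_samples(path):
    samples = []
    with open(path) as f_in:
        for line in f_in:
            kind, _, body = line.rstrip("\n").partition("\t")
            pairs = [tuple(tok.rsplit("/", 1)) for tok in body.split()]
            samples.append((kind.lstrip(":"), pairs))
    return samples

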
def gen_stats(args):
    all_samples = []

    for corpus_fname in args.corpora:
        for config_name in args.configs.split(","):
            with open(config_name) as f:
                config = json.load(f)
            corpus = load_corpus(corpus_fname)
            tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)
            samples = tl_gen.generate_date_docs_cluster_stats(corpus)

            all_samples.extend(samples)

    plt.scatter([s[0] for s in all_samples], [s[1] for s in all_samples])
    plt.show()
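

# gen_stats() expects an argparse namespace with `corpora` (one or more corpus files) and
# `configs` (a comma-separated list of JSON config paths). A minimal sketch of the CLI
# wiring it assumes (not part of the original snippet):
def gen_stats_cli():
    parser = argparse.ArgumentParser()
    parser.add_argument("configs", help="comma-separated list of JSON config files")
    parser.add_argument("corpora", nargs="+", help="corpus files to analyse")
    gen_stats(parser.parse_args())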
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("corpus_pickles", nargs="+")
    parser.add_argument("-p",
                        dest="use_pos",
                        action="store_true",
                        default=False)

    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    config_basename = os.path.basename(args.config)

    for corpus_pickle in args.corpus_pickles:
        corpus = load_corpus(corpus_pickle)
        print(corpus.name)
        export(corpus, config, config_basename, args.use_pos)
Example #4
import re
from tleval import load_corpus

import sys
import pickle


def clean_doc(doc):
    """Truncate a document at the first boilerplate sentence (byline or 'Report abuse' footer)."""
    for idx, sent in enumerate(doc):
        text = sent.as_tokenized_string()
        if re.search(r"By .* \|", text) or re.search(r"\|.*\| Report abuse", text):
            # Drop the boilerplate sentence and everything after it.
            doc.sentences = doc.sentences[:idx]
            break
        elif "|" in text:
            # Report pipe-containing sentences that did not match a known boilerplate pattern.
            print(text)


if __name__ == "__main__":
    corpus = load_corpus(sys.argv[1])

    for doc in corpus:
        clean_doc(doc)

    with open(sys.argv[2], "wb") as f_out:
        pickle.dump(corpus, f_out)
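

# A self-contained illustration of clean_doc()'s truncation behaviour. The _Sentence and
# _Doc stubs are hypothetical stand-ins for the real corpus classes from the surrounding
# project; they only implement the pieces clean_doc() touches.
class _Sentence:
    def __init__(self, text):
        self._text = text

    def as_tokenized_string(self):
        return self._text


class _Doc:
    def __init__(self, sentences):
        self.sentences = sentences

    def __iter__(self):
        return iter(self.sentences)


def _demo_clean_doc():
    doc = _Doc([
        _Sentence("Protests continued in the capital ."),
        _Sentence("By Jane Doe | Reporter"),
        _Sentence("Share this article | Report abuse"),
    ])
    clean_doc(doc)
    # Only the first sentence survives: the byline pattern triggers truncation.
    print([s.as_tokenized_string() for s in doc.sentences])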


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("configs")
    parser.add_argument("corpora", nargs="+")

    args = parser.parse_args()

    cluster_lengths = Counter()
    all_cluster_factors = list()
    corpus_sizes = defaultdict(list)
    corpus_doc_counts = defaultdict(list)

    for corpus_fname in args.corpora:
        for config_name in args.configs.split(","):
            with open(config_name) as f:
                config = json.load(f)
            corpus = load_corpus(corpus_fname, False)
            tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)

            _, _, _, cluster_candidates = tl_gen.get_promises(corpus)
            #cluster_len = sum(len(c) for c in clusters)
            #cluster_lengths.update(len(c) for c in clusters)
            corpus_len = len(corpus.sentences)
            corpus_sizes[os.path.basename(corpus_fname).split("-")[0]].append(corpus_len)
            corpus_doc_counts[os.path.basename(corpus_fname).split("-")[0]].append(len(corpus))

            factor = sum(len(c) for c in cluster_candidates) / corpus_len
            print(factor)

            all_cluster_factors.append(factor)

    for c_name, lens in corpus_sizes.items():
        # Placeholder summary (the original loop body is not preserved): report the
        # average number of sentences per corpus name prefix.
        print(c_name, sum(lens) / len(lens))

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("sys_1_results_dir")
    parser.add_argument("sys_2_results_dir")

    args = parser.parse_args()

    results_1 = read_results_dir(args.sys_1_results_dir)
    results_2 = read_results_dir(args.sys_2_results_dir)
    score_diffs = []
    available_sents = []
    compression_rates = []
    spreads = []

    for corpus_name in results_1:
        if corpus_name not in results_2:
            continue

        corpus = load_corpus("corpora/" + corpus_name.rsplit(".")[0] + ".pkl")

        for tl_name, result_1 in results_1[corpus_name].items():
            result_2 = results_2[corpus_name][tl_name]
            with open("gold-timelines/" + corpus_name.split(".")[0] + "/" +
                      tl_name,
                      errors="ignore") as f:
                print("gold-timelines/" + corpus_name.split(".")[0] + "/" +
                      tl_name)
                gold_tl = Timeline.from_file(f)

            total_tl_length = sum(map(len,
                                      gold_tl.dates_to_summaries.values()))
            total_corpus_length = len(corpus.sentences)

            score_diffs.append(result_1.rouge_2_align.f1 -
                               result_2.rouge_2_align.f1)
            available_sents.append(
                len(
                    corpus.docs_between_dates(min(gold_tl.get_dates()),
                                              max(gold_tl.get_dates()))))
            compression_rates.append(1.0 -
                                     (total_tl_length / total_corpus_length))

            spreads.append(compute_spread(gold_tl))

    print("Sents", scipy.stats.spearmanr(available_sents, score_diffs))
    print("Compression", scipy.stats.spearmanr(compression_rates, score_diffs))
    print("Spread", scipy.stats.spearmanr(spreads, score_diffs))

    plt.axhline(color="b")
    plt.scatter(
        available_sents,
        score_diffs,
        c=["r" if score_diff <= 0.0 else "b" for score_diff in score_diffs])
    plt.figure()
    plt.axhline(color="b")
    plt.scatter(
        compression_rates,
        score_diffs,
        c=["r" if score_diff <= 0.0 else "b" for score_diff in score_diffs])
    plt.figure()
    plt.axhline(color="b")
    plt.scatter(
        spreads,
        score_diffs,
        c=["r" if score_diff <= 0.0 else "b" for score_diff in score_diffs])

    plt.show()