import argparse
import json
import os
import random

import numpy as np

from tleval import load_corpus
# GloballyClusteredSentenceCompressionTimelineGenerator is a project-local class;
# its import is not shown in this excerpt.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("corpora", nargs="+")
    parser.add_argument("outfile")

    args = parser.parse_args()

    # Avoid silently overwriting an existing output file.
    if os.path.isfile(args.outfile):
        print("WARNING: file {} exists. Continue (Y/n)".format(args.outfile))
        if input() == "n":
            return

    with open(args.config) as f:
        config = json.load(f)

    flat_candidates = []
    all_sentences = []

    # Collect all original sentences and all compression candidates
    # across the given corpora.
    for corpus_name in args.corpora:
        corpus = load_corpus(corpus_name)

        tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)
        tl_gen.generator.prepare(corpus)

        clusters = tl_gen.create_clusters(corpus)
        cluster_candidates = tl_gen.generate_candidates_for_clusters(
            corpus, clusters)

        for candidates in cluster_candidates:
            for candidate, _ in candidates:
                flat_candidates.append(candidate)

        all_sentences.extend(corpus.sentences)

    # Sample equally many original sentences and candidates.
    num_samples = min(len(all_sentences), len(flat_candidates))
    sentence_samples = random.sample(all_sentences, num_samples)
    candidate_samples = random.sample(flat_candidates, num_samples)

    sentence_sample_lens = []
    candidate_sample_lens = []

    # Write sentences (":S") and candidates (":C") as POS-tagged token strings.
    with open(args.outfile, "w") as f_out:
        for sent_sam in sentence_samples:
            f_out.write(":S\t")
            f_out.write(sent_sam.as_tokenized_string_with_attribute("pos"))
            f_out.write("\n")
            sentence_sample_lens.append(len(sent_sam))

        for cand_sam in candidate_samples:
            f_out.write(":C\t")
            f_out.write(" ".join(map(lambda x: x[0] + "/" + x[1], cand_sam)))
            f_out.write("\n")
            candidate_sample_lens.append(len(cand_sam))

    # Report mean and standard deviation of the sampled lengths.
    print(
        np.array(sentence_sample_lens).mean(),
        np.array(sentence_sample_lens).std())
    print(
        np.array(candidate_sample_lens).mean(),
        np.array(candidate_sample_lens).std())
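# A guarded entry point is assumed here, matching the pattern used elsewhere in
# this repo; the invocation below is only an illustration with hypothetical
# script and file names:
#
#   python sample_candidates.py config.json corpus1.pkl corpus2.pkl samples.txt
if __name__ == "__main__":
    main()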
import json

import matplotlib.pyplot as plt

from tleval import load_corpus


def gen_stats(args):
    all_samples = []

    for corpus_fname in args.corpora:
        for config_name in args.configs.split(","):
            with open(config_name) as f:
                config = json.load(f)

            corpus = load_corpus(corpus_fname)

            tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)
            samples = tl_gen.generate_date_docs_cluster_stats(corpus)
            all_samples.extend(samples)

    # Scatter plot of the collected (x, y) statistic pairs.
    plt.scatter([s[0] for s in all_samples], [s[1] for s in all_samples])
    plt.show()
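# gen_stats expects an ``args`` namespace with ``corpora`` and ``configs``
# attributes, but no argument parser is shown for it here. A minimal
# entry-point sketch, assuming the same CLI layout as the other scripts in
# this repo:
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("configs")
    parser.add_argument("corpora", nargs="+")
    gen_stats(parser.parse_args())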
import argparse
import json
import os

from tleval import load_corpus


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("corpus_pickles", nargs="+")
    parser.add_argument("-p", dest="use_pos", action="store_true", default=False)

    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    config_basename = os.path.basename(args.config)

    # Export every corpus pickle using this configuration.
    # ``export`` is defined elsewhere and not shown in this excerpt.
    for corpus_pickle in args.corpus_pickles:
        corpus = load_corpus(corpus_pickle)
        print(corpus.name)
        export(corpus, config, config_basename, args.use_pos)
import re
import sys
import pickle

from tleval import load_corpus


def clean_doc(doc):
    # Truncate a document at the first sentence that looks like trailing
    # boilerplate (a "By ... |" byline or a "| Report abuse" footer).
    for idx, sent in enumerate(doc):
        #print(sent.as_tokenized_string())
        if (re.search(r"By .* \|", sent.as_tokenized_string())
                or re.search(r"\|.*\| Report abuse", sent.as_tokenized_string())):
            #print("Deleting", sent.as_tokenized_string())
            doc.sentences = doc.sentences[:idx]
            break
        elif "|" in sent.as_tokenized_string():
            print(sent.as_tokenized_string())


if __name__ == "__main__":
    corpus = load_corpus(sys.argv[1])

    for doc in corpus:
        clean_doc(doc)

    with open(sys.argv[2], "wb") as f_out:
        pickle.dump(corpus, f_out)
import argparse
import json
import os
from collections import Counter, defaultdict

from tleval import load_corpus


parser = argparse.ArgumentParser()
parser.add_argument("configs")
parser.add_argument("corpora", nargs="+")

args = parser.parse_args()

cluster_lengths = Counter()
all_cluster_factors = list()

corpus_sizes = defaultdict(list)
corpus_doc_counts = defaultdict(list)

for corpus_fname in args.corpora:
    for config_name in args.configs.split(","):
        with open(config_name) as f:
            config = json.load(f)

        corpus = load_corpus(corpus_fname, False)

        tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)
        _, _, _, cluster_candidates = tl_gen.get_promises(corpus)
        #cluster_len = sum(len(c) for c in clusters)
        #cluster_lengths.update(len(c) for c in clusters)

        corpus_len = len(corpus.sentences)

        # Key statistics by the corpus file name prefix (text before the first "-").
        corpus_sizes[os.path.basename(corpus_fname).split("-")[0]].append(corpus_len)
        corpus_doc_counts[os.path.basename(corpus_fname).split("-")[0]].append(len(corpus))

        # Ratio of candidate sentences (summed over all clusters) to corpus sentences.
        factor = sum(len(c) for c in cluster_candidates) / corpus_len
        print(factor)
        all_cluster_factors.append(factor)

for c_name, lens in corpus_sizes.items():
import argparse

import scipy.stats
import matplotlib.pyplot as plt

from tleval import load_corpus
# read_results_dir, Timeline and compute_spread are project-local helpers;
# their imports are not shown in this excerpt.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("sys_1_results_dir")
    parser.add_argument("sys_2_results_dir")

    args = parser.parse_args()

    results_1 = read_results_dir(args.sys_1_results_dir)
    results_2 = read_results_dir(args.sys_2_results_dir)

    score_diffs = []
    available_sents = []
    compression_rates = []
    spreads = []

    for corpus_name in results_1:
        if corpus_name not in results_2:
            continue

        corpus = load_corpus("corpora/" + corpus_name.rsplit(".")[0] + ".pkl")

        for tl_name, result_1 in results_1[corpus_name].items():
            result_2 = results_2[corpus_name][tl_name]

            with open("gold-timelines/" + corpus_name.split(".")[0] + "/" + tl_name,
                      errors="ignore") as f:
                print("gold-timelines/" + corpus_name.split(".")[0] + "/" + tl_name)
                gold_tl = Timeline.from_file(f)

            total_tl_length = sum(map(len, gold_tl.dates_to_summaries.values()))
            total_corpus_length = len(corpus.sentences)

            # Per-timeline difference in ROUGE-2 alignment F1 between the two systems.
            score_diffs.append(result_1.rouge_2_align.f1 - result_2.rouge_2_align.f1)

            # Number of documents available between the first and last gold date.
            available_sents.append(
                len(
                    corpus.docs_between_dates(
                        min(gold_tl.get_dates()), max(gold_tl.get_dates()))))

            compression_rates.append(1.0 - (total_tl_length / total_corpus_length))
            spreads.append(compute_spread(gold_tl))

    # Rank correlation between each corpus/timeline property and the score difference.
    print("Sents", scipy.stats.spearmanr(available_sents, score_diffs))
    print("Compression", scipy.stats.spearmanr(compression_rates, score_diffs))
    print("Spread", scipy.stats.spearmanr(spreads, score_diffs))

    plt.axhline(color="b")
    plt.scatter(
        available_sents,
        score_diffs,
        c=["r" if score_diff <= 0.0 else "b" for score_diff in score_diffs])

    plt.figure()
    plt.axhline(color="b")
    plt.scatter(
        compression_rates,
        score_diffs,
        c=["r" if score_diff <= 0.0 else "b" for score_diff in score_diffs])

    plt.figure()
    plt.axhline(color="b")
    plt.scatter(
        spreads,
        score_diffs,
        c=["r" if score_diff <= 0.0 else "b" for score_diff in score_diffs])

    plt.show()