def build(): """Build a ML based SBD""" raw_corpus = load_raw_corpus(False) sent_corpus = load_sentence_corpus(False) features = flatten([[span.span_features() for span in Doc(raw).spans] for raw in raw_corpus]) y = flatten([[is_eos(span, sent['sentences']) for span in Doc(raw).spans] for raw, sent in zip(raw_corpus, sent_corpus)]) if len(features) != len(y): raise Exception( f"Sanity check failed feature list length {len(features)} whereas target list length {len(y)}." ) sbd_model = create_model() scores = cross_val_score(sbd_model.pipeline, features, y, scoring="f1") for i, score in enumerate(scores): click.secho(f"Fold {i + 1}: {score:.4f}", fg="yellow") sbd_model.fit(features, y) click.secho("\nTop 10 Features") feature_importance = sbd_model.pipeline.steps[1][1].feature_importances_ for idx in list(reversed(feature_importance.argsort()))[:20]: click.secho( f" {sbd_model.pipeline.steps[0][1].feature_names_[idx]}: {feature_importance[idx]:.4f}", fg="yellow") save_model(sbd_model)
def diff(): """Git like diff tool to compare sentence generated by our tokenizer vs actual list of sentences.""" click.secho("Loading corpus...") raw, sents = load_raw_corpus(False), load_sentence_corpus() y_true = [doc['sentences'] for doc in sents] y_pred = [Doc(doc) for doc in raw] paths = file_paths() for i in range(len(y_true)): if y_true[i] != y_pred[i]: click.secho(f"Document {paths[i]}") for s_true in y_true[i]: if s_true not in y_pred[i]: click.secho(f"+ {s_true}", fg="green") click.secho() for s_pred in y_pred[i]: if s_pred not in y_true[i]: click.secho(f"- {s_pred}", fg="red") click.secho() click.secho()
def evaluate(table_format, tag, debug): """Evaluate all summarizers in sadedeGel""" if not debug: warnings.filterwarnings("ignore") anno = load_annotated_corpus(False) relevance = [[doc['relevance']] for doc in anno] summarizers = [ summ for summ in SUMMARIZERS if any(_tag in summ[1] for _tag in tag) ] scores = defaultdict(list) for word_tokenizer in ['simple', 'bert']: click.echo("Word Tokenizer: " + click.style(f"{word_tokenizer}", fg="blue")) docs = [Doc.from_sentences(doc['sentences']) for doc in anno] # Reset document because of memoization with tokenizer_context(word_tokenizer): for name, summarizer in summarizers: click.echo(click.style(f" {name} ", fg="magenta"), nl=False) # skip simple tokenizer for clustering models if ("cluster" in summarizer or "rank" in summarizer or name == "TFIDF Summarizer") and \ word_tokenizer == "simple": click.echo(click.style("SKIP", fg="yellow")) continue for i, (y_true, d) in enumerate(zip(relevance, docs)): dot_progress(i, len(relevance)) y_pred = [summarizer.predict(d.sents)] score_10 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.1)) score_50 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.5)) score_80 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.8)) scores[f"{name} - {word_tokenizer}"].append( (score_10, score_50, score_80)) table = [[ algo, np.array([s[0] for s in scores]).mean(), np.array([s[1] for s in scores]).mean(), np.array([s[2] for s in scores]).mean() ] for algo, scores in scores.items()] # TODO: Sample weight of instances. print( tabulate(table, headers=[ 'Method & Tokenizer', 'ndcg(k=0.1)', 'ndcg(k=0.5)', 'ndcg(k=0.8)' ], tablefmt=table_format, floatfmt=".4f")) if debug: click.echo(np.array(table).shape)
def diff(verbose): """Git like diff tool to compare sentence generated by our tokenizer vs actual list of sentences.""" click.secho("Loading corpus...") raw, sents = load_raw_corpus(False), load_sentence_corpus() y_true = [doc['sentences'] for doc in sents] y_pred = [[str(s) for s in Doc(doc)] for doc in raw] paths = file_paths() differ = Differ() for t, p, f in zip(y_true, y_pred, paths): table = Table(show_header=True, header_style="bold magenta", show_edge=False) table.add_column("true", style="dim", width=100) table.add_column("predict", style="dim", width=100) table.columns[0].style = "green" table.columns[1].style = "red" ndiff = 0 match = 0 for sent in differ.compare(p, t): if sent.startswith('+'): if match > 0 and verbose > 0: table.add_row(f"[blue]{match} sentences...[/blue]", f"[blue]{match} sentences...[/blue]") match = 0 table.add_row(sent[2:], "") ndiff += 1 elif sent.startswith('-'): if match > 0 and verbose > 0: table.add_row(f"[blue]{match} sentences...[/blue]", f"[blue]{match} sentences...[/blue]") match = 0 table.add_row("", sent[2:]) ndiff += 1 else: match += 1 if match > 0 and verbose > 0: table.add_row(f"[blue]{match} sentences...[/blue]", f"[blue]{match} sentences...[/blue]") if ndiff > 0: console.print(f) console.print(table) console.print(f"[blue]{len(t)} sentences...[/blue]") console.print()
def evaluate(table_format, tag, debug): """Evaluate all summarizers in sadedeGel""" if not debug: warnings.filterwarnings("ignore") anno = load_annotated_corpus(False) summarizers = [summ for summ in SUMMARIZERS if any(_tag in summ[1] for _tag in tag)] scores = defaultdict(list) for word_tokenizer in tqdm(['simple', 'bert'], unit=" word-tokenizer"): with tokenizer_context(word_tokenizer): for name, summarizer in tqdm(summarizers, unit=" method"): # skip simple tokenizer for clustering models if "cluster" in summarizer and word_tokenizer == "simple": continue for doc in tqdm(anno, unit=" doc", desc=f"Evaluating {name}"): y_true = [doc['relevance']] d = Doc.from_sentences(doc['sentences']) y_pred = [summarizer.predict(d.sents)] score_10 = ndcg_score(y_true, y_pred, k=ceil(len(doc['sentences']) * 0.1)) score_50 = ndcg_score(y_true, y_pred, k=ceil(len(doc['sentences']) * 0.5)) score_80 = ndcg_score(y_true, y_pred, k=ceil(len(doc['sentences']) * 0.8)) scores[f"{name} - {word_tokenizer}"].append((score_10, score_50, score_80)) table = [[algo, np.array([s[0] for s in scores]).mean(), np.array([s[1] for s in scores]).mean(), np.array([s[2] for s in scores]).mean()] for algo, scores in scores.items()] # TODO: Sample weight of instances. print( tabulate(table, headers=['Method & Tokenizer', 'ndcg(k=0.1)', 'ndcg(k=0.5)', 'ndcg(k=0.8)'], tablefmt=table_format, floatfmt=".4f")) if debug: click.echo(np.array(table).shape)
def sbd(data_home): """Generate sentence boundary detected corpus out of raw document corpus.""" data_home = Path(os.path.expanduser(data_home)) logger.info(f"Data directory for extended data {data_home}") raw_dir = data_home / 'extended' / 'raw' for section in raw_dir.iterdir(): sents_dir = section.parent.parent / 'sents' / str(section.name) if section.is_dir(): sents_dir.mkdir(parents=True, exist_ok=True) for raw in track(glob.glob( str((raw_dir / section / '*.txt').absolute())), description=f"{section.name} documents"): fn_noext, _ = os.path.splitext(os.path.basename(raw)) target = (sents_dir / f"{fn_noext}.json").absolute() if not os.path.exists(target) or (os.path.exists(target) and os.path.getsize(target) == 0): try: d = Doc(safe_read(raw)) with open(target, 'w') as wp: json.dump(dict(sentences=[s.text for s in d], rouge1=[s.rouge1("f1") for s in d]), wp, ensure_ascii=False) except: logger.exception(f"Error in processing document {raw}") raise
def evaluate(v): """Evaluate IoU metric for different SBD algorithms over our stock dataset.""" click.secho("Loading corpus...") raw, sents = load_raw_corpus(False), load_sentence_corpus() nltk = NLTKPunctTokenizer() reg = RegexpSentenceTokenizer() y_pred = [nltk(doc) for doc in raw] y_true = [doc['sentences'] for doc in sents] iou_eval("NLTKPunctTokenizer", y_true, y_pred, file_paths() if v > 0 else None) y_pred = [reg(doc) for doc in raw] iou_eval("RegexpSentenceTokenizer", y_true, y_pred, file_paths() if v > 0 else None) y_pred = [[s.text for s in Doc(doc)] for doc in raw] iou_eval("MLBasedTokenizer", y_true, y_pred, file_paths() if v > 0 else None)
def test_lxr_summarizer_all_lower():
    summ = LexRankSummarizer("log_norm", "smooth", normalize=False)
    assert summ.predict(Doc('ali topu tut. oya ip atla. ahmet topu at.')) == approx(
        np.array([1., 1., 1.]))


def test_lxr_summarize_text():
    summ = LexRankSummarizer("log_norm", "smooth")
    doc = Doc('ali topu tut. oya ip atla. ahmet topu at.')
    assert summ(doc, k=1) == [doc[2]]


def test_lxr_summarizer_proper_case():
    summ = LexRankSummarizer("log_norm", "smooth", normalize=False)
    assert summ.predict(Doc('Ali topu tut. Oya ip atla. Ahmet topu at.')) == approx(
        np.array([1., 1., 1.]))