def diff():
    """Git-like diff tool comparing sentences generated by our tokenizer against the actual list of sentences."""
    click.secho("Loading corpus...")
    raw, sents = load_raw_corpus(False), load_sentence_corpus()

    y_true = [doc['sentences'] for doc in sents]
    y_pred = [[str(sent) for sent in Doc(doc)] for doc in raw]  # compare strings with strings, not Doc objects

    paths = file_paths()

    for i in range(len(y_true)):
        if y_true[i] != y_pred[i]:
            click.secho(f"Document {paths[i]}")

            for s_true in y_true[i]:
                if s_true not in y_pred[i]:
                    click.secho(f"+ {s_true}", fg="green")

            click.secho()

            for s_pred in y_pred[i]:
                if s_pred not in y_true[i]:
                    click.secho(f"- {s_pred}", fg="red")

            click.secho()
            click.secho()
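# Illustration of the diff semantics above (hypothetical data): sentences found
# only in the gold standard are printed with a leading "+" in green, sentences
# produced only by the tokenizer with a leading "-" in red, e.g.
#
#   Document news/0042.txt
#   + Merhaba dünya.
#   + Nasılsın?
#   - Merhaba dünya. Nasılsın?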
def build():
    """Build an ML-based sentence boundary detector (SBD)."""
    raw_corpus = load_raw_corpus(False)
    sent_corpus = load_sentence_corpus(False)

    features = flatten([[span.span_features() for span in Doc(raw).spans] for raw in raw_corpus])
    y = flatten([[is_eos(span, sent['sentences']) for span in Doc(raw).spans]
                 for raw, sent in zip(raw_corpus, sent_corpus)])

    if len(features) != len(y):
        raise Exception(
            f"Sanity check failed: feature list has length {len(features)} whereas target list has length {len(y)}.")

    sbd_model = create_model()

    scores = cross_val_score(sbd_model.pipeline, features, y, scoring="f1")

    for i, score in enumerate(scores):
        click.secho(f"Fold {i + 1}: {score:.4f}", fg="yellow")

    sbd_model.fit(features, y)

    click.secho("\nTop 20 Features")

    feature_importance = sbd_model.pipeline.steps[1][1].feature_importances_

    for idx in list(reversed(feature_importance.argsort()))[:20]:
        click.secho(f"    {sbd_model.pipeline.steps[0][1].feature_names_[idx]}: {feature_importance[idx]:.4f}",
                    fg="yellow")

    save_model(sbd_model)
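# A minimal sketch of what `create_model` is assumed to construct. The real
# implementation lives elsewhere in sadedegel; the wrapper below is hypothetical,
# inferred only from how build() uses it: `pipeline.steps[0][1].feature_names_`
# suggests a DictVectorizer-like first step, and `steps[1][1].feature_importances_`
# a tree-ensemble second step.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline


class _SBDModelSketch:
    """Hypothetical stand-in matching the `sbd_model.pipeline` / `fit` usage in build()."""

    def __init__(self):
        self.pipeline = Pipeline([
            ("vectorizer", DictVectorizer()),  # vectorizes span feature dicts; exposes feature_names_ after fit
            ("classifier", RandomForestClassifier()),  # exposes feature_importances_ after fit
        ])

    def fit(self, features, y):
        self.pipeline.fit(features, y)
        return self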
def diff(verbose):
    """Git-like diff tool comparing sentences generated by our tokenizer against the actual list of sentences."""
    click.secho("Loading corpus...")
    raw, sents = load_raw_corpus(False), load_sentence_corpus()

    y_true = [doc['sentences'] for doc in sents]
    y_pred = [[str(s) for s in Doc(doc)] for doc in raw]

    paths = file_paths()

    differ = Differ()

    for t, p, f in zip(y_true, y_pred, paths):
        table = Table(show_header=True, header_style="bold magenta", show_edge=False)
        table.add_column("true", style="dim", width=100)
        table.add_column("predict", style="dim", width=100)
        table.columns[0].style = "green"
        table.columns[1].style = "red"

        ndiff = 0
        match = 0

        for sent in differ.compare(p, t):
            if sent.startswith('+'):
                if match > 0 and verbose > 0:
                    table.add_row(f"[blue]{match} sentences...[/blue]", f"[blue]{match} sentences...[/blue]")
                match = 0
                table.add_row(sent[2:], "")
                ndiff += 1
            elif sent.startswith('-'):
                if match > 0 and verbose > 0:
                    table.add_row(f"[blue]{match} sentences...[/blue]", f"[blue]{match} sentences...[/blue]")
                match = 0
                table.add_row("", sent[2:])
                ndiff += 1
            elif sent.startswith('?'):
                continue  # difflib guide line, not a sentence; don't count it as a match
            else:
                match += 1

        if match > 0 and verbose > 0:
            table.add_row(f"[blue]{match} sentences...[/blue]", f"[blue]{match} sentences...[/blue]")

        if ndiff > 0:
            console.print(f)
            console.print(table)
            console.print(f"[blue]{len(t)} sentences...[/blue]")
            console.print()
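# For reference, `difflib.Differ.compare` yields items of the second list
# prefixed "+ ", items of the first list prefixed "- ", common items prefixed
# "  ", and occasionally "? " guide lines for near-matches (which is why diff()
# skips them). A standalone illustration with made-up sentences:
from difflib import Differ as _Differ


def _differ_demo():
    predicted = ["Merhaba dünya. Nasılsın?"]
    true = ["Merhaba dünya.", "Nasılsın?"]
    for line in _Differ().compare(predicted, true):
        print(repr(line))  # prints "- ", "+ ", "  " and possibly "? " prefixed lines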
def evaluate(v):
    """Evaluate IoU metric for different SBD algorithms over our stock dataset."""
    click.secho("Loading corpus...")
    raw, sents = load_raw_corpus(False), load_sentence_corpus()

    nltk = NLTKPunctTokenizer()
    reg = RegexpSentenceTokenizer()

    y_true = [doc['sentences'] for doc in sents]

    y_pred = [nltk(doc) for doc in raw]
    iou_eval("NLTKPunctTokenizer", y_true, y_pred, file_paths() if v > 0 else None)

    y_pred = [reg(doc) for doc in raw]
    iou_eval("RegexpSentenceTokenizer", y_true, y_pred, file_paths() if v > 0 else None)

    y_pred = [[s.text for s in Doc(doc).sents] for doc in raw]
    iou_eval("MLBasedTokenizer", y_true, y_pred, file_paths() if v > 0 else None)
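# `iou_eval` is imported from elsewhere in sadedegel. A minimal sketch of the
# per-document, sentence-level intersection-over-union it is assumed to
# aggregate; the helper below is hypothetical, not the library implementation:
def _iou(true_sentences, predicted_sentences):
    """IoU between two sentence lists, treated as sets."""
    t, p = set(true_sentences), set(predicted_sentences)
    if not t and not p:
        return 1.0
    return len(t & p) / len(t | p)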
def validate(v, base_path):
    """Validate raw, sentence, and annotated corpora for consistency (counts, spans, sentence order)."""
    from sadedegel.dataset import load_raw_corpus, load_sentence_corpus, load_annotated_corpus, file_paths, \
        CorpusTypeEnum

    click.secho("Corpus loading...")
    raw = load_raw_corpus(False, base_path)
    sents = load_sentence_corpus(False, base_path)
    anno = load_annotated_corpus(False, base_path)
    click.secho(".done.", fg="green")

    click.secho(f"Number of News Documents (raw): {len(raw)}".rjust(50))
    click.secho(f"Number of News Documents (sentences): {len(sents)}".rjust(50))
    click.secho(f"Number of News Documents (annotated): {len(anno)}".rjust(50))

    if len(anno) != len(sents):
        anno_files = file_paths(CorpusTypeEnum.ANNOTATED, True, True, base_path)
        sent_files = file_paths(CorpusTypeEnum.SENTENCE, True, True, base_path)

        click.secho("\nSymmetric difference between sentences & annotated corpus.")
        for diff in set(anno_files).symmetric_difference(set(sent_files)):
            click.secho(f"{diff}".rjust(50))

        click.secho(".warn", fg="yellow")

    click.secho("\nPerforming span checks...")
    for a, b, file in zip(raw, sents, file_paths()):
        for i, sent in enumerate(b['sentences']):
            if sent not in a:
                logger.error(f"{sent}[{i}]\n\t\tis not a span in raw document\n{a}\n\nCorpus file: {file}")
                sys.exit(1)
    click.secho(".done", fg="green")

    click.secho("\nPerforming span order checks...")
    for a, b, file in zip(raw, sents, file_paths()):
        start = 0
        for i, sent in enumerate(b['sentences']):
            idx = a.find(sent, start)
            if idx == -1:
                logger.error(f"{sent}[{i}]\n\t\tis potentially out of order in the \"sentences\" array "
                             f"of the sentence corpus\n{a}\n\nCorpus file: {file}")
                sys.exit(1)
            else:
                start = idx + len(sent)  # resume the search right after the sentence just matched
    click.secho(".done", fg="green")

    click.secho("\nComparing annotated corpus with sentences corpus...")
    anno_names = file_paths(CorpusTypeEnum.ANNOTATED, noext=True, use_basename=True, base_path=base_path)
    sents_names = file_paths(CorpusTypeEnum.SENTENCE, noext=True, use_basename=True, base_path=base_path)

    anno_dict = dict((name, doc) for name, doc in zip(anno_names, anno))
    sents_dict = dict((name, doc) for name, doc in zip(sents_names, sents))

    match = 0
    for _name, _anno in anno_dict.items():
        sent = sents_dict[_name]

        if sent['sentences'] != _anno['sentences']:
            click.secho(f"\nSentences in annotated corpus {_name} don't match the document in sentence corpus.")
            sys.exit(1)
        else:
            match += 1

    click.secho(f".done ({match}/{len(anno_dict)})", fg="green")
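# The span-order check in validate() reduces to verifying that the gold
# sentences occur in the raw text left to right. A self-contained sketch of the
# same logic (hypothetical helper, not part of the module's public API):
def _sentences_in_order(raw_text: str, sentences: list) -> bool:
    """Return True iff every sentence occurs in raw_text, each after the previous one."""
    start = 0
    for sentence in sentences:
        idx = raw_text.find(sentence, start)
        if idx == -1:
            return False
        start = idx + len(sentence)
    return True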