Code Example #1
File: __main__.py  Project: GlobalMaksimum/sadedegel
def build():
    """Build a ML based SBD"""

    raw_corpus = load_raw_corpus(False)
    sent_corpus = load_sentence_corpus(False)

    features = flatten([[span.span_features() for span in Doc(raw).spans]
                        for raw in raw_corpus])
    y = flatten([[is_eos(span, sent['sentences']) for span in Doc(raw).spans]
                 for raw, sent in zip(raw_corpus, sent_corpus)])

    if len(features) != len(y):
        raise ValueError(
            f"Sanity check failed: feature list has length {len(features)} "
            f"whereas target list has length {len(y)}."
        )

    sbd_model = create_model()

    scores = cross_val_score(sbd_model.pipeline, features, y, scoring="f1")

    for i, score in enumerate(scores):
        click.secho(f"Fold {i + 1}: {score:.4f}", fg="yellow")

    sbd_model.fit(features, y)

    click.secho("\nTop 10 Features")
    feature_importance = sbd_model.pipeline.steps[1][1].feature_importances_
    for idx in list(reversed(feature_importance.argsort()))[:20]:
        click.secho(
            f"    {sbd_model.pipeline.steps[0][1].feature_names_[idx]}: {feature_importance[idx]:.4f}",
            fg="yellow")

    save_model(sbd_model)
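The snippet relies on two project helpers that are not shown here: flatten, which collapses one level of nesting, and is_eos, which labels each candidate span as end-of-sentence or not. A minimal flatten sketch, inferred from its usage above (an assumption, not the project's actual implementation):

from typing import Iterable, List, TypeVar

T = TypeVar("T")


def flatten(nested: Iterable[Iterable[T]]) -> List[T]:
    # Collapse one level of nesting: [[a, b], [c]] -> [a, b, c]
    return [item for inner in nested for item in inner]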
Code Example #2
def diff():
    """Git like diff tool to compare sentence generated by our tokenizer vs actual list of sentences."""
    click.secho("Loading corpus...")
    raw, sents = load_raw_corpus(False), load_sentence_corpus()

    y_true = [doc['sentences'] for doc in sents]

    y_pred = [[str(s) for s in Doc(doc)] for doc in raw]  # str() so sentences compare equal to the gold strings

    paths = file_paths()

    for i in range(len(y_true)):

        if y_true[i] != y_pred[i]:
            click.secho(f"Document {paths[i]}")
            for s_true in y_true[i]:
                if s_true not in y_pred[i]:
                    click.secho(f"+ {s_true}", fg="green")

            click.secho()

            for s_pred in y_pred[i]:
                if s_pred not in y_true[i]:
                    click.secho(f"- {s_pred}", fg="red")

            click.secho()
            click.secho()
Code Example #3
File: __main__.py  Project: bhbduman/sadedegel
def evaluate(table_format, tag, debug):
    """Evaluate all summarizers in sadedeGel"""
    if not debug:
        warnings.filterwarnings("ignore")

    anno = load_annotated_corpus(False)
    relevance = [[doc['relevance']] for doc in anno]

    summarizers = [
        summ for summ in SUMMARIZERS if any(_tag in summ[1] for _tag in tag)
    ]

    scores = defaultdict(list)

    for word_tokenizer in ['simple', 'bert']:
        click.echo("Word Tokenizer: " +
                   click.style(f"{word_tokenizer}", fg="blue"))
        docs = [Doc.from_sentences(doc['sentences'])
                for doc in anno]  # Reset document because of memoization
        with tokenizer_context(word_tokenizer):
            for name, summarizer in summarizers:
                click.echo(click.style(f"    {name} ", fg="magenta"), nl=False)
                # skip simple tokenizer for clustering models
                if ("cluster" in summarizer or "rank" in summarizer or name == "TFIDF Summarizer") and \
                        word_tokenizer == "simple":
                    click.echo(click.style("SKIP", fg="yellow"))
                    continue

                for i, (y_true, d) in enumerate(zip(relevance, docs)):
                    dot_progress(i, len(relevance))

                    y_pred = [summarizer.predict(d.sents)]

                    score_10 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.1))
                    score_50 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.5))
                    score_80 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.8))

                    scores[f"{name} - {word_tokenizer}"].append(
                        (score_10, score_50, score_80))

    table = [[
        algo,
        np.array([s[0] for s in vals]).mean(),
        np.array([s[1] for s in vals]).mean(),
        np.array([s[2] for s in vals]).mean()
    ] for algo, vals in scores.items()]  # vals avoids shadowing the scores dict

    # TODO: Sample weight of instances.
    print(
        tabulate(table,
                 headers=[
                     'Method & Tokenizer', 'ndcg(k=0.1)', 'ndcg(k=0.5)',
                     'ndcg(k=0.8)'
                 ],
                 tablefmt=table_format,
                 floatfmt=".4f"))

    if debug:
        click.echo(np.array(table).shape)
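ndcg_score here matches scikit-learn's sklearn.metrics.ndcg_score, which expects 2-D arrays of shape (n_samples, n_labels), hence the extra list nesting around relevance and y_pred, and a cutoff k that restricts the metric to the top-k ranked sentences. A self-contained illustration of those semantics:

import numpy as np
from sklearn.metrics import ndcg_score

# One document with 4 sentences: gold relevance vs. predicted scores.
y_true = np.array([[3, 1, 0, 2]])
y_pred = np.array([[0.2, 0.9, 0.1, 0.7]])

print(ndcg_score(y_true, y_pred))       # below 1.0: the most relevant sentence is ranked third
print(ndcg_score(y_true, y_pred, k=2))  # only the top 2 ranked sentences count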
Code Example #4
File: __main__.py  Project: GlobalMaksimum/sadedegel
def diff(verbose):
    """Git like diff tool to compare sentence generated by our tokenizer vs actual list of sentences."""
    click.secho("Loading corpus...")
    raw, sents = load_raw_corpus(False), load_sentence_corpus()

    y_true = [doc['sentences'] for doc in sents]

    y_pred = [[str(s) for s in Doc(doc)] for doc in raw]

    paths = file_paths()

    differ = Differ()

    for t, p, f in zip(y_true, y_pred, paths):

        table = Table(show_header=True,
                      header_style="bold magenta",
                      show_edge=False)
        table.add_column("true", style="dim", width=100)
        table.add_column("predict", style="dim", width=100)

        table.columns[0].style = "green"
        table.columns[1].style = "red"

        ndiff = 0
        match = 0
        for sent in differ.compare(p, t):
            if sent.startswith('+'):
                if match > 0 and verbose > 0:
                    table.add_row(f"[blue]{match} sentences...[/blue]",
                                  f"[blue]{match} sentences...[/blue]")
                    match = 0

                table.add_row(sent[2:], "")
                ndiff += 1
            elif sent.startswith('-'):
                if match > 0 and verbose > 0:
                    table.add_row(f"[blue]{match} sentences...[/blue]",
                                  f"[blue]{match} sentences...[/blue]")
                    match = 0

                table.add_row("", sent[2:])
                ndiff += 1
            elif not sent.startswith('?'):  # skip Differ's intra-line hint lines
                match += 1

        if match > 0 and verbose > 0:
            table.add_row(f"[blue]{match} sentences...[/blue]",
                          f"[blue]{match} sentences...[/blue]")

        if ndiff > 0:
            console.print(f)
            console.print(table)
            console.print(f"[blue]{len(t)} sentences...[/blue]")
            console.print()
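Differ here is difflib.Differ from the standard library: compare(p, t) yields the input items prefixed with "- " (only in p, the predictions), "+ " (only in t, the gold sentences), "  " (common to both), and occasionally "? " intra-line hint lines, which is why the loop above skips them. A standalone demonstration:

from difflib import Differ

pred = ["Ali topu tut.", "Oya ip atla."]
true = ["Ali topu tut.", "Oya ip atla!", "Ahmet topu at."]

for line in Differ().compare(pred, true):
    print(line)
# "  Ali topu tut."   common to both lists
# "- Oya ip atla."    only in pred (may be followed by "? " hint lines)
# "+ Oya ip atla!"    only in true
# "+ Ahmet topu at."  only in true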
Code Example #5
File: __main__.py  Project: doruktiktiklar/sadedegel
def evaluate(table_format, tag, debug):
    """Evaluate all summarizers in sadedeGel"""
    if not debug:
        warnings.filterwarnings("ignore")

    anno = load_annotated_corpus(False)
    summarizers = [summ for summ in SUMMARIZERS if any(_tag in summ[1] for _tag in tag)]

    scores = defaultdict(list)

    for word_tokenizer in tqdm(['simple', 'bert'], unit=" word-tokenizer"):
        with tokenizer_context(word_tokenizer):
            for name, summarizer in tqdm(summarizers, unit=" method"):
                # skip simple tokenizer for clustering models
                if "cluster" in summarizer and word_tokenizer == "simple":
                    continue

                for doc in tqdm(anno, unit=" doc", desc=f"Evaluating {name}"):
                    y_true = [doc['relevance']]

                    d = Doc.from_sentences(doc['sentences'])

                    y_pred = [summarizer.predict(d.sents)]

                    score_10 = ndcg_score(y_true, y_pred, k=ceil(len(doc['sentences']) * 0.1))
                    score_50 = ndcg_score(y_true, y_pred, k=ceil(len(doc['sentences']) * 0.5))
                    score_80 = ndcg_score(y_true, y_pred, k=ceil(len(doc['sentences']) * 0.8))

                    scores[f"{name} - {word_tokenizer}"].append((score_10, score_50, score_80))

    table = [[algo, np.array([s[0] for s in vals]).mean(), np.array([s[1] for s in vals]).mean(),
              np.array([s[2] for s in vals]).mean()] for
             algo, vals in scores.items()]  # vals avoids shadowing the scores dict

    # TODO: Sample weight of instances.
    print(
        tabulate(table, headers=['Method & Tokenizer', 'ndcg(k=0.1)', 'ndcg(k=0.5)', 'ndcg(k=0.8)'],
                 tablefmt=table_format,
                 floatfmt=".4f"))

    if debug:
        click.echo(np.array(table).shape)
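The table construction above can be condensed by averaging the three ndcg columns in a single step; a compact equivalent, given that each value list holds (score_10, score_50, score_80) tuples as collected in the loop:

table = [[algo, *np.array(vals).mean(axis=0)] for algo, vals in scores.items()]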
Code Example #6
def sbd(data_home):
    """Generate sentence boundary detected corpus out of raw document corpus."""

    data_home = Path(os.path.expanduser(data_home))

    logger.info(f"Data directory for extended data {data_home}")

    raw_dir = data_home / 'extended' / 'raw'

    for section in raw_dir.iterdir():

        sents_dir = section.parent.parent / 'sents' / str(section.name)

        if section.is_dir():
            sents_dir.mkdir(parents=True, exist_ok=True)

            # section is already a full path under raw_dir, so join the glob pattern directly
            for raw in track(glob.glob(str((section / '*.txt').absolute())),
                             description=f"{section.name} documents"):
                fn_noext, _ = os.path.splitext(os.path.basename(raw))

                target = (sents_dir / f"{fn_noext}.json").absolute()

                if not os.path.exists(target) or os.path.getsize(target) == 0:
                    try:
                        d = Doc(safe_read(raw))

                        with open(target, 'w') as wp:
                            json.dump(dict(sentences=[s.text for s in d],
                                           rouge1=[s.rouge1("f1") for s in d]),
                                      wp,
                                      ensure_ascii=False)
                    except Exception:
                        logger.exception(f"Error in processing document {raw}")

                        raise
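safe_read is another project helper not shown here; judging from its use, it reads a raw document while tolerating encoding problems. A minimal stand-in sketch (assumed behavior, not the project's actual helper):

def safe_read(path: str) -> str:
    # Read UTF-8 text, substituting undecodable bytes instead of raising.
    with open(path, encoding="utf-8", errors="replace") as fp:
        return fp.read()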
Code Example #7
File: __main__.py  Project: GlobalMaksimum/sadedegel
def evaluate(v):
    """Evaluate IoU metric for different SBD algorithms over our stock dataset."""
    click.secho("Loading corpus...")
    raw, sents = load_raw_corpus(False), load_sentence_corpus()

    nltk = NLTKPunctTokenizer()
    reg = RegexpSentenceTokenizer()

    y_pred = [nltk(doc) for doc in raw]
    y_true = [doc['sentences'] for doc in sents]

    iou_eval("NLTKPunctTokenizer", y_true, y_pred,
             file_paths() if v > 0 else None)

    y_pred = [reg(doc) for doc in raw]

    iou_eval("RegexpSentenceTokenizer", y_true, y_pred,
             file_paths() if v > 0 else None)

    y_pred = [[s.text for s in Doc(doc)] for doc in raw]

    iou_eval("MLBasedTokenizer", y_true, y_pred,
             file_paths() if v > 0 else None)
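iou_eval is a project helper; for sentence boundary detection, the IoU (intersection over union) of a predicted and a gold sentence list is the Jaccard similarity of the two sentence sets. A per-document sketch of that core computation (an assumption; the real helper presumably also aggregates across documents and prints a report):

def sentence_iou(true_sents, pred_sents) -> float:
    # |A & B| / |A | B| over the two sentence sets
    a, b = set(true_sents), set(pred_sents)
    return len(a & b) / len(a | b) if (a or b) else 1.0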
Code Example #8
def test_lxr_summarizer_all_lower():
    summ = LexRankSummarizer("log_norm", "smooth", normalize=False)

    assert summ.predict(
        Doc('ali topu tut. oya ip atla. ahmet topu at.')) == approx(
            np.array([1., 1., 1.]))
Code Example #9
def test_lxr_summarize_text():
    summ = LexRankSummarizer("log_norm", "smooth")
    doc = Doc('ali topu tut. oya ip atla. ahmet topu at.')

    assert summ(doc, k=1) == [doc[2]]
Code Example #10
def test_lxr_summarizer_proper_case():
    summ = LexRankSummarizer("log_norm", "smooth", normalize=False)
    assert summ.predict(
        Doc('Ali topu tut. Oya ip atla. Ahmet topu at.')) == approx(
            np.array([1., 1., 1.]))
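approx in these tests is pytest.approx, which compares numpy arrays element-wise within a floating-point tolerance, so predict only has to return scores numerically close to the expected values. For instance:

import numpy as np
from pytest import approx

assert np.array([0.9999999, 1.0000001, 1.0]) == approx(np.array([1., 1., 1.]))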