Example #1
def test_text_rank():
    g1, g2 = random_graphs()
    g1_out = text_rank(g1, seed=1234)
    g2_out = text_rank(g2, seed=1234)
    for (g1_label, g1_score), (g2_label, g2_score) in zip(g1_out, g2_out):
        assert g1_label == g2_label
        assert math.isclose(g1_score, g2_score)
Example #2
def summarize(
    sentences: List[str],
    nsents: Optional[int] = None,
    keep_order: bool = True,
    damping: float = 0.85,
    convergence: float = 0.0001,
    convergence_type: ConvergenceType = ConvergenceType.ALL,
    niter: int = 200,
    seed: Optional[int] = None,
    sim: Callable[..., float] = overlap,
    norm: Callable[[str], str] = norm_sentence,
    GraphType: Type[Graph] = AdjacencyMatrix,
) -> List[str]:
    """Summarize text.

    :param sentences: The sentences to summarize.
    :param nsents: The number of sentences to use in the summary. If `None`, it uses the number of
        sentences to summarize divided by 3.
    :param keep_order: Should summary sentences appear in the same order they appear in the
        text? If `False`, sentences are ordered by their text rank score.
    :param damping: A scalar between 0 and 1. Used to simulate randomly jumping from one vertex to another.
    :param convergence: An early stopping criterion; when any or all of the node scores change by less than
        `convergence`, we stop updating the graph. Set to `0` to turn off early stopping.
    :param convergence_type: Should we stop when all nodes move less than `convergence`, or when a single node does?
    :param niter: An upper bound on the number of iterations to run.
    :param seed: A reproducibility seed for the initialization of the node scores.
    :param sim: A callable that returns the similarity between two vertices, used to set the weight of the edge.
        The callable should have a signature like:
            sim(
                normed_s1,
                normed_s2,
                raw_s1=raw_s1,
                raw_s2=raw_s2,
                s1_idx=s1_idx,
                s2_idx=s2_idx,
            ) -> float:
        Where normed_s1/2 are the normalized strings of the two sentences, raw_s1/2 are the versions of the
        sentences before normalization, and s1/2_idx are the indices of the sentences in the sentence list. This
        should facilitate both simple and complex similarity functions, as well as experiments that use the
        actual flow of the text to determine connections.
    :param norm: A function that returns a normalized version of the input sentence. The default implementation
        lowercases the string and removes non-alphanumeric characters.
        This is used so that simple similarity functions, like the set overlap from the paper, work well.
    :param GraphType: The Graph class to use.

    :returns: A list of sentences summarizing the original text.
    """
    graph, offsets = sentence_graph(sentences, sim, norm, GraphType)
    if nsents is None:
        nsents = len(sentences) // 3
    selected = text_rank(
        graph, damping=damping, convergence=convergence, convergence_type=convergence_type, niter=niter, seed=seed,
    )[:nsents]
    indices = [offsets[s[0]][0] for s in selected]
    if keep_order:
        return [sentences[i] for i in sorted(indices)]
    return [sentences[i] for i in indices]
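
The `sim` hook makes the edge weighting pluggable. As a minimal sketch of the keyword-argument protocol documented above, here is a similarity in the spirit of the paper's sentence overlap; the name `log_overlap` and the exact formula are illustrative, not part of the library:

import math

# Hypothetical similarity matching the documented signature. It only looks at
# the normalized sentences and ignores the raw/index keyword arguments.
def log_overlap(normed_s1, normed_s2, **kwargs):
    w1, w2 = set(normed_s1.split()), set(normed_s2.split())
    if len(w1) <= 1 or len(w2) <= 1:
        return 0.0
    # Shared words, normalized by the log of the sentence lengths.
    return len(w1 & w2) / (math.log(len(w1)) + math.log(len(w2)))

# summary = summarize(sentences, sim=log_overlap)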
Example #3
def test_text_rank_mining_massive_datasets():
    """"This is testing with a worked example from here:
            http://infolab.stanford.edu/~ullman/mmds/ch5.pdf
    """
    g = AdjacencyMatrix(list("ABCD"))
    g.add_edge("A", "B")
    g.add_edge("A", "C")
    g.add_edge("A", "D")
    g.add_edge("B", "A")
    g.add_edge("B", "D")
    g.add_edge("C", "A")
    g.add_edge("D", "B")
    g.add_edge("D", "C")
    gold = np.array([3 / 9, 2 / 9, 2 / 9, 2 / 9])

    scores = [x[1] for x in text_rank(g, damping=1, convergence=0)]
    np.testing.assert_allclose(scores, gold)
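
With `damping=1` and `convergence=0` this is pure power iteration on the column-stochastic transition matrix of the graph, so the gold scores can be verified by hand. A standalone sketch (the matrix below is derived from the edges above, not taken from the library):

import numpy as np

# Column j spreads 1/outdegree(j) across j's successors.
M = np.array([
    [0,     1 / 2, 1, 0],      # into A: from B (1/2) and C (1)
    [1 / 3, 0,     0, 1 / 2],  # into B: from A (1/3) and D (1/2)
    [1 / 3, 0,     0, 1 / 2],  # into C: from A (1/3) and D (1/2)
    [1 / 3, 1 / 2, 0, 0],      # into D: from A (1/3) and B (1/2)
])
v = np.full(4, 1 / 4)  # start from uniform scores
for _ in range(50):    # power iteration, no damping
    v = M @ v
print(v)  # [0.333..., 0.222..., 0.222..., 0.222...] == [3/9, 2/9, 2/9, 2/9]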
Example #4
def keywords(
    tokens: List[Dict[str, str]],
    nwords: Optional[int] = None,
    winsz: int = 2,
    damping: float = 0.85,
    convergence: float = 0.0001,
    convergence_type: ConvergenceType = ConvergenceType.ANY,
    niter: int = 200,
    seed: Optional[int] = None,
    sim: Callable[..., float] = lambda x, y, **kwargs: 1,
    norm: Callable[[str], str] = norm_token,
    filt: Callable[[Dict[str, str]], bool] = filter_pos,
    GraphType: Type[Graph] = AdjacencyMatrix,
) -> Set[str]:
    """Find keywords.

    :param tokens: The tokens of the text to pull keywords from
    :param nwords: The number of keywords to extract from the text. If `None`, it uses the number
        of tokens in the passage divided by 3. This number isn't exact: after finding keywords, adjacent
        ones are joined, so it is possible to get fewer keywords back.
    :param winsz: The size of the window around each word within which it can connect to other words.
        This window is calculated over the non-filtered token list.
    :param damping: A scalar between 0 and 1. Used to simulate randomly jumping from one vertex to another.
    :param convergence: An early stopping criterion; when any or all of the node scores change by less than
        `convergence`, we stop updating the graph. Set to `0` to turn off early stopping.
    :param convergence_type: Should we stop when all nodes move less than `convergence`, or when a single node does?
    :param niter: An upper bound on the number of iterations to run.
    :param seed: A reproducibility seed for the initialization of the node scores.
    :param sim: A callable that returns the similarity between two vertices, used to set the weight of the edge.
        The callable should have a signature like:
            sim(
                normed_s1,
                normed_s2,
                context=context,
                raw_s1=raw_s1,
                raw_s2=raw_s2,
                raw_context=raw_context,
                s1_idx=s1_idx,
                s2_idx=s2_idx,
            ) -> float:
        Where normed_s1/2 are the normalized strings of the two keywords, context is all of the normalized tokens,
        raw_s1/2/context are the pre-normalized versions, and s1/2_idx are the indices of the keywords in the
        original token list. This should allow complex similarity functions. The context is the whole list of
        tokens, not just the window around one.
    :param norm: A function that returns a normalized version of the input string. The default implementation
        lowercases the string and removes non-alphanumeric characters.
        This is used to unify similar vertices, i.e. `Hurricane` and `hurricane` should be the same vertex,
        and with normalization they will be.
    :param filt: A function that filters tokens based on POS tags when the inputs are Dicts.
    :param GraphType: The Graph class to use.

    :returns: The keywords from the passage
    """
    graph, offsets = keyword_graph(tokens, winsz, sim, norm, filt, GraphType)
    if nwords is None:
        nwords = len(tokens) // 3
    keywords = text_rank(graph,
                         damping=damping,
                         convergence=convergence,
                         convergence_type=convergence_type,
                         niter=niter,
                         seed=seed)[:nwords]
    keywords = join_adjacent_keywords([kw[0] for kw in keywords], offsets)
    return keywords
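
The default `sim` here gives every within-window co-occurrence an edge weight of 1. As a sketch of the keyword-argument protocol above, a hypothetical weighting that favors close co-occurrences using the token indices (the name and formula are illustrative, not part of the library):

# Hypothetical edge weight that decays with the distance between the tokens.
# Only the indices are used; the other documented kwargs are ignored.
def proximity_sim(normed_s1, normed_s2, s1_idx=None, s2_idx=None, **kwargs):
    return 1.0 / (1 + abs(s1_idx - s2_idx))

# kws = keywords(tokens, sim=proximity_sim)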
Example #5
import json
import argparse
from text_rank import DEMO_LOC
from text_rank.graph import sentence_graph, keyword_graph
from text_rank.graph import sentence_graph_list, keyword_graph_list
from text_rank.text_rank import text_rank, text_rank_list

parser = argparse.ArgumentParser("Text Rank")
parser.add_argument("--iters", "-i", type=int, default=40)
parser.add_argument("--sents", "-s", type=int, default=3)
parser.add_argument("--words", "-w", type=int, default=5)
args = parser.parse_args()

sentences = json.load(open(DEMO_LOC / "Automatic_Summarization-sents.json"))
# Rank sentences with the matrix-based graph implementation.
V, g = sentence_graph(sentences)
sents1 = text_rank(V, g, args.iters)
print([' '.join(x[0]) for x in sents1[:args.sents]])

# The same summarization with the list-based graph implementation.
V = sentence_graph_list(sentences)
sents2 = text_rank_list(V, args.iters)
print([' '.join(x[0].value) for x in sents2[:args.sents]])

tokens = json.load(open(DEMO_LOC / "Automatic_Summarization-tokens.json"))
# Extract keywords with the matrix-based graph implementation.
V, g = keyword_graph(tokens)
keywords1 = text_rank(V, g, args.iters)
print([x[0] for x in keywords1[:args.words]])

# The same extraction with the list-based graph implementation.
V = keyword_graph_list(tokens)
keywords2 = text_rank_list(V, args.iters)
print([x[0].value for x in keywords2[:args.words]])
Example #6
def main():
    parser = argparse.ArgumentParser("Text Rank demo")
    parser.add_argument("--sents", "-s", type=int, default=3)
    parser.add_argument("--words", "-w", type=int, default=5)
    parser.add_argument("--iters", "-i", type=int, default=20)
    parser.add_argument("--convergence", "-c", type=float, default=0.0001)
    parser.add_argument("--verbose", "-v", action="store_true")
    parser.add_argument("--compare", action="store_true")
    args = parser.parse_args()

    if args.compare:
        sentences = json.load(
            open(DEMO_LOC / "automatic-summarization-sents.json"))

        G = sentence_graph(sentences)
        sents = text_rank(G,
                          convergence=args.convergence,
                          niter=args.iters,
                          quiet=not args.verbose)
        print("Adjacency Matrix based Text Rank for extractive summarization")
        for i, x in enumerate(sents[:args.sents]):
            print(f" {i + 1}. {x[0]}")

        G = sentence_graph(sentences, GraphType=AdjacencyList)
        sents = text_rank(G,
                          convergence=args.convergence,
                          niter=args.iters,
                          quiet=not args.verbose)
        print()
        print("Adjacency List based Text Rank for extractive summarization")
        for i, x in enumerate(sents[:args.sents]):
            print(f" {i + 1}. {x[0]}")

        tokens = json.load(
            open(DEMO_LOC / "automatic-summarization-tokens.json"))

        G = keyword_graph(tokens)
        kws = text_rank(G,
                        convergence=args.convergence,
                        niter=args.iters,
                        quiet=not args.verbose)
        print()
        print("Adjacency Matrix based Text Rank for key-word extraction")
        for i, x in enumerate(kws[:args.words]):
            print(f" {i + 1}. {x[0]}")

        G = keyword_graph(tokens, GraphType=AdjacencyList)
        kws = text_rank(G,
                        convergence=args.convergence,
                        niter=args.iters,
                        quiet=not args.verbose)
        print()
        print("Adjacency List based Text Rank for key-word extraction")
        for i, x in enumerate(kws[:args.words]):
            print(f" {i + 1}. {x[0]}")

    tokens = json.load(open(DEMO_LOC / "paper-example-keywords.json"))

    kws = keywords(tokens)
    print()
    print("Keyword Extraction from paper")
    for i, kw in enumerate(kws):
        print(f" {i + 1}. {kw}")

    sentences = json.load(open(DEMO_LOC / "paper-example-summarize.json"))

    sentences = summarize(sentences)
    print()
    print("Extractive Summarization from the paper")
    for i, sent in enumerate(sentences):
        print(f" {i + 1}. {sent}")