def all_candidates(self, s):
        """Retrieve all candidate entities from a piece of text.

        Parameters
        ----------
        s : {string, iterable over string}
            Tokens. If a string, it will be tokenized using a naive heuristic.

        Returns
        -------
        candidates : iterable over (int, int, string, float)
            Candidate entities are 4-tuples of the indices `start` and
            `end` (both referring to the tokenized input; `start` is
            zero-based and `end` is exclusive), `target entity` (title
            of the Wikipedia article) and `probability` (commonness).
        """

        if isinstance(s, six.string_types):
            # XXX need a smarter tokenizer!
            s = s.split()
        else:
            s = tosequence(s)

        # Look up every n-gram (up to order N) in the commonness table.
        for start, end, ngram in ngrams_with_pos(s, self.N):
            if ngram in self.commonness:
                for target, prob in self.commonness[ngram]:
                    yield start, end, target, prob
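A minimal usage sketch for the generator above; the `sem` object and its loading step are assumed, standing in for whatever class defines `all_candidates`:

# Hypothetical usage: `sem` is assumed to be an instance of the class
# that defines all_candidates(), with its commonness model already loaded.
text = "Barack Obama visited Amsterdam"
for start, end, target, prob in sem.all_candidates(text):
    # start/end index into the naive whitespace tokenization of `text`.
    print("tokens[%d:%d] -> %s (commonness %.3f)" % (start, end, target, prob))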
Example 2
from collections import Counter

from nose.tools import assert_equal, assert_in, assert_true

# ngrams and ngrams_with_pos come from the module under test; the exact
# import path depends on the package layout, so it is omitted here.


def test_ngrams():
    text = "Hello , world !".split()
    expected = {"Hello , world", ", world !",
                "Hello ,", ", world", "world !",
                "Hello", ",", "world", "!"}

    ng = Counter(ngrams(text, 3))
    assert_equal(set(ng), expected)
    assert_true(all(freq == 1 for freq in ng.values()))

    with_pos = list(ngrams_with_pos(text, 2))
    assert_in((0, 2, 'Hello ,'), with_pos)
    assert_in((1, 3, ', world'), with_pos)
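The assertions above pin down the expected contract of ngrams_with_pos: every n-gram of order 1 through N, with a zero-based start index, an exclusive end index, and tokens joined by single spaces. A minimal sketch consistent with those tests (an illustration only, not the library's actual implementation) could look like:

def ngrams_with_pos_sketch(tokens, N):
    # Yield (start, end, "tok ... tok") for every n-gram of order 1..N.
    # start is zero-based, end is exclusive, tokens are space-joined.
    # Because this is a generator, validation runs lazily, on the first
    # value requested.
    if not isinstance(N, int):
        raise TypeError("n-gram order must be an integer, got %r" % N)
    if N < 1:
        raise ValueError("n-gram order must be >= 1, got %d" % N)
    for n in range(1, N + 1):
        for start in range(len(tokens) - n + 1):
            yield start, start + n, " ".join(tokens[start:start + n])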
Example 3
from nose.tools import assert_raises

def test_ngrams_order_string():
    # This has bitten me three(!) times now. Basta!
    tokens = "a b c".split()
    # A non-integer order should be rejected (TypeError assumed here).
    with assert_raises(TypeError):
        list(ngrams_with_pos(tokens, 'foobar'))
Example 4
from nose.tools import assert_raises

def test_ngrams_order_0():
    tokens = "a b c".split()
    # The execution of the generator is delayed until the values are
    # requested, so that's the first opportunity at which an exception
    # can/will be raised (ValueError is assumed here for a negative order).
    with assert_raises(ValueError):
        list(ngrams_with_pos(tokens, -3))