def all_candidates(self, s):
    """Retrieve all candidate entities from a piece of text.

    Every n-gram of the input (up to order ``self.N``) is looked up in
    ``self.commonness``; each match contributes one candidate per
    (target, probability) pair stored for that n-gram.

    Parameters
    ----------
    s : {string, iterable over string}
        Tokens. If a string, it will be tokenized using a naive heuristic
        (splitting on whitespace).

    Returns
    -------
    candidates : iterable over (int, int, string, float)
        Candidate entities are 4-tuples of the indices `start` and `end`
        (token positions in the tokenized input, passed through unchanged
        from ``ngrams_with_pos``), `target entity` (title of the Wikipedia
        article) and `probability` (commonness.)

    Notes
    -----
    This is a generator: nothing is tokenized or looked up until the
    result is iterated.
    """
    if isinstance(s, six.string_types):
        # XXX need a smarter tokenizer!
        tokens = s.split()
    else:
        tokens = tosequence(s)

    # Distinct local names keep the parameter `s` from being shadowed by the
    # n-gram text inside the loop.
    for start, end, ngram in ngrams_with_pos(tokens, self.N):
        if ngram in self.commonness:
            for target, prob in self.commonness[ngram]:
                yield start, end, target, prob
def test_ngrams():
    # Every n-gram of order 1..3 over the four tokens must be produced
    # exactly once, with its tokens joined by single spaces.
    tokens = "Hello , world !".split()
    wanted = {
        "Hello , world", ", world !",
        "Hello ,", ", world", "world !",
        "Hello", ",", "world", "!",
    }

    counts = Counter(ngrams(tokens, 3))
    assert_equal(set(counts), wanted)
    assert_true(all(n == 1 for n in counts.values()))

    # The positional variant yields triples carrying two token positions
    # followed by the n-gram text.
    positioned = list(ngrams_with_pos(tokens, 2))
    for triple in [(0, 2, 'Hello ,'), (1, 3, ', world')]:
        assert_in(triple, positioned)
def test_ngrams_order_string():
    # Passing a string as the n-gram order is a mistake that has already
    # been made three(!) times; this test keeps that case covered. Basta!
    # NOTE(review): no expected-exception check is visible in this block --
    # presumably it is supplied by a decorator; confirm.
    words = "a b c".split()
    pending = ngrams_with_pos(words, 'foobar')
    # Materialize the result so the call's work is actually carried out.
    list(pending)
def test_ngrams_order_0():
    # Exercise ngrams_with_pos with a negative order.
    # NOTE(review): no expected-exception check is visible in this block --
    # presumably it is supplied by a decorator; confirm.
    words = "a b c".split()
    pending = ngrams_with_pos(words, -3)
    # A generator does not run until its values are requested, so consuming
    # it here is the first point at which an exception can/will be raised.
    list(pending)