Example no. 1
    def __init__(self, corpus: Corpus, fields: Iterable[str], normalizer: Normalizer, tokenizer: Tokenizer):
        self._corpus = corpus
        self._normalizer = normalizer
        self._tokenizer = tokenizer
        self._posting_lists = []
        self._dictionary = InMemoryDictionary()
        self._build_index(fields)
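Across these examples a posting only ever exposes its document identifier directly, and is constructed from a document id plus a term frequency. A minimal Posting sketch consistent with that usage (an assumption, not necessarily the project's actual class):

from dataclasses import dataclass

@dataclass
class Posting:
    # Field names are assumptions; only document_id is read directly by the
    # index code, and the second constructor argument is a term frequency.
    document_id: int
    term_frequency: int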
Example no. 2
class InMemoryInvertedIndex(InvertedIndex):
    """
    A simple in-memory implementation of an inverted index, suitable for small corpora.

    In a serious application we'd have configuration to allow for field-specific NLP,
    scale beyond current memory constraints, have a positional index, and so on.
    """
    def __init__(self, corpus: Corpus, fields: Iterable[str],
                 normalizer: Normalizer, tokenizer: Tokenizer):
        self._corpus = corpus
        self._normalizer = normalizer
        self._tokenizer = tokenizer
        self._posting_lists = []
        self._dictionary = InMemoryDictionary()
        self._build_index(fields)

    def __repr__(self):
        return str({
            term: self._posting_lists[term_id]
            for (term, term_id) in self._dictionary
        })

    def _build_index(self, fields: Iterable[str]) -> None:
        for doc in self._corpus:
            counter = Counter()
            for field in fields:
                for term in self.get_terms(doc[field]):
                    counter[term] += 1
            for term in counter:
                term_id = self._dictionary.add_if_absent(term)
                posting = Posting(doc.document_id, counter[term])
                try:
                    self._posting_lists[term_id].append(posting)
                except IndexError:
                    # Term ids are assigned sequentially, so a brand-new term's
                    # id equals the current list length and a plain append works.
                    self._posting_lists.append([posting])

    def get_terms(self, buffer: str) -> Iterator[str]:
        return (self._normalizer.normalize(t) for t in self._tokenizer.strings(
            self._normalizer.canonicalize(buffer)))

    def get_postings_iterator(self, term: str) -> Iterator[Posting]:
        # In a serious application a postings list would be stored as a contiguous buffer
        # storing compressed integers, and the iterator would facilitate loading this buffer
        # from somewhere and decompressing the integers.
        term_id = self._dictionary.get_term_id(term)
        return iter([]) if term_id is None else iter(
            self._posting_lists[term_id])

    def get_document_frequency(self, term: str) -> int:
        # In a serious application we'd store this number explicitly, e.g., as part of the dictionary.
        # That way, we can look up the document frequency without having to access the posting lists
        # themselves. Imagine if the posting lists don't even reside in memory!
        term_id = self._dictionary.get_term_id(term)
        return 0 if term_id is None else len(self._posting_lists[term_id])
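For orientation, a hypothetical usage sketch; the corpus, normalizer and tokenizer objects are stand-ins for whatever the project provides:

# Hypothetical usage; corpus, normalizer and tokenizer are assumed to exist.
index = InMemoryInvertedIndex(corpus, ["title", "body"], normalizer, tokenizer)
print(index.get_document_frequency("foo"))
for posting in index.get_postings_iterator("foo"):
    print(posting.document_id)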
Example no. 3
class InMemoryInvertedIndex(InvertedIndex):
    """
    A simple in-memory implementation of an inverted index, suitable for small corpora.

    In a serious application we'd have configuration to allow for field-specific NLP,
    scale beyond current memory constraints, have a positional index, and so on.
    """

    def __init__(self, corpus: Corpus, fields: Iterable[str], normalizer: Normalizer, tokenizer: Tokenizer):
        self._corpus = corpus
        self._normalizer = normalizer
        self._tokenizer = tokenizer
        self._posting_lists = []
        self._dictionary = InMemoryDictionary()
        self._build_index(fields)

    def __repr__(self):
        return str({term: self._posting_lists[term_id] for (term, term_id) in self._dictionary})

    def _build_index(self, fields: Iterable[str]) -> None:
        """
        Builds a simple inverted index from the named fields in the document
        collection. The dictionary implementation is assumed to produce term
        identifiers in the range {0, ..., N - 1}.
        """
        for document in self._corpus:
            # Count terms across all fields together, so that each
            # (term, document) pair yields exactly one posting even when a
            # term occurs in several fields.
            term_frequencies = Counter()
            for field in fields:
                term_frequencies.update(self.get_terms(document.get_field(field, '')))
            for term, term_frequency in term_frequencies.items():
                term_id = self._dictionary.add_if_absent(term)
                posting = Posting(document.document_id, term_frequency)
                try:
                    self._posting_lists[term_id].append(posting)
                except IndexError:
                    # New terms get sequential ids, so the outer list is one
                    # slot short exactly when the term is brand new.
                    self._posting_lists.append([posting])

    def get_terms(self, buffer: str) -> Iterable[str]:
        return [self._normalizer.normalize(t) for t in self._tokenizer.strings(self._normalizer.canonicalize(buffer))]

    def get_postings_iterator(self, term: str) -> Iterator[Posting]:
        # In a serious application a postings list would be stored as a contiguous buffer
        # storing compressed integers, and the iterator would facilitate loading this buffer
        # from somewhere and decompressing the integers.
        term_id = self._dictionary.get_term_id(term)
        return iter([]) if term_id is None else iter(self._posting_lists[term_id])

    def get_document_frequency(self, term: str) -> int:
        # In a serious application we'd store this number explicitly, e.g., as part of the dictionary.
        # That way, we can look up the document frequency without having to access the posting lists
        # themselves. Imagine if the posting lists don't even reside in memory!
        term_id = self._dictionary.get_term_id(term)
        return 0 if term_id is None else len(self._posting_lists[term_id])
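The try/except IndexError idiom above is safe only because add_if_absent hands out identifiers sequentially, as the unit test in Example no. 5 below confirms. A quick sanity check of that assumption:

from dictionary import InMemoryDictionary

vocabulary = InMemoryDictionary()
assert [vocabulary.add_if_absent(t) for t in ["a", "b", "a", "c"]] == [0, 1, 0, 2]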
Example no. 4
    def __init__(self, training_set: Dict[str, Corpus], fields: Iterable[str],
                 normalizer: Normalizer, tokenizer: Tokenizer):

        # Used for breaking the text up into discrete classification features.
        self._normalizer = normalizer
        self._tokenizer = tokenizer

        # The vocabulary we've seen during training.
        self._vocabulary = InMemoryDictionary()
        self.training_set = training_set
        self.fields = fields
        self.prior = {}
        self.condprob = {}
        self._train()
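The training_set argument maps a category name to a Corpus of example documents for that category. A hypothetical construction:

# Hypothetical categories and corpora.
training_set = {"spam": spam_corpus, "ham": ham_corpus}
classifier = NaiveBayesClassifier(training_set, ["subject", "body"],
                                  normalizer, tokenizer)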
Example no. 5
    def test_access_vocabulary(self):
        from dictionary import InMemoryDictionary
        vocabulary = InMemoryDictionary()
        vocabulary.add_if_absent("foo")
        vocabulary.add_if_absent("bar")
        vocabulary.add_if_absent("foo")
        self.assertEqual(len(vocabulary), 2)
        self.assertEqual(vocabulary.size(), 2)
        self.assertEqual(vocabulary.get_term_id("foo"), 0)
        self.assertEqual(vocabulary.get_term_id("bar"), 1)
        self.assertEqual(vocabulary["bar"], 1)
        self.assertIn("bar", vocabulary)
        self.assertNotIn("wtf", vocabulary)
        self.assertIsNone(vocabulary.get_term_id("wtf"))
        self.assertListEqual(sorted(vocabulary), [("bar", 1), ("foo", 0)])
Example no. 6
class InMemoryInvertedIndex(InvertedIndex):
    """
    A simple in-memory implementation of an inverted index, suitable for small corpora.

    In a serious application we'd have configuration to allow for field-specific NLP,
    scale beyond current memory constraints, have a positional index, and so on.
    """
    def __init__(self, corpus: Corpus, fields: Iterable[str],
                 normalizer: Normalizer, tokenizer: Tokenizer):
        self._corpus = corpus
        self._normalizer = normalizer
        self._tokenizer = tokenizer
        self._posting_lists = []
        self._dictionary = InMemoryDictionary()
        self._build_index(fields)

    def __repr__(self):
        return str({
            term: self._posting_lists[term_id]
            for (term, term_id) in self._dictionary
        })

    def _build_index(self, fields: Iterable[str]) -> None:
        for document in self._corpus:

            # Compute TF values for all unique terms in the document. Note that we
            # currently don't keep track of which field each term occurs in.
            # If we were to allow fielded searches (e.g., "find documents that
            # contain 'foo' in the 'title' field") then we would have to keep
            # track of that, either as a synthetic term in the dictionary
            # (e.g., 'title.foo') or as extra data in the posting.
            all_terms = itertools.chain.from_iterable(
                self.get_terms(document.get_field(f, "")) for f in fields)
            term_frequencies = Counter(all_terms)

            for (term, term_frequency) in term_frequencies.items():

                # Assign the term an identifier, if needed. First come, first serve.
                term_id = self._dictionary.add_if_absent(term)

                # Locate the posting list for this term.
                if term_id >= len(self._posting_lists):
                    # Grow the outer list so that index term_id exists. With
                    # sequentially assigned ids this appends exactly one slot.
                    self._posting_lists.extend(
                        [] for _ in range(term_id - len(self._posting_lists) + 1))
                posting_list = self._posting_lists[term_id]

                # Append the posting to the posting list. The posting lists
                # must be kept sorted so that we can efficiently traverse and
                # merge them when querying the inverted index. Be paranoid and
                # verify that iterating over documents in the corpus happens
                # in ascending order by document identifiers.
                assert (not posting_list or
                        posting_list[-1].document_id < document.document_id)
                posting_list.append(Posting(document.document_id, term_frequency))

    def get_terms(self, buffer: str) -> Iterator[str]:
        return (self._normalizer.normalize(t) for t in self._tokenizer.strings(
            self._normalizer.canonicalize(buffer)))

    def get_postings_iterator(self, term: str) -> Iterator[Posting]:
        # In a serious application a postings list would be stored as a contiguous buffer
        # storing compressed integers, and the iterator would facilitate loading this buffer
        # from somewhere and decompressing the integers.
        term_id = self._dictionary.get_term_id(term)
        return iter([]) if term_id is None else iter(
            self._posting_lists[term_id])

    def get_document_frequency(self, term: str) -> int:
        # In a serious application we'd store this number explicitly, e.g., as part of the dictionary.
        # That way, we can look up the document frequency without having to access the posting lists
        # themselves. Imagine if the posting lists don't even reside in memory!
        term_id = self._dictionary.get_term_id(term)
        return 0 if term_id is None else len(self._posting_lists[term_id])
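The sorted-order invariant asserted in _build_index is what makes linear-time query-time merging possible. A sketch of the classic two-pointer AND-merge it enables (a hypothetical helper, not part of the index class):

def intersect(p1, p2):
    # Both iterators must yield postings sorted by ascending document_id.
    a = next(p1, None)
    b = next(p2, None)
    while a is not None and b is not None:
        if a.document_id == b.document_id:
            yield a
            a = next(p1, None)
            b = next(p2, None)
        elif a.document_id < b.document_id:
            a = next(p1, None)
        else:
            b = next(p2, None)

For example, intersect(index.get_postings_iterator("foo"), index.get_postings_iterator("bar")) yields one posting per document that contains both terms.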
Example no. 7
class NaiveBayesClassifier:
    """
    Defines a multinomial naive Bayes text classifier.
    """
    def __init__(self, training_set: Dict[str, Corpus], fields: Iterable[str],
                 normalizer: Normalizer, tokenizer: Tokenizer):
        """
        Constructor. Trains the classifier from the named fields in the
        documents in the given training set.
        """

        # Used for breaking the text up into discrete classification features.
        self._normalizer = normalizer
        self._tokenizer = tokenizer

        # The vocabulary we've seen during training.
        self._vocabulary = InMemoryDictionary()
        self.training_set = training_set
        self.fields = fields
        self.prior = {}
        self.condprob = {}
        self._train()

    def _train(self):
        self._extract_vocabulary()
        total_docs = sum(corpus.size() for corpus in self.training_set.values())
        for category, corpus in self.training_set.items():
            self.prior[category] = corpus.size() / total_docs
            # Laplace-smoothed multinomial term probabilities for this category.
            terms_in_category = self._get_terms(self._get_text_in_class(category))
            counter = Counter(terms_in_category)
            denominator = len(terms_in_category) + self._vocabulary.size()
            for term, _ in self._vocabulary:
                self.condprob[(term, category)] = (counter[term] + 1) / denominator

    def _get_text_in_class(self, category):
        # Concatenate the named fields of every document in the category.
        return " ".join(doc[field]
                        for doc in self.training_set[category]
                        for field in self.fields)

    def _extract_vocabulary(self):
        # Register every term that occurs anywhere in the training set.
        for category in self.training_set:
            for term in self._get_terms(self._get_text_in_class(category)):
                self._vocabulary.add_if_absent(term)

    def _get_terms(self, buffer):
        """
        Processes the given text buffer and returns the sequence of normalized
        terms as they appear. Both the documents in the training set and the buffers
        we classify need to be identically processed.
        """
        return [
            self._normalizer.normalize(s) for s in self._tokenizer.strings(
                self._normalizer.canonicalize(buffer))
        ]

    def classify(self, buffer: str, callback: Callable[[dict], Any]) -> None:
        """
        Classifies the given buffer according to the multinomial naive Bayes rule. The computed (score, category) pairs
        are emitted back to the client via the supplied callback, in descending order of score (best match first). The
        reported scores are log-probabilities, to minimize numerical underflow issues. Logarithms are base e.

        The callback function supplied by the client will receive a dictionary having the keys "score" (float) and
        "category" (str).
        """
        terms = self._get_terms(buffer)
        # Ignore terms we never saw during training.
        known_terms = [term for term in terms if term in self._vocabulary]
        scores = []
        for category in self.training_set:
            log_score = math.log(self.prior[category])
            for term in known_terms:
                log_score += math.log(self.condprob[(term, category)])
            scores.append((log_score, category))
        # Highest log-probability (i.e., best match) first.
        scores.sort(key=lambda pair: pair[0], reverse=True)
        for log_score, category in scores:
            callback({"score": log_score, "category": category})
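A hypothetical usage sketch for classify; results arrive through the callback as dictionaries with "score" and "category" keys, best match first:

# Assumes a trained classifier as constructed above.
results = []
classifier.classify("cheap pills, buy now", results.append)
for result in results:
    print(result["category"], result["score"])  # log-probabilities, base e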