Example #1
    def test_converting_an_object_to_tokens(self):
        class Subject:
            def __str__(self):
                return "custom object"

        tokens = [str(token) for token in Tokenizer(Subject())]
        assert tokens == ["custom", "object"]
Example #2
    def test_converting_an_object_to_tokens(self):
        @six.python_2_unicode_compatible
        class Subject:
            def __str__(self):
                return "custom object"

        tokens = [str(token) for token in Tokenizer(Subject())]
        assert tokens == ["custom", "object"]
Example #3
    def add(self, doc, attributes=None):
        """Adds a document to the index.

        Before adding documents to the index it should have been fully
        setup, with the document ref and all fields to index already having
        been specified.

        The document must have a field name as specified by the ref (by default
        this is 'id') and it should have all fields defined for indexing,
        though None values will not cause errors.

        Args:
            - doc (dict): The document to be added to the index.
            - attributes (dict, optional): A set of attributes corresponding
            to the document, currently a single `boost` -> int will be
            taken into account.
        """
        doc_ref = str(doc[self._ref])
        self._documents[doc_ref] = attributes or {}
        self.document_count += 1

        for field_name, field in self._fields.items():
            extractor = field.extractor
            field_value = (
                doc[field_name] if extractor is None else extractor(doc)
            )
            tokens = Tokenizer(field_value)
            terms = self.pipeline.run(tokens)
            field_ref = FieldRef(doc_ref, field_name)
            field_terms = defaultdict(int)

            # TODO: field_refs are cast to strings in JS, should we allow
            # FieldRef as keys?
            self.field_term_frequencies[str(field_ref)] = field_terms
            self.field_lengths[str(field_ref)] = len(terms)

            for term in terms:
                # TODO: term is a Token, should we allow Tokens as keys?
                term_key = str(term)

                field_terms[term_key] += 1
                if term_key not in self.inverted_index:
                    posting = {_field_name: {} for _field_name in self._fields}
                    posting["_index"] = self.term_index
                    self.term_index += 1
                    self.inverted_index[term_key] = posting

                if doc_ref not in self.inverted_index[term_key][field_name]:
                    self.inverted_index[term_key][field_name][
                        doc_ref] = defaultdict(list)

                for metadata_key in self.metadata_whitelist:
                    metadata = term.metadata[metadata_key]
                    self.inverted_index[term_key][field_name][doc_ref][
                        metadata_key].append(metadata)
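
The docstring above spells out the contract of `add()`: the ref and all fields must be declared before adding documents, each document must carry the ref field, and `attributes` currently only honours a `boost` entry. Below is a minimal usage sketch, assuming this is lunr.py's `Builder.add` and that the surrounding builder exposes `ref()`, `field()` and `build()` as in that library; those construction details are assumptions, not part of the snippet above.

    from lunr.builder import Builder

    builder = Builder()
    builder.ref("id")       # the document ref field, "id" by default
    builder.field("title")  # every field to index must be declared before add()
    builder.field("body")

    doc = {
        "id": "1",
        "title": "Foo",
        "body": "foo bar baz",  # None values would not raise, per the docstring
    }

    # `attributes` is optional; only a `boost` entry is currently honoured.
    builder.add(doc, attributes={"boost": 2})

    index = builder.build()
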
Example #4
    def test_providing_additional_metadata(self):
        tokens = Tokenizer("foo bar", {"hurp": "durp"})
        assert tokens[0].metadata["hurp"] == "durp"
        assert tokens[1].metadata["hurp"] == "durp"
Example #5
    def test_tracking_the_token_position(self):
        tokens = Tokenizer("foo bar")
        assert tokens[0].metadata["position"] == [0, 3]
        assert tokens[1].metadata["position"] == [4, 3]
Example #6
    def test_tracking_the_token_index(self):
        tokens = Tokenizer("foo bar")
        assert tokens[0].metadata["index"] == 0
        assert tokens[1].metadata["index"] == 1
Example #7
    def test_splits_strings_with_hyphens_and_spaces(self):
        tokens = [str(token) for token in Tokenizer("foo - bar")]
        assert tokens == ["foo", "bar"]
Example #8
    def test_converting_a_boolean_to_tokens(self):
        tokens = [str(token) for token in Tokenizer(False)]
        assert tokens == ["false"]
Example #9
    def test_handling_null_like_arguments(self):
        assert len(Tokenizer(None)) == 0
Example #10
    def test_multiple_whitespace_is_stripped(self):
        tokenizer = Tokenizer("   foo    bar   baz  ")
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "bar", "baz"]
Example #11
    def test_none_is_converted_to_empty_string(self):
        tokenizer = Tokenizer(["foo", None, "baz"])
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "", "baz"]
Example #12
    def test_array_of_strings(self):
        tokenizer = Tokenizer(["foo", "bar", "baz"])
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "bar", "baz"]
Example #13
    def test_run_downcases_tokens(self):
        tokenizer = Tokenizer("foo BAR BAZ")
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "bar", "baz"]
Example #14
    def test_splitting_into_tokens(self):
        tokenizer = Tokenizer("foo bar baz")
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "bar", "baz"]
Example #15
    def test_providing_separator(self, separator):
        # `separator` is presumably supplied via pytest parametrization
        # (not shown in this snippet).
        tokens = [str(token) for token in Tokenizer("foo_bar-baz", separator=separator)]
        assert tokens == ["foo", "bar", "baz"]
Example #16
    def test_tracking_token_position_with_right_hand_whitespace(self):
        tokens = Tokenizer("foo bar ")
        assert tokens[0].metadata["position"] == [0, 3]
        assert tokens[1].metadata["position"] == [4, 3]
Example #17
    def test_converting_a_number_to_tokens(self):
        tokens = [str(token) for token in Tokenizer(41)]
        assert tokens == ["41"]