Example #1
 def __init__(self, index_dir: str):
     ts = TurkishStemmer()
     self.__schema = fields.Schema(
         message=fields.TEXT(stored=True,
                             field_boost=1.5,
                             analyzer=analysis.StemmingAnalyzer()
                             | analysis.NgramFilter(minsize=2, maxsize=5)),
         meta_content=fields.TEXT(
             stored=True,
             analyzer=analysis.StemmingAnalyzer()
             | analysis.NgramFilter(minsize=2, maxsize=5)),
         message_id=fields.NUMERIC(stored=True, bits=64),
         chat_id=fields.NUMERIC(stored=True, bits=64),
         message_tr=fields.TEXT(
             stored=False,
             field_boost=1.5,
             analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem,
                                                stoplist=STOP_WORDS_TR)
             | analysis.NgramFilter(minsize=2, maxsize=5)),
         meta_content_tr=fields.TEXT(
             stored=False,
             analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem,
                                                stoplist=STOP_WORDS_TR)
             | analysis.NgramFilter(minsize=2, maxsize=5)),
     )
     if not os.path.isdir(index_dir):
         os.mkdir(index_dir)
         self.__index = index.create_in(index_dir, self.__schema)
     else:
         self.__index = index.open_dir(index_dir)
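The schema above pipes a StemmingAnalyzer into an NgramFilter. A minimal standalone sketch (not taken from the project above) of what that composed pipeline emits for a sample string:

from whoosh import analysis

ana = analysis.StemmingAnalyzer() | analysis.NgramFilter(minsize=2, maxsize=5)
# Tokens are lowercased and stemmed first ("searching" -> "search"), then split
# into 2- to 5-character n-grams, which is what makes substring matching work.
print([t.text for t in ana("Searching messages")])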
Example #2
def test_multi_language():
    # Analyzer for English
    ana_eng = analysis.StemmingAnalyzer()

    # analyzer for Pig Latin
    def stem_piglatin(w):
        if w.endswith("ay"):
            w = w[:-2]
        return w

    ana_pig = analysis.StemmingAnalyzer(stoplist=["nday", "roay"],
                                        stemfn=stem_piglatin)

    # Dictionary mapping languages to analyzers
    analyzers = {"eng": ana_eng, "pig": ana_pig}

    # Fake documents
    corpus = [(u("eng"), u("Such stuff as dreams are made on")),
              (u("pig"), u("Otay ebay, roay otnay otay ebay"))]

    schema = fields.Schema(content=fields.TEXT(stored=True),
                           lang=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        for doclang, content in corpus:
            ana = analyzers[doclang]
            # "Pre-analyze" the field into token strings
            words = [token.text for token in ana(content)]
            # Note we store the original value but index the pre-analyzed words
            w.add_document(lang=doclang,
                           content=words,
                           _stored_content=content)

    with ix.searcher() as s:
        schema = s.schema

        # Modify the schema to fake the correct analyzer for the language
        # we're searching in
        schema["content"].analyzer = analyzers["eng"]

        qp = qparser.QueryParser("content", schema)
        q = qp.parse("dreaming")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Such stuff as dreams are made on"

        schema["content"].analyzer = analyzers["pig"]
        qp = qparser.QueryParser("content", schema)
        q = qp.parse("otnay")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Otay ebay, roay otnay otay ebay"
Example #3
def test_add_reader_spelling():
    # Test whether add_spell_word() items get copied over in a merge

    # Because b is stemming and spelled, it will use add_spell_word()
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(spelling=True),
                           b=fields.TEXT(analyzer=ana, spelling=True))
    with TempIndex(schema, "addreadersp") as ix:
        with ix.writer() as w:
            w.add_document(a=u("rendering modeling compositing enabling"),
                           b=u("rendering modeling compositing enabling"))
            w.add_document(a=u("flying rolling tying quitting polling"),
                           b=u("flying rolling tying quitting polling"))

        with ix.writer() as w:
            w.add_document(a=u("writing eyeing ripping timing yelling"),
                           b=u("writing eyeing ripping timing yelling"))
            w.add_document(a=u("undoing indicating opening pressing"),
                           b=u("undoing indicating opening pressing"))

        with ix.searcher() as s:
            gr = s.reader().word_graph("a")
            assert " ".join(gr.flatten_strings()) == (
                "compositing enabling eyeing flying indicating "
                "modeling opening polling pressing quitting "
                "rendering ripping rolling timing tying undoing "
                "writing yelling")

            gr = s.reader().word_graph("b")
            assert " ".join(gr.flatten_strings()) == (
                "compositing enabling eyeing flying indicating "
                "modeling opening polling pressing quitting "
                "rendering ripping rolling timing tying undoing "
                "writing yelling")
Example #4
 def test_analyzing_terms(self):
     schema = fields.Schema(text=fields.TEXT(
         analyzer=analysis.StemmingAnalyzer()))
     qp = qparser.QueryParser("text", schema=schema)
     q = qp.parse(u"Indexed!")
     self.assertEqual(q.__class__.__name__, "Term")
     self.assertEqual(q.text, "index")
Example #5
def test_analyzing_terms():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana))
    qp = default.QueryParser("text", schema)
    q = qp.parse(u("Indexed!"))
    assert q.__class__ == query.Term
    assert q.text == "index"
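For contrast, a minimal sketch (not part of either test above): with the default analyzer the parser lowercases but does not stem the query text, so the resulting term differs.

from whoosh import fields
from whoosh.qparser import QueryParser

plain_schema = fields.Schema(text=fields.TEXT())  # default StandardAnalyzer, no stemming
qp = QueryParser("text", plain_schema)
print(qp.parse("Indexed!"))  # expected: text:indexed rather than text:index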
Example #6
def test_add_reader_spelling():
    # Test whether add_spell_word() items get copied over in a merge

    # Because b is stemming and spelled, it will use add_spell_word()
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(analyzer=ana),
                           b=fields.TEXT(analyzer=ana, spelling=True))

    with TempIndex(schema, "addreadersp") as ix:
        with ix.writer() as w:
            w.add_document(a=u"rendering modeling",
                           b=u"rendering modeling")
            w.add_document(a=u"flying rolling",
                           b=u"flying rolling")

        with ix.writer() as w:
            w.add_document(a=u"writing eyeing",
                           b=u"writing eyeing")
            w.add_document(a=u"undoing indicating",
                           b=u"undoing indicating")
            w.optimize = True

        with ix.reader() as r:
            sws = list(r.lexicon("spell_b"))
            assert sws == [b"eyeing", b"flying", b"indicating", b"modeling",
                           b"rendering", b"rolling",  b"undoing", b"writing"]

            assert list(r.terms_within("a", "undoink", 1)) == []
            assert list(r.terms_within("b", "undoink", 1)) == ["undoing"]
Example #7
def test_spelling_field():
    text = u"rendering shading modeling reactions"
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))

    assert schema["text"].spelling
    assert schema["text"].separate_spelling()

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=text)

        with ix.searcher() as s:
            r = s.reader()
            fieldobj = schema["text"]
            words = [fieldobj.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]

            words = [fieldobj.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]

            # suggest() automatically looks in the spell_text field because
            # it calls fieldobj.spelling_fieldname() first
            assert s.suggest("text", "renderink") == ["rendering"]

        with ix.writer() as w:
            w.delete_document(0)
Example #8
def add_key_terms(ix):
    s = ix.searcher()
    w = ix.writer()
    stemmer = analysis.StemmingAnalyzer()

    print("Adding key terms...")
    last_book = None
    for doc_num in s.document_numbers():
        fields = s.stored_fields(doc_num)
        if fields['book_name'] != last_book:
            last_book = fields['book_name']
            print(last_book)
        m = re.search(r'session (\d+)', fields['session'], flags=re.IGNORECASE)
        is_session_num = lambda k: re.match(r'{0}(st|nd|rd|th)?'.format(m.group(1)), k) if m else False
        key_terms = [k for k, v in s.key_terms([doc_num], 'key_terms_content', numterms=10) if not is_session_num(k)]
        stemmed = [t.text for t in stemmer(' '.join(key_terms))]

        final_terms = []
        final_stemmed = set()
        for (term, stemmed_term) in zip(key_terms, stemmed):
            if stemmed_term not in final_stemmed:
                final_terms.append(term)
                final_stemmed.add(stemmed_term)

        fields['key_terms'] = final_terms
        fields['stemmed'] = fields['key_terms_content']
        fields['exact'] = fields['key_terms_content']
        fields['common'] = fields['key_terms_content']
        del fields['key_terms_content']
        w.delete_document(doc_num)
        w.add_document(**fields)
    w.commit()
Example #9
def test_issue324():
    sa = analysis.StemmingAnalyzer()
    result = highlight.highlight(u("Indexed!\n1"), [u("index")],
                                 sa,
                                 fragmenter=highlight.ContextFragmenter(),
                                 formatter=highlight.UppercaseFormatter())
    assert result == "INDEXED!\n1"
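A hedged variation on the same whoosh.highlight.highlight() call, swapping the UppercaseFormatter for an HtmlFormatter (a sketch, not part of the original test):

from whoosh import analysis, highlight

sa = analysis.StemmingAnalyzer()
result = highlight.highlight("Indexed!\n1", ["index"], sa,
                             fragmenter=highlight.ContextFragmenter(),
                             formatter=highlight.HtmlFormatter())
# The matched token should come back wrapped in <strong class="match term0"> tags.
print(result)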
Example #10
def test_pickle_schema():
    from whoosh import analysis
    from whoosh.support.charset import accent_map
    from whoosh.compat import dumps

    freetext_analyzer = (analysis.StemmingAnalyzer()
                         | analysis.CharsetFilter(accent_map))

    schema = fields.Schema(path=fields.ID(stored=True, unique=True),
                           file_mtime=fields.DATETIME(stored=True),
                           name=fields.TEXT(stored=False, field_boost=2.0),
                           description=fields.TEXT(stored=False,
                                                   field_boost=1.5,
                                                   analyzer=freetext_analyzer),
                           content=fields.TEXT(analyzer=freetext_analyzer))

    # Try to make some sentences that will require stemming
    docs = [
        u"The rain in spain falls mainly in the plain",
        u"Plainly sitting on the plain",
        u"Imagine a greatly improved sentence here"
    ]

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for doc in docs:
                w.add_document(description=doc, content=doc)

        assert dumps(schema, 2)

        with ix.reader() as r:
            assert dumps(r.schema, 2)
Example #11
def test_correct_spell_field():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"rendering shading modeling reactions")

        with ix.searcher() as s:
            text = s.schema["text"]
            spell_text = s.schema["spell_text"]

            r = s.reader()
            words = [text.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]

            words = [spell_text.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]

            qp = QueryParser("text", s.schema)
            qtext = u"renderink"
            q = qp.parse(qtext, s.schema)

            r = s.search(q)
            assert len(r) == 0

            c = s.correct_query(q, qtext)
            assert c.string == "rendering"
            assert c.query == query.Term("text", "rendering")

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(
                hf) == '<strong class="c term0">rendering</strong>'
Example #12
def test_memory_codec():
    from whoosh.codec import memory
    from whoosh.searching import Searcher

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(vector=True),
                           b=fields.STORED,
                           c=fields.NUMERIC(stored=True, sortable=True),
                           d=fields.TEXT(analyzer=ana, spelling=True))

    codec = memory.MemoryCodec()
    with codec.writer(schema) as w:
        w.add_document(a=u("alfa bravo charlie"),
                       b="hello",
                       c=100,
                       d=u("quelling whining echoing"))
        w.add_document(a=u("bravo charlie delta"),
                       b=1000,
                       c=200,
                       d=u("rolling timing yelling"))
        w.add_document(a=u("charlie delta echo"),
                       b=5.5,
                       c=300,
                       d=u("using opening pulling"))
        w.add_document(a=u("delta echo foxtrot"),
                       b=True,
                       c=-100,
                       d=u("aching selling dipping"))
        w.add_document(a=u("echo foxtrot india"),
                       b=None,
                       c=-200,
                       d=u("filling going hopping"))

    reader = codec.reader(schema)
    s = Searcher(reader)

    assert ("a", "delta") in reader
    q = query.Term("a", "delta")
    r = s.search(q)
    assert len(r) == 3
    assert [hit["b"] for hit in r] == [1000, 5.5, True]

    assert (" ".join(
        s.field_terms("a")) == "alfa bravo charlie delta echo foxtrot india")

    cfield = schema["c"]
    c_sortables = cfield.sortable_terms(reader, "c")
    c_values = [cfield.from_bytes(t) for t in c_sortables]
    assert c_values == [-200, -100, 100, 200, 300]

    assert reader.has_column("c")
    c_values = list(reader.column_reader("c"))
    assert c_values == [100, 200, 300, -100, -200]

    assert s.has_vector(2, "a")
    v = s.vector(2, "a")
    assert " ".join(v.all_ids()) == "charlie delta echo"
Example #13
    def __init__(self, analyzer=NOT_PROVIDED, **kwargs):
        if kwargs.get("facet_class") is None:
            kwargs["facet_class"] = FacetCharField

        # use StemmingAnalyzer by default
        kwargs["analyzer"] = (
            analysis.StemmingAnalyzer() if analyzer is NOT_PROVIDED else analyzer
        )

        super().__init__(**kwargs)
Example #14
def test_bypass_stemming2():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(content=fields.TEXT(analyzer=ana, spelling=True))

    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(
            content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study00:00:00"))
        w.add_document(content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study"))
        w.add_document(content=u("This is the first document we've added!"))
Example #15
    def create_whoosh_index(cls):
        indexdir = os.path.sep.join(
            [config['datadir'], cls.module_dir, 'index'])
        if not os.path.exists(indexdir):
            os.mkdir(indexdir)

        print "Creating a new index"
        ana = analysis.StemmingAnalyzer()
        schema = Schema(title=TEXT(stored=True),
                        basefile=ID(stored=True, unique=True),
                        content=TEXT)
        # FIXME: Get a keyword list, correct title, and list of treaty
        # references (celex nums as keywords or uris or...)
        whoosh_ix = create_in(indexdir, schema)

        base_dir = config['datadir']

        from time import time

        for basefile in cls.get_iterable_for("relate_all", base_dir):
            if not ("J" in basefile or "A" in basefile or "K" in basefile):
                continue
            readstart = time()
            # just save the text from the document, strip out the tags
            from BeautifulSoup import BeautifulSoup
            m = cls.re_celexno.match(basefile)
            year = m.group(2)
            parsed_file = os.path.sep.join([
                base_dir, cls.module_dir, u'parsed', year, basefile + '.xhtml'
            ])

            soup = BeautifulSoup(open(parsed_file).read())
            text = ''.join(soup.findAll(text=True))
            # Skip the first 150 chars (XML junk) and normalize space
            text = ' '.join(text[150:].split())
            if text:
                indexstart = time()
                writer = whoosh_ix.writer()
                writer.update_document(title="Case " + basefile,
                                       basefile=basefile,
                                       content=text)
                writer.commit()
                print "Added %s '%s...' %.1f kb in %.3f + %.3f s" % (
                    basefile, text[:39], len(text) / 1024,
                    indexstart - readstart, time() - indexstart)
            else:
                print "Noadd %s (no text)" % (basefile)

        searcher = whoosh_ix.searcher()
        results = searcher.find("content",
                                "quantitative imports equivalent prohibited",
                                limit=10)
        for i in range(len(results)):
            print "%s: %s" % (results[i]['title'], results.score(i))
Example #16
def make_index(basedir,
               ixdir,
               procs=4,
               limitmb=128,
               multisegment=True,
               glob="*.mrc"):
    if not os.path.exists(ixdir):
        os.mkdir(ixdir)

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))
    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(
        title=fields.TEXT(analyzer=ana),
        author=fields.TEXT(phrase=False),
        subject=fields.TEXT(analyzer=ana, phrase=False),
        file=fields.STORED,
        pos=fields.STORED,
    )

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor" %
          (procs, limitmb))
    c = 0
    t = now()
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        filenames = [
            filename for filename in os.listdir(basedir)
            if fnmatch.fnmatch(filename, glob)
        ]
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            for x, pos in read_file(f, mfields):
                w.add_document(title=uni(title(x)),
                               author=uni(author(x)),
                               subject=uni(subjects(x)),
                               file=filename,
                               pos=pos)
                c += 1
            f.close()
        print("Committing...")
    print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))
Example #17
 def whoosh_schema(self):
     ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None)
     storebody = self.options.storebody
     schema = fields.Schema(body=fields.TEXT(analyzer=ana, stored=storebody),
                            filepos=fields.STORED,
                            date=fields.ID(stored=True),
                            frm=fields.ID(stored=True),
                            to=fields.IDLIST(stored=True),
                            subject=fields.TEXT(stored=True),
                            cc=fields.IDLIST,
                            bcc=fields.IDLIST)
     return schema
Example #18
def test_spellable_list():
    # Make sure a spellable field works with a list of pre-analyzed tokens

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(Location=fields.STORED, Lang=fields.STORED,
                           Title=fields.TEXT(spelling=True, analyzer=ana))
    ix = RamStorage().create_index(schema)

    doc = {'Location': '1000/123', 'Lang': 'E',
           'Title': ['Introduction', 'Numerical', 'Analysis']}

    with ix.writer() as w:
        w.add_document(**doc)
Example #19
def test_bypass_stemming():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("rendering shading modeling reactions"))
    w.commit()

    with ix.reader() as r:
        assert_equal(list(r.lexicon("text")),
                     ["model", "reaction", "render", "shade"])
        assert_equal(list(r.word_graph("text").flatten_strings()),
                     ["modeling", "reactions", "rendering", "shading"])
Example #20
def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create analyzer used for tokenizing and normalizing tokens
    # 000, 001, 010, 011,
    my_analyzers = [(analysis.RegexTokenizer()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                     | analysis.StopFilter()), (analysis.StemmingAnalyzer())]
    # Create schemas
    schemas = []
    for my_analyzer in my_analyzers:
        schema = Schema(url=ID(stored=True),
                        body=TEXT(stored=True, analyzer=my_analyzer))
        schemas.append(schema)

    # Setup index
    ixs = []
    for i, my_analyzer in enumerate(my_analyzers):
        whoosh_dir_current = whoosh_dir + str(i) + '/'
        os.makedirs(whoosh_dir_current, exist_ok=True)
        ix = index.create_in(whoosh_dir_current, schemas[i])
        ixs.append(ix)

    # Clear index
    writers = []
    for i, my_analyzer in enumerate(my_analyzers):
        writer = ixs[i].writer()
        writer.commit(mergetype=writing.CLEAR)
        writer = ixs[i].writer()
        writers.append(writer)

    # Index documents
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            print('.', end='')
            with open(text_file) as tf:
                body = tf.read()
                url = text_file.replace(text_dir, "")
                for writer in writers:
                    writer.add_document(url=url, body=body)
                # print("Added", url)
                loaded += 1

    for writer in writers:
        writer.commit()

    print("\n\nLoaded", loaded, "documents")
Example #21
 def get_index(self):
     stem_ana = analysis.StemmingAnalyzer()
     schema = fields.Schema(
         id=fields.ID(unique=True),
         datetime=fields.DATETIME(sortable=True),
         reply=fields.BOOLEAN,
         retweet=fields.BOOLEAN,
         text=fields.TEXT(analyzer=stem_ana, stored=True)
     )
     index_dir = os.path.join(self.dir, "index")
     if os.path.exists(index_dir):
         self.index = index.open_dir(index_dir)
     else:
         os.mkdir(index_dir)
         self.index = index.create_in(index_dir, schema)
Example #22
def open_index():
    from whoosh import index, fields as f
    if os.path.isdir(app.config['WHOOSH_INDEX']):
        return index.open_dir(app.config['WHOOSH_INDEX'])
    os.mkdir(app.config['WHOOSH_INDEX'])
    analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map)
    schema = f.Schema(url=f.ID(stored=True, unique=True),
                      id=f.ID(stored=True),
                      title=f.TEXT(stored=True,
                                   field_boost=2.0,
                                   analyzer=analyzer),
                      type=f.ID(stored=True),
                      keywords=f.KEYWORD(commas=True),
                      content=f.TEXT(analyzer=analyzer))
    return index.create_in(app.config['WHOOSH_INDEX'], schema)
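A hedged sketch of what the StemmingAnalyzer | CharsetFilter(accent_map) pipeline above does (standalone, assuming only whoosh and its bundled accent_map):

from whoosh import analysis
from whoosh.support.charset import accent_map

ana = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map)
# accent_map folds accented characters to their ASCII equivalents, so "Café"
# and "Cafe" should normalize to the same indexed term.
print([t.text for t in ana("Café")])
print([t.text for t in ana("Cafe")])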
Example #23
def test_spelling_field_order():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT,
                           b=fields.TEXT(analyzer=ana),
                           c=fields.TEXT,
                           d=fields.TEXT(analyzer=ana),
                           e=fields.TEXT(analyzer=ana),
                           f=fields.TEXT)
    ix = RamStorage().create_index(schema)

    domain = u("alfa bravo charlie delta").split()
    w = ix.writer()
    for ls in permutations(domain):
        value = " ".join(ls)
        w.add_document(a=value, b=value, c=value, d=value, e=value, f=value)
    w.commit()
Example #24
def test_very_long_words():
    import sys
    length = int(sys.getrecursionlimit() * 1.5)

    strings1 = [u(chr(i) * length) for i in range(65, 70)]
    strings2 = [u(chr(i) * length) for i in range(71, 75)]

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for string in strings1:
            w.add_document(text=string)

    with ix.writer() as w:
        for string in strings2:
            w.add_document(text=string)
        w.optimize = True
Example #25
def test_multivalue():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(id=fields.STORED,
                           date=fields.DATETIME,
                           num=fields.NUMERIC,
                           txt=fields.TEXT(analyzer=ana))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=1, date=datetime(2001, 1, 1), num=5)
        w.add_document(id=2,
                       date=[datetime(2002, 2, 2),
                             datetime(2003, 3, 3)],
                       num=[1, 2, 3, 12])
        w.add_document(txt=u("a b c").split())

    with ix.reader() as r:
        assert ("num", 3) in r
        assert ("date", datetime(2003, 3, 3)) in r
        assert " ".join(r.field_terms("txt")) == "a b c"
Example #26
def test_missing_suggestion():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(content=fields.TEXT(analyzer=ana, spelling=True),
                           organism=fields.ID)
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        w.add_document(organism=u("hs"), content=u("cells"))
        w.add_document(organism=u("hs"), content=u("cell"))

    with ix.searcher() as s:
        r = s.reader()
        assert r.has_word_graph("content")
        gr = r.word_graph("content")
        assert list(gr.flatten()) == [b("cell"), b("cells")]

        c = s.corrector("content")
        # Note that corrector won't suggest the word you submit even though it's
        # in the index
        assert c.suggest("cell") == ["cells"]
Example #27
def test_snippets():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(stored=True, analyzer=ana))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u(
        "Lay out the rough animation by creating the important poses where they occur on the timeline."
    ))
    w.add_document(text=u(
        "Set key frames on everything that's key-able. This is for control and predictability: you don't want to accidentally leave something un-keyed. This is also much faster than selecting the parameters to key."
    ))
    w.add_document(text=u(
        "Use constant (straight) or sometimes linear transitions between keyframes in the channel editor. This makes the character jump between poses."
    ))
    w.add_document(text=u(
        "Keying everything gives quick, immediate results. But it can become difficult to tweak the animation later, especially for complex characters."
    ))
    w.add_document(text=u(
        "Copy the current pose to create the next one: pose the character, key everything, then copy the keyframe in the playbar to another frame, and key everything at that frame."
    ))
    w.commit()

    target = [
        "Set KEY frames on everything that's KEY-able",
        "Copy the current pose to create the next one: pose the character, KEY everything, then copy the keyframe in the playbar to another frame, and KEY everything at that frame",
        "KEYING everything gives quick, immediate results"
    ]

    with ix.searcher() as s:
        qp = qparser.QueryParser("text", ix.schema)
        q = qp.parse(u("key"))
        r = s.search(q, terms=True)
        r.fragmenter = highlight.SentenceFragmenter()
        r.formatter = highlight.UppercaseFormatter()

        assert sorted([hit.highlights("text", top=1)
                       for hit in r]) == sorted(target)
Example #28
 def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
     from haystack import site
     results = []
     
     # It's important to grab the hits first before slicing. Otherwise, this
     # can cause pagination failures.
     hits = len(raw_page)
     
     facets = {}
     spelling_suggestion = None
     indexed_models = site.get_indexed_models()
     
     for doc_offset, raw_result in enumerate(raw_page):
         score = raw_page.score(doc_offset) or 0
         app_label, model_name = raw_result['django_ct'].split('.')
         additional_fields = {}
         model = get_model(app_label, model_name)
         
         if model and model in indexed_models:
             for key, value in raw_result.items():
                 index = site.get_index(model)
                 string_key = str(key)
                 
                 if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                     # Special-cased due to the nature of KEYWORD fields.
                     if isinstance(index.fields[string_key], MultiValueField):
                         if value is None or len(value) == 0:
                             additional_fields[string_key] = []
                         else:
                             additional_fields[string_key] = value.split(',')
                     else:
                         additional_fields[string_key] = index.fields[string_key].convert(value)
                 else:
                     additional_fields[string_key] = self._to_python(value)
             
             del(additional_fields['django_ct'])
             del(additional_fields['django_id'])
             
             if highlight:
                 from whoosh import analysis
                 from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                 sa = analysis.StemmingAnalyzer()
                 terms = [term.replace('*', '') for term in query_string.split()]
                 
                 additional_fields['highlighted'] = {
                     self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                 }
             
             result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
             results.append(result)
         else:
             hits -= 1
     
     if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
         if spelling_query:
             spelling_suggestion = self.create_spelling_suggestion(spelling_query)
         else:
             spelling_suggestion = self.create_spelling_suggestion(query_string)
     
     return {
         'results': results,
         'hits': hits,
         'facets': facets,
         'spelling_suggestion': spelling_suggestion,
     }
Example #29
def test_analyzing_terms():
    schema = fields.Schema(text=fields.TEXT(analyzer=analysis.StemmingAnalyzer()))
    qp = default.QueryParser("text", schema)
    q = qp.parse(u("Indexed!"))
    assert_equal(q.__class__, query.Term)
    assert_equal(q.text, "index")
Example #30
    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [
                        term.replace('*', '') for term in query_string.split()
                    ]

                    additional_fields['highlighted'] = {
                        self.content_field_name: [
                            highlight(
                                additional_fields.get(self.content_field_name),
                                terms, sa, ContextFragmenter(terms),
                                UppercaseFormatter())
                        ],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }