Example #1
from nose.tools import assert_equal

from whoosh import fields, query
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage


def test_filter():
    schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, path=u("/a/1"), text=u("alfa bravo charlie"))
    w.add_document(id=2, path=u("/b/1"), text=u("bravo charlie delta"))
    w.add_document(id=3, path=u("/c/1"), text=u("charlie delta echo"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=4, path=u("/a/2"), text=u("delta echo alfa"))
    w.add_document(id=5, path=u("/b/2"), text=u("echo alfa bravo"))
    w.add_document(id=6, path=u("/c/2"), text=u("alfa bravo charlie"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=7, path=u("/a/3"), text=u("bravo charlie delta"))
    w.add_document(id=8, path=u("/b/3"), text=u("charlie delta echo"))
    w.add_document(id=9, path=u("/c/3"), text=u("delta echo alfa"))
    w.commit(merge=False)

    with ix.searcher() as s:
        fq = query.Or([query.Prefix("path", "/a"),
                       query.Prefix("path", "/b")])
        r = s.search(query.Term("text", "alfa"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 4, 5])

        r = s.search(query.Term("text", "bravo"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 2, 5, 7, ])
Example #2
from nose.tools import assert_equal

from whoosh import query


# make_index() is a fixture defined elsewhere in the original test module; it
# builds a small index whose documents carry string ids such as "format".
def test_searching():
    with make_index().searcher() as s:

        def _runq(q, result, **kwargs):
            r = s.search(q, **kwargs)
            assert_equal([d["id"] for d in r], result)

        _runq(query.Term("text", u("format")), ["format", "vector"])
        _runq(query.Term("text", u("the")),
              ["fieldtype", "format", "const", "vector", "stored"])
        _runq(query.Prefix("text", u("st")), ["format", "vector", "stored"])
        _runq(query.Wildcard("id", u("*st*")), ["stored", "const"])
        _runq(query.TermRange("id", u("c"), u("s")),
              ["fieldtype", "format", "const"])
        _runq(query.NumericRange("subs", 10, 100),
              ["fieldtype", "format", "vector", "scorable"])
        _runq(query.Phrase("text", ["this", "field"]),
              ["scorable", "unique", "stored"],
              limit=None)
        _runq(query.Every(), [
            "fieldtype", "format", "vector", "scorable", "stored", "unique",
            "const"
        ])
        _runq(query.Every("subs"), [
            "fieldtype", "format", "vector", "scorable", "stored", "unique",
            "const"
        ])
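The _runq() helper runs a query and compares the hit ids against an expected list. query.Every() matches all documents, while query.Every("subs") matches only documents that have at least one term in the subs field, which in this fixture is still every document.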
Example #3
from whoosh import fields, query
from whoosh.filedb.filestore import RamStorage


def test_termdocs():
    schema = fields.Schema(key=fields.TEXT, city=fields.ID)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(key=u"ant", city=u"london")
        w.add_document(key=u"anteater", city=u"roma")
        w.add_document(key=u"bear", city=u"london")
        w.add_document(key=u"bees", city=u"roma")
        w.add_document(key=u"anorak", city=u"london")
        w.add_document(key=u"antimatter", city=u"roma")
        w.add_document(key=u"angora", city=u"london")
        w.add_document(key=u"angels", city=u"roma")

    with ix.searcher() as s:
        cond_q = query.Term("city", u"london")
        pref_q = query.Prefix("key", u"an")
        q = query.And([cond_q, pref_q]).normalize()
        r = s.search(q, scored=False, terms=True)

        field = s.schema["key"]
        terms = [
            field.from_bytes(term) for fieldname, term in r.termdocs
            if fieldname == "key"
        ]
        assert sorted(terms) == [u"angora", u"anorak", u"ant"]
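Searching with terms=True makes whoosh record which indexed terms actually matched each document, so the test can recover the concrete an-prefixed keys (angora, anorak, ant) behind the Prefix query.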
Example #4
from whoosh import fields, query
from whoosh.compat import u
from whoosh.support.testing import TempIndex


def test_multi():
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.KEYWORD(stored=True))
    with TempIndex(schema, "multi") as ix:
        writer = ix.writer()
        # ids 1 and 2 are deleted by the second writer below
        writer.add_document(id=u("1"), content=u("alfa bravo charlie"))
        writer.add_document(id=u("2"), content=u("bravo charlie delta echo"))
        # id 3 is deleted by the third writer below
        writer.add_document(id=u("3"), content=u("charlie delta echo foxtrot"))
        writer.commit()

        writer = ix.writer()
        writer.delete_by_term("id", "1")
        writer.delete_by_term("id", "2")
        writer.add_document(id=u("4"), content=u("apple bear cherry donut"))
        writer.add_document(id=u("5"), content=u("bear cherry donut eggs"))
        # id 6 is deleted by the third writer below
        writer.add_document(id=u("6"), content=u("delta echo foxtrot golf"))
        # contains no terms starting with "d"
        writer.add_document(id=u("7"), content=u("echo foxtrot golf hotel"))
        writer.commit(merge=False)

        writer = ix.writer()
        writer.delete_by_term("id", "3")
        writer.delete_by_term("id", "6")
        writer.add_document(id=u("8"), content=u("cherry donut eggs falafel"))
        writer.add_document(id=u("9"), content=u("donut eggs falafel grape"))
        writer.add_document(id=u("A"), content=u(" foxtrot golf hotel india"))
        writer.commit(merge=False)

        assert ix.doc_count() == 6

        with ix.searcher() as s:
            r = s.search(query.Prefix("content", u("d")), optimize=False)
            assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"]

            r = s.search(query.Prefix("content", u("d")))
            assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"]

            r = s.search(query.Prefix("content", u("d")), limit=None)
            assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"]
Example #5
from whoosh import query
from whoosh.compat import u
from whoosh.qparser import default, plugins


def test_quoted_prefix():
    qp = default.QueryParser("f", None)

    expr = r"(^|(?<=[ (]))(?P<text>\w+|[*]):"
    qp.replace_plugin(plugins.FieldsPlugin(expr))

    q = qp.parse(u('foo url:http://apple.com:8080/bar* baz'))
    assert isinstance(q, query.And)
    assert q[0] == query.Term("f", "foo")
    assert q[1] == query.Prefix("url", "http://apple.com:8080/bar")
    assert q[2] == query.Term("f", "baz")
    assert len(q) == 3
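The replacement FieldsPlugin expression only recognizes word: as a field prefix at the start of the string or after a space or opening parenthesis, so the colons inside http://apple.com:8080/bar do not open new fields and the trailing * still yields a Prefix query on the url field.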
Example #6
from nose.tools import assert_equal

from whoosh import fields, query
from whoosh.compat import u, xrange
from whoosh.filedb.filestore import RamStorage


def test_too_many_prefix_positions():
    from whoosh import matching

    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i in xrange(200):
            text = u("a%s" % i)
            w.add_document(id=i, text=text)

    q = query.Prefix("text", u("a"))
    q.TOO_MANY_CLAUSES = 100

    with ix.searcher() as s:
        m = q.matcher(s)
        assert_equal(m.__class__, matching.ListMatcher)
        assert m.supports("positions")
        items = list(m.items_as("positions"))
        assert_equal([(i, [0]) for i in xrange(200)], items)
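A Prefix query normally expands into one matcher per matching term. Once the term count exceeds TOO_MANY_CLAUSES (lowered to 100 here against 200 matching terms), whoosh pre-reads the postings into a single ListMatcher instead, and the test confirms that position data survives that shortcut.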
Example #7
from nose.tools import assert_equal

from whoosh import fields, query
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage


def test_patterns():
    domain = u("aaron able acre adage aether after ago ahi aim ajax akimbo "
               "alembic all amiga amount ampere").split()
    schema = fields.Schema(word=fields.KEYWORD(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for word in domain:
            w.add_document(word=word)

    with ix.reader() as r:
        assert_equal(list(r.lexicon("word")), domain)

        assert_equal(list(r.expand_prefix("word", "al")), ["alembic", "all"])
        q = query.Prefix("word", "al")
        assert_equal(q.simplify(r).__unicode__(), "(word:alembic OR word:all)")

        q = query.Wildcard("word", "a*[ae]")
        assert_equal(q.simplify(r).__unicode__(),
                     "(word:able OR word:acre OR word:adage OR " +
                     "word:amiga OR word:ampere)")
        assert_equal(q._find_prefix(q.text), "a")

        q = query.Regex("word", "am.*[ae]")
        assert_equal(q.simplify(r).__unicode__(),
                     "(word:amiga OR word:ampere)")
        assert_equal(q._find_prefix(q.text), "am")

        q = query.Regex("word", "able|ago")
        assert_equal(q.simplify(r).__unicode__(), "(word:able OR word:ago)")
        assert_equal(q._find_prefix(q.text), "")

        # special case: ? may mean "zero occurrences"
        q = query.Regex("word", "ah?i")
        assert_equal(q.simplify(r).__unicode__(), "(word:ahi OR word:aim)")
        assert_equal(q._find_prefix(q.text), "a")

        # special case: * may mean "zero occurrences"
        q = query.Regex("word", "ah*i")
        assert_equal(q.simplify(r).__unicode__(), "(word:ahi OR word:aim)")
        assert_equal(q._find_prefix(q.text), "a")
Example #8
# parse_query() override on a (presumably custom) field type: whatever query
# string the parser hands this field is run through the class's own
# process_text() helper and turned into a Prefix query.
def parse_query(self, fieldname, qstring, boost=1.0):
    text = self.process_text(qstring)
    from whoosh import query
    return query.Prefix(fieldname, text, boost=boost)
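In whoosh, the query parser hands the raw query string to a field's parse_query() when the field reports itself as self-parsing; an override like this one then turns every query against that field into a prefix search.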
Example #9
# _TEXT_FIELD is a module-level constant defined elsewhere in the original
# source. Both branches of the original if/else built the identical query,
# so the length check is folded away here.
def prefix_query(query_string):
    return query.Prefix(_TEXT_FIELD, query_string)
Example #10
# make_prefix() from a custom query-parser class: falls back to the parser's
# default field, runs the text through its analyzer, then builds the query.
def make_prefix(self, fieldname, text):
    fieldname = fieldname or self.default_field
    text = self._analyze(fieldname, text)
    return query.Prefix(fieldname, text)
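Unlike Example #8, where the prefix logic lives on the field type, this hook sits in the parser itself, so the caller can target any field by passing fieldname explicitly or fall back to the parser's default field.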
Example #11
import re

import pandas as pd

from whoosh import index, query


def filter_corpus(corpus_ind_dir, query_list, year_from, year_to):
    ix = index.open_dir(corpus_ind_dir)  # load the index

    with ix.searcher() as searcher:

        parser = QueryParser("content", ix.schema)
        term_list_T = []
        term_list_Y = []

        for t in query_list:
            t = re.sub(r'[^a-zA-Z0-9_ ]', '', t).lower()
            splitted = t.split()
            if len(splitted) > 1:
                term_list_T.append(query.Phrase("content", splitted))
            else:
                term_list_T.append(query.Term("content", t))

        for y in range(year_from, year_to + 1):
            term_list_Y.append(query.Term("year", str(y)))

        q1 = query.Or(term_list_T)
        q2 = query.Or(term_list_Y)

        q_f = query.And([q1, q2])

        results = searcher.search(q_f, limit=None)

        relevant_article_ids = []
        for r in results:
            article_id = r["id"].split('_')[0]
            if article_id not in relevant_article_ids:
                relevant_article_ids.append(article_id)

        new_corpus = []
        for r_article_id in sorted(relevant_article_ids):
            # Sentence ids have the form "<article>_<n>", so a Prefix query
            # on "<article>_" gathers every sentence row of one article.
            article_id = r_article_id + "_"
            q = query.Prefix("id", article_id)
            x = 0
            row_data = {}
            for r in searcher.search(q, limit=None):
                if x == 0:
                    # First hit: copy all stored fields for the article.
                    for key in r:
                        if key == "content":
                            row_data["sentences"] = r['content']
                            x += 1
                        elif key == "id":
                            row_data["id"] = article_id[:-1]
                        else:
                            row_data[key] = r[key]
                else:
                    # Later hits: append their sentences to the first row.
                    sent = " " + r['content']
                    row_data["sentences"] += sent
            new_corpus.append(row_data)

        pd_save = pd.DataFrame.from_records(new_corpus)
        cols = ['id'] + [col for col in pd_save if col != 'id']
        pd_save = pd_save[cols]
        return pd_save.to_csv(encoding='utf-8')
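To close, a minimal self-contained sketch of query.Prefix against an in-memory index; the field name, the sample terms, and the expected output are illustrative and not taken from the examples above.

from whoosh import fields, query
from whoosh.filedb.filestore import RamStorage

# Illustrative schema: one stored ID field, i.e. one term per document.
schema = fields.Schema(name=fields.ID(stored=True))
ix = RamStorage().create_index(schema)

with ix.writer() as w:
    w.add_document(name=u"prefetch")
    w.add_document(name=u"prefix")
    w.add_document(name=u"suffix")

with ix.searcher() as s:
    # Prefix matches every term in "name" that starts with "pre".
    hits = s.search(query.Prefix("name", u"pre"))
    print(sorted(hit["name"] for hit in hits))  # ['prefetch', 'prefix']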