Exemple #1
0
 def create_index(self, path, *args, **kwargs):
     """Remember ``path`` and open a writable index database there.

     The database is opened with ``DB_CREATE_OR_OVERWRITE``.  Extra
     positional and keyword arguments are accepted but not used here.
     """
     self._path = path
     open_flags = xapian.DB_CREATE_OR_OVERWRITE
     self.index = xapian.WritableDatabase(path, open_flags)
Exemple #2
0
def test_postingsource():
    """Simple test of the PostingSource class.

    Builds a ten-document database on disk, runs a query backed by a custom
    posting source that yields only odd docids, and checks the docids in the
    resulting match set.  The test database is closed and its directory
    removed even when a check fails, so a failure does not leave it behind.

    """
    class OddPostingSource(xapian.PostingSource):
        """Posting source yielding the odd docids 1, 3, 5, ... up to a limit."""

        def __init__(self, limit):
            xapian.PostingSource.__init__(self)
            self.limit = limit

        def init(self, db):
            # Start before the first odd docid; next() steps by two.
            self.current = -1

        def get_termfreq_min(self):
            return 0

        def get_termfreq_est(self):
            return self.limit // 2

        def get_termfreq_max(self):
            return self.limit

        def next(self, minweight):
            self.current += 2

        def at_end(self):
            return self.current > self.limit

        def get_docid(self):
            return self.current

    dbpath = 'db_test_postingsource'
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OVERWRITE)
    try:
        for _ in range(10):
            db.add_document(xapian.Document())

        # Do a dance to check that the posting source doesn't get
        # dereferenced too soon in various cases.
        def mkenq(db):
            # First - check that it's kept when the source goes out of scope.
            def mkquery():
                source = OddPostingSource(10)
                return xapian.Query(xapian.Query.OP_OR,
                                    [xapian.Query(source)])

            # Check that it's kept when the query goes out of scope.
            def submkenq():
                query = mkquery()
                enquire = xapian.Enquire(db)
                enquire.set_query(query)
                return enquire

            # Check it's kept when the query is retrieved from enquire and
            # put into a new enquire.
            def submkenq2():
                enq1 = submkenq()
                enquire = xapian.Enquire(db)
                enquire.set_query(enq1.get_query())
                return enquire

            return submkenq2()

        enquire = mkenq(db)
        mset = enquire.get_mset(0, 10)

        expect([item.docid for item in mset], [1, 3, 5, 7, 9])
    finally:
        # Always release the database and delete its directory, including
        # when one of the checks above raised.
        db.close()
        shutil.rmtree(dbpath)
Exemple #3
0
def rebuild_database(pathname, debian_sources=True, appstream_sources=False):
    """Rebuild the xapian database at ``pathname`` and swap it into place.

    A fresh database is written to ``pathname + "_rb"``.  Once it is
    complete, the current database is moved aside to ``pathname + "_old"``,
    the rebuilt one is moved to ``pathname`` and the old copy is deleted.

    :param pathname: directory of the live database; must be writable.
    :param debian_sources: when true, fill the new database via ``update()``.
    :param appstream_sources: when true, fill the new database via
        ``update_from_appstream_xml()``.
    :return: ``True`` when the rebuilt database was swapped in and the old
        copy removed; ``False`` when a permission or filesystem problem
        prevented that (a warning is logged in each such case).
    """
    cache = get_pkg_info()
    cache.open()
    old_path = pathname + "_old"
    rebuild_path = pathname + "_rb"

    if not os.path.exists(rebuild_path):
        try:
            os.makedirs(rebuild_path)
        except OSError:
            LOG.warn("Problem creating rebuild path '%s'." % rebuild_path)
            LOG.warn("Please check you have the relevant permissions.")
            return False

    # check permission
    if not os.access(pathname, os.W_OK):
        LOG.warn("Cannot write to '%s'." % pathname)
        LOG.warn("Please check you have the relevant permissions.")
        return False

    # check if an old, no longer required version of the db still exists
    # on the filesystem
    if os.path.exists(old_path):
        LOG.warn("Existing xapian old db was not previously cleaned: '%s'." %
                 old_path)
        if os.access(old_path, os.W_OK):
            # remove the leftover db before beginning
            shutil.rmtree(old_path)
        else:
            LOG.warn("Cannot write to '%s'." % old_path)
            LOG.warn("Please check you have the relevant permissions.")
            return False

    # write it
    db = xapian.WritableDatabase(rebuild_path, xapian.DB_CREATE_OR_OVERWRITE)

    if debian_sources:
        update(db, cache)
    if appstream_sources:
        update_from_appstream_xml(db, cache)

    # write the database schema version into the database metadata
    db.set_metadata("db-schema-version", DB_SCHEMA_VERSION)
    # update the mo file stamp for the langpack checks
    mofile = gettext.find("app-install-data")
    if mofile:
        mo_time = os.path.getctime(mofile)
        db.set_metadata("app-install-mo-time", str(mo_time))
    db.flush()

    # use shutil.move() instead of os.rename() as this will automatically
    # figure out if it can use os.rename or needs to do the move "manually"
    try:
        shutil.move(pathname, old_path)
        shutil.move(rebuild_path, pathname)
        shutil.rmtree(old_path)
        return True
    except (OSError, IOError, shutil.Error):
        # Only filesystem errors are handled here; KeyboardInterrupt and
        # programming errors are deliberately left to propagate.
        LOG.warn("Cannot copy refreshed database to correct location: '%s'." %
                 pathname)
        # If the live database was already moved aside but the rebuilt one
        # never arrived, put the previous database back so that callers are
        # not left without any database at ``pathname``.
        if os.path.exists(old_path) and not os.path.exists(pathname):
            try:
                shutil.move(old_path, pathname)
            except (OSError, IOError, shutil.Error):
                LOG.warn("Cannot restore previous database from '%s'." %
                         old_path)
        return False
Exemple #4
0
    def __init__(self, root, writable=False, create=False, force=False):
        """Open the Xapers database stored under ``root``.

        The database lives in ``<root>/.xapers/xapian``.  When ``.xapers`` is
        missing it is created only if ``create`` is true; a non-empty
        existing root is then refused unless ``force`` is also true.

        Raises DatabaseInitializationError when the root exists but cannot be
        used as requested, DatabaseUninitializedError when the root does not
        exist, and DatabaseLockError when a writable open hits a locked
        xapian database.
        """
        # xapers root, expanded and made absolute
        self.root = os.path.abspath(os.path.expanduser(root))

        # xapers db directory
        xapers_path = os.path.join(self.root, '.xapers')

        # xapers directory initialization
        if not os.path.exists(xapers_path):
            root_exists = os.path.exists(self.root)
            if not create:
                if root_exists:
                    raise DatabaseInitializationError(
                        "Xapers directory '%s' does not contain a database." %
                        (self.root))
                raise DatabaseUninitializedError(
                    "Xapers directory '%s' not found." % (self.root))
            if root_exists and os.listdir(self.root) and not force:
                raise DatabaseInitializationError(
                    'Uninitialized Xapers root directory exists but is not empty.'
                )
            os.makedirs(xapers_path)

        # the Xapian db
        xapian_path = os.path.join(xapers_path, 'xapian')
        if not writable:
            self.xapian = xapian.Database(xapian_path)
        else:
            try:
                self.xapian = xapian.WritableDatabase(
                    xapian_path, xapian.DB_CREATE_OR_OPEN)
            except xapian.DatabaseLockError:
                raise DatabaseLockError("Xapers database locked.")

        stemmer = xapian.Stem("english")

        # The Xapian TermGenerator
        # http://trac.xapian.org/wiki/FAQ/TermGenerator
        self.term_gen = xapian.TermGenerator()
        self.term_gen.set_stemmer(stemmer)

        # The Xapian QueryParser
        parser = self.query_parser = xapian.QueryParser()
        parser.set_database(self.xapian)
        parser.set_stemmer(stemmer)
        parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
        parser.set_default_op(xapian.Query.OP_AND)

        # boolean internal prefixes
        for field, term_prefix in self.BOOLEAN_PREFIX.items():
            parser.add_boolean_prefix(field, term_prefix)

        # prefixes that can be applied multiply to the same document (like
        # tags) pass False as the third argument so the filter grouping
        # uses AND:
        # https://xapian.org/docs/apidoc/html/classXapian_1_1QueryParser.html#a67d25f9297bb98c2101a03ff3d60cf30
        for field, term_prefix in self.BOOLEAN_PREFIX_MULTI.items():
            parser.add_boolean_prefix(field, term_prefix, False)

        # probabilistic prefixes
        for field, term_prefix in self.PROBABILISTIC_PREFIX.items():
            parser.add_prefix(field, term_prefix)

        # value facets
        for field, slot in self.NUMBER_VALUE_FACET.items():
            range_processor = xapian.NumberValueRangeProcessor(
                slot, field + ':')
            parser.add_valuerangeprocessor(range_processor)

        # register known source prefixes
        # FIXME: can we do this by just finding all XSOURCE terms in
        #        db?  Would eliminate dependence on source modules at
        #        search time.
        for source in Sources():
            source_name = source.name
            parser.add_boolean_prefix(
                source_name, self._make_source_prefix(source_name))
Exemple #5
0
def test_all():
    """Smoke-test a broad range of the xapian Python bindings in one pass.

    In order, this exercises: version reporting, Stem, Document data and
    postings, an in-memory WritableDatabase, Query construction and
    descriptions, the database factory functions' error reporting, the
    iterator wrappers (Query, MSet, ESet, Database, Document), MatchDecider
    and ExpandDecider subclasses, QueryParser flags and unicode input,
    stoppers, TermGenerator, DateRangeProcessor, FieldProcessor, a few
    regression checks, database metadata and OP_SCALE_WEIGHT.

    The checks share state (``db``, ``doc``, ``enq``, ``qp``, ``stem``), so
    later sections depend on the earlier ones having run.  The body uses
    Python 2 syntax (a ``print`` statement and the iterator ``.next()``
    method).
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3, 'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1', 'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print "Unhandled constants: ", res
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError, "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    # Document data must round-trip even when it contains a zero byte.
    doc = xapian.Document()
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b", "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    # In-memory database holding the single document built above.
    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR, (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected:

    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.Database, "nosuchdir/nosuchdb", xapian.DB_BACKEND_STUB)
    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.WritableDatabase, "nosuchdir/nosuchdb", xapian.DB_OPEN|xapian.DB_BACKEND_STUB)

    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "/bin/false", "")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "/bin/false", "")

    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)

    # Check Xapian::BAD_VALUENO is wrapped suitably.
    enq.set_collapse_key(xapian.BAD_VALUENO)

    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2, "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found", db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    # A second document is added; it is the only one carrying value "yes" in
    # slot 0, which is what the decider below accepts.
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1, "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith('a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(), "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True, "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError, "Syntax: <expression> AND <expression>", qp.parse_query, "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(qp.parse_query("NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    # This parser (database, AND default op, STEM_SOME, 'en' stemmer) is
    # reused by the unicode and stopper checks that follow.
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(qp.parse_query("foo ox", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM ox OR Zox@2))")

    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM outside OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')

    expect_query(qp.parse_query(u"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT Zt\xc3\xa9st@1)")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    term = doc.termlist().next().term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test SimpleStopper initialised from a file.
    # The stop list is looked up relative to $srcdir, falling back to the
    # current directory when that variable is not set.
    try:
        srcdir = os.environ['srcdir']
    except KeyError:
        srcdir = '.'
    stop = xapian.SimpleStopper(srcdir + '/../shortstop.list')
    expect(stop('a'), True)
    expect(stop('am'), False)
    expect(stop('an'), True)
    expect(stop('the'), True)

    expect_exception(xapian.InvalidArgumentError, None, xapian.SimpleStopper, 'nosuchfile')

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer]) for item in doc.termlist()], [('bar', 1, [2]), ('baz', 1, [3]), ('foo', 2, [1, 4])])


    # Check DateRangeProcessor works
    context("checking that DateRangeProcessor works")
    qp = xapian.QueryParser()
    rpdate = xapian.DateRangeProcessor(1, xapian.RP_DATE_PREFER_MDY, 1960)
    qp.add_rangeprocessor(rpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")
    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam2', testfieldprocessor(), False) # Old-style
    qp.add_boolean_prefix('boolspam3', testfieldprocessor(), '')
    qp.add_boolean_prefix('boolspam4', testfieldprocessor(), 'group')
    qp.add_boolean_prefix('boolspam5', testfieldprocessor(), None)
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    # An exception raised inside the Python FieldProcessor must reach the
    # caller of parse_query().
    expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug fixed in 1.4.4:
    # https://bugs.debian.org/849722
    oqparser.add_boolean_prefix('tag', 'K', '')
    # Make sure other cases also work:
    oqparser.add_boolean_prefix('zag', 'XR', False) # Old-style
    oqparser.add_boolean_prefix('rag', 'XR', None)
    oqparser.add_boolean_prefix('nag', 'XB', '')
    oqparser.add_boolean_prefix('bag', 'XB', 'blergh')
    oqparser.add_boolean_prefix('gag', 'XB', u'blergh')
    oqparser.add_boolean_prefix('jag', 'XB', b'blergh')

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.set_metadata, '', 'Foo')
    expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query('foo'), 5),
                 "5 * foo")
def test_replication_concurrency():
    """Test concurrent replication and modification.

    Starts a ``xapian-replicate-server`` and a ``xapian-replicate`` client as
    subprocesses, repeatedly swaps the master between two source databases
    and commits metadata changes to it, and checks that the replica in the
    ``slave`` directory follows.  The working directory is taken from the
    ``abs_builddir`` environment variable.

    Both subprocesses are killed on exit, whether the checks pass or any
    step (including building the source databases) raises.

    """

    builddir = os.environ['abs_builddir']
    dbsdir = os.path.join(builddir, 'dbs_replication')
    if not os.path.isdir(dbsdir):
        os.makedirs(dbsdir)

    masterpath = os.path.join(dbsdir, 'master')
    firstpath = os.path.join(dbsdir, 'first')
    secondpath = os.path.join(dbsdir, 'second')
    slavepath = os.path.join(dbsdir, 'slave')
    if os.path.isdir(masterpath):
        shutil.rmtree(masterpath)
    if os.path.isdir(slavepath):
        shutil.rmtree(slavepath)
    port = 7876

    expect_exception(
        xapian.DatabaseOpeningError,
        "Couldn't stat '" + dbsdir + "/slave' (No such file or directory)",
        xapian.Database, slavepath)

    clientp = None
    serverp = subprocess.Popen((
        '../../xapian-core/bin/xapian-replicate-server',
        dbsdir,
        '--port=%d' % port,
    ), )

    # Everything after the server has been started runs inside the try block
    # so that the server is always killed, even if database setup fails.
    try:
        doccount1 = 10000
        doccount2 = 1000

        starttime = time.time()
        if not os.path.isdir(firstpath):
            firstdb = xapian.WritableDatabase(firstpath,
                                              xapian.DB_CREATE_OR_OVERWRITE)
            # Make an initial, large database
            print("")
            print("Building initial database ...")
            for num in range(1, doccount1):
                doc = xapian.Document()
                val = 'val%d' % num
                doc.add_value(1, val)
                firstdb.add_document(doc)
                if num % 100000 == 0:
                    print("%d documents..." % num)
            firstdb.set_metadata('dbname', '1')
            firstdb.commit()
            print("built")

        # The secondary database gets modified during the test, so it is
        # cleared and rebuilt on every run.  It does not exist yet on the
        # first run, so only remove it when it is actually there.
        if os.path.isdir(secondpath):
            shutil.rmtree(secondpath)
        seconddb = xapian.WritableDatabase(secondpath,
                                           xapian.DB_CREATE_OR_OVERWRITE)
        # Make second, small database
        print("")
        print("Building secondary database ...")
        for num in range(1, doccount2):
            doc = xapian.Document()
            val = 'val%d' % num
            doc.add_value(1, val)
            seconddb.add_document(doc)
            if num % 100000 == 0:
                print("%d documents..." % num)
        seconddb.set_metadata('dbname', '2')
        seconddb.commit()
        print("built")

        if time.time() - starttime < 1:
            time.sleep(1)  # Give server time to start

        set_master(masterpath, firstpath)
        clientp = subprocess.Popen((
            '../../xapian-core/bin/xapian-replicate',
            '--host=127.0.0.1',
            '--master=master',
            slavepath,
            '--interval=0',
            '--port=%d' % port,
            '-r 0',
        ), )
        time.sleep(1)  # Give client time to start
        expect(xapian.Database(slavepath).get_metadata('dbname'), '1')

        for count in range(10):
            # Test that swapping between databases doesn't confuse replication.
            for count2 in range(2):
                set_master(masterpath, secondpath)
                time.sleep(0.1)
                set_master(masterpath, firstpath)
                time.sleep(0.1)

            # Test making changes to the database.
            set_master(masterpath, secondpath)
            masterdb = xapian.WritableDatabase(masterpath, xapian.DB_OPEN)
            print("making 100 changes")
            for num in range(100):
                masterdb.set_metadata('num%d' % num, str(num + count))
                masterdb.commit()
            print("changes done")
            masterdb.close()

            # Allow time for the replication client to catch up with the
            # changes.
            time.sleep(2)
            expect(xapian.Database(slavepath).get_metadata('dbname'), '2')
            expect(
                xapian.Database(slavepath).get_metadata('num99'),
                str(99 + count))

    finally:
        if clientp is not None:
            os.kill(clientp.pid, 9)
            clientp.wait()
        os.kill(serverp.pid, 9)
        serverp.wait()
Exemple #7
0
db.authenticate('root','root')
db = connection.weibo
print 'pymongo success'

# Stop words: one entry per line of 'ext_stopword.dic', line endings removed.
# The file is closed as soon as it has been read.
with open('ext_stopword.dic') as stopword_file:
    stopwords = set(line.strip('\r\n') for line in stopword_file)
# Emotion list: one UTF-8 entry per line of 'emotionlist.txt', decoded to
# unicode with line endings removed.
with open('emotionlist.txt') as emotion_file:
    emotionlist = [unicode(line.strip('\r\n'), 'utf-8') for line in emotion_file]

if len(sys.argv) != 2:
    print >> sys.stderr, "Usage: %s PATH_TO_DATABASE" % sys.argv[0]
    sys.exit(1)

try:
    # Open the database for update, creating a new database if necessary.
    database = xapian.WritableDatabase(sys.argv[1], xapian.DB_CREATE_OR_OPEN)
    print database,'open database weibo'
    emotionvi = 0
    keywordsvi = 1
    timestampvi = 2
    loctvi = 3
    reploctvi = 4
    emotiononlyvi = 5
    usernamevi = 6
    hashtagsvi = 7
    uidvi = 8
    repnameslistvi = 9
    widvi = 10

    """
    weibos = ''
Exemple #8
0
 def create_index(self):
     """Open the index at ``self.dbpath`` and prepare a term generator.

     The database is opened with ``DB_CREATE_OR_OPEN`` and stored on
     ``self.db``.  ``self.indexer`` is a TermGenerator that uses the "en"
     stemmer.
     """
     open_flags = xapian.DB_CREATE_OR_OPEN
     self.db = xapian.WritableDatabase(self.dbpath, open_flags)
     term_generator = xapian.TermGenerator()
     self.indexer = term_generator
     term_generator.set_stemmer(xapian.Stem("en"))
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import xapian
import sys

if __name__ == '__main__':
    try:
        sample_file = sys.argv[1]
        popcon = xapian.WritableDatabase(sys.argv[2], xapian.DB_OPEN)
    except:
        print "Usage: extract-sample-db sample_file popcon_index"
        exit(1)
    enquire = xapian.Enquire(popcon)
    print sample_file.split("/")
    new_popcon = xapian.WritableDatabase(
        sys.argv[2] + "-" + sample_file.split("/")[-1],
        xapian.DB_CREATE_OR_OVERWRITE)
    print("Popcon repository size: %d" % popcon.get_doccount())
    for submission in open(sample_file):
        print "ID" + submission.strip()
        query = xapian.Query("ID" + submission.strip())
        enquire.set_query(query)
        mset = enquire.get_mset(0, 20)
        for m in mset:
Exemple #10
0
 def _read_write_db(self):
     """Return a read-write xapian database object for the search index.

     The database at ``settings.SEARH_INDEX_PATH`` is opened with
     ``DB_CREATE_OR_OPEN``.
     """
     # NOTE(review): 'SEARH_INDEX_PATH' looks like a misspelling of
     # 'SEARCH_INDEX_PATH'; it is kept because it must match whatever the
     # settings module actually defines - confirm there.
     index_path = settings.SEARH_INDEX_PATH
     return xapian.WritableDatabase(index_path, xapian.DB_CREATE_OR_OPEN)
Exemple #11
0
 def create_database(self):
     """Open the database at ``self._path`` with ``DB_CREATE_OR_OPEN``.

     No reference to the handle is kept, so it is dropped as soon as the
     call returns.
     """
     xapian.WritableDatabase(self._path, xapian.DB_CREATE_OR_OPEN)
Exemple #12
0
def _open_collection(path, rw=READ):
    """Open the xapian collection stored at ``path``.

    With ``rw == READ`` (the default) a read-only ``xapian.Database`` is
    returned.  With ``rw == WRITE`` a ``xapian.WritableDatabase`` opened with
    ``DB_CREATE_OR_OPEN`` is returned.  Any other mode yields ``None``.
    """
    if rw == READ:
        return xapian.Database(path)
    if rw == WRITE:
        return xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN)
    return None
Exemple #13
0



from config import SEARCH_DB_PATH
from mmseg.search import seg_txt_search, seg_title_search, seg_txt_2_dict
from os import makedirs
from os.path import join, exists
import xapian
from collections import defaultdict

# Directory of the 'zsite' index inside SEARCH_DB_PATH; it is created when
# this module is first imported if it does not exist yet.
PATH = join(SEARCH_DB_PATH, 'zsite')
if not exists(PATH):
    makedirs(PATH)

# Module-level writable database handle, opened (or created) at import time
# and used by the functions defined below.
SEARCH_DB = xapian.WritableDatabase(PATH, xapian.DB_CREATE_OR_OPEN)

def flush_db():
    """Call ``flush()`` on the module-level ``SEARCH_DB`` handle."""
    # NOTE(review): presumably this writes pending changes to disk; newer
    # Xapian releases name the same operation commit() - confirm against the
    # Xapian version in use.
    SEARCH_DB.flush()


def index(keyword_iter):
    for id, cid, rank, kw in keyword_iter():
        doc = xapian.Document()
        doc.add_value(0, id)
        doc.add_value(1, xapian.sortable_serialise(rank))
        doc.add_value(2, cid)

        for word, value in kw:
            if word:
                if not word.startswith('>'):
# Import system modules
import os
import xapian
import datetime
# Import custom modules
from query_process_simplified import TextMachine

# Load the Xapian database
# The path is resolved against the current working directory at import time.
databasePath = os.path.abspath('xapian-database')
# NOTE(review): the database is opened writable (DB_OPEN) although the code
# visible below only searches it - confirm whether a read-only
# xapian.Database would be sufficient.
database = xapian.WritableDatabase(databasePath, xapian.DB_OPEN)
# Set slot constants
# The three names are bound to 0, 1 and 2 respectively.
xapian_file_name, xapian_when, xapian_owner_id = xrange(3)


def search(queryString, byDate=False, ownerID=None, extractLength=32):
    # Parse query string
    queryParser = xapian.QueryParser()
    queryParser.set_stemmer(xapian.Stem('english'))
    queryParser.set_database(database)
    queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = queryParser.parse_query(queryString)
    # Set offset and limit for pagination
    offset, limit = 0, database.get_doccount()
    # Start query session
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    # Sort by date
    if byDate:
        enquire.set_sort_by_value(xapian_when)
    if ownerID == None:
        matches = enquire.get_mset(offset, limit)
Exemple #15
0
 def _get_database(self):
     """Return a writable database for ``<self.db_path>/text-index``.

     The database is opened with ``DB_CREATE_OR_OPEN``.
     """
     return xapian.WritableDatabase(
         os.path.join(self.db_path, 'text-index'),
         xapian.DB_CREATE_OR_OPEN,
     )
Exemple #16
0
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import xapian

# Keywords to look up in the index.
kw = ["苹果", "成都"]

# Open the existing on-disk database and prepare a query parser.
db = xapian.WritableDatabase("db/test", xapian.DB_OPEN)
parser = xapian.QueryParser()

# QueryParser.parse_query() expects a single query string, not a list, so
# each keyword is parsed on its own and the resulting sub-queries are
# combined with OR: a document matching any keyword is returned.
query = xapian.Query(xapian.Query.OP_OR,
                     [parser.parse_query(w) for w in kw])
enquire = xapian.Enquire(db)
enquire.set_query(query)
# Print the document id of (at most) the first 30 matches.
for m in enquire.get_mset(0, 30):
    print(m.docid)
Exemple #17
0
 def indexer(self, **kwargs):
     """Open the writable index database and create a term generator.

     The database is the "<indexname>_xapian" entry inside
     ``self.options.dir`` and is created if missing. The handle is stored
     on ``self.database`` and the TermGenerator on ``self.ixer``.
     ``kwargs`` is accepted but not used here.
     """
     options = self.options
     location = os.path.join(options.dir, "%s_xapian" % options.indexname)
     self.database = xapian.WritableDatabase(location,
                                             xapian.DB_CREATE_OR_OPEN)
     self.ixer = xapian.TermGenerator()
Exemple #18
0
    print(data_file, "is complete!")
    return curr_docid


if __name__ == '__main__':
    # Location of the raw corpus and of the Xapian index built from it.
    CORPUS_DIR = './wiki-pages-text/'
    DATA_FILES = os.listdir(CORPUS_DIR)
    DB_PATH = './xdb/'
    DB_NAME = 'wiki.db'

    # Make sure the index directory exists. "Already exists" is fine; any
    # other failure is re-raised.
    try:
        os.mkdir(DB_PATH)
        print("create dir", DB_PATH)
    except (OSError, IOError) as err:
        if err.errno != errno.EEXIST:
            raise

    START = time()
    db_location = join(DB_PATH, DB_NAME)
    # closing() guarantees the database handle is closed when indexing ends,
    # whether it finished or raised.
    with closing(xapian.WritableDatabase(
            db_location, xapian.DB_CREATE_OR_OPEN)) as x_db:

        # The docid counter is threaded through every call: save_2_db takes
        # the current value and hands back the one to continue from.
        curr_docid = 1
        for data_file in tqdm(DATA_FILES):
            if data_file.endswith('.txt'):
                curr_docid = save_2_db(x_db, CORPUS_DIR, data_file,
                                       curr_docid)

    print("took", time() - START, "seconds to finish")
Exemple #19
0
if __name__ == "__main__":
    # Scratch script: opens (or creates) the on-disk index "test.ti". The
    # indexing loop itself is kept below as commented-out reference code.
    import sys
    from time import time
    import linecache
    import glob
    import traceback
    import xapian
    import re

    stem = xapian.Stem("french")
    # Alternative back-ends, kept for reference. The in-memory database used
    # to be opened here and then immediately replaced by the on-disk one, so
    # the call was dead work.
    # ti = xapian.inmemory_open()
    ti = xapian.WritableDatabase("test.ti", xapian.DB_CREATE_OR_OPEN)
    # ti = xapian.quartz_open('test.idx')

    #     start = time()
    #     lines = 0
    #     for f in glob.glob('*.txt'):
    #         print f,
    #         for linenumber, line in enumerate(file(f,'rb')):
    #             lines += 1
    #             line = line.strip()
    #             doc = xapian.Document()
    #             doc.set_data('%12s:%04i'%(f,linenumber))
    #             for word_number, word in enumerate(re.findall(r'\w+',line.lower())):
    #                 doc.add_posting(word,word_number)
    #             ti.add_document(doc)
    #             if linenumber % 100 == 0:
    #                 sys.stdout.write('.')
    #         print 'OK'
    #     print 'Indexing time : %.2fs for %i lines'%(time()-start,lines)
Exemple #20
0
    def spin(self):
        """Draw the next spinner frame over the previous one.

        Emits a backspace followed by the frame selected by the running
        counter (wrapping around the frame sequence), advances the counter
        and flushes stdout so the frame shows up immediately.
        """
        frames = self._spinner
        frame = frames[self._counter % len(frames)]
        self._counter = self._counter + 1
        print('\b' + frame, end='')
        sys.stdout.flush()


spinner = Spinner()

# http://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python
# Byte values considered "text": a handful of control codes (BEL, BS, TAB,
# LF, FF, CR, ESC) plus everything from 0x20 to 0xff except DEL (0x7f).
textchars = bytearray(
    {7, 8, 9, 10, 12, 13, 27}.union(
        code for code in range(0x20, 0x100) if code != 0x7f))


def is_binary_string(bytes):
    """Return True when *bytes* contains any byte outside ``textchars``.

    All text bytes are deleted with ``translate``; whatever is left over is
    non-text, so a non-empty remainder means the data is binary.
    """
    leftover = bytes.translate(None, textchars)
    return bool(leftover)

try:
    xdb = xapian.WritableDatabase("__xdb__", xapian.DB_CREATE_OR_OVERWRITE)

    indexer = xapian.TermGenerator()
    stemmer = xapian.Stem("english")
    indexer.set_stemmer(stemmer)

    # scan the project
    counter_indexed = 0
    pathlist_indexed = []
    for dirpath, _, filenames in os.walk("."):
        if ".git" in dirpath or "__xdb__" in dirpath:
            continue
        for filename in filenames:
            cursor = os.path.join(dirpath, filename)

            # skip non-plain files
Exemple #21
0
def test_all():
    """Smoke test covering a broad slice of the Xapian Python bindings.

    In order, it checks: version reporting, absence of the SWIG ``cvar``
    attribute, Stem/Document basics, an in-memory WritableDatabase, Query
    construction and descriptions, error types raised by the database
    factory calls, the iterator protocols (Query, MSet, ESet, Database,
    Document), MatchDecider and ExpandDecider subclasses, QueryParser
    options, non-ASCII byte strings, stoppers, TermGenerator, value range
    processors, FieldProcessor, a few regression cases, metadata access and
    OP_SCALE_WEIGHT.

    The helpers ``expect``, ``expect_query``, ``expect_exception``,
    ``context`` and ``TestFail`` are defined outside this function. The
    checks are order dependent: later sections reuse ``db``, ``enq``,
    ``stem`` and ``qp`` created by earlier ones.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print("Unhandled constants: ", res)
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    #
    # Python 3.5 generates a different exception message here to earlier
    # versions, so we need a check which matches both.
    expect_exception(AttributeError,
                     lambda msg: msg.find("has no attribute 'cvar'") != -1,
                     access_cvar)

    stem = xapian.Stem(b"english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    doc.set_data(b"a\0b")
    if doc.get_data() == b"a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), b"a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data(b"is there anybody out there?")
    doc.add_term(b"XYzzy")
    doc.add_posting(stem(b"is"), 1)
    doc.add_posting(stem(b"there"), 2)
    doc.add_posting(stem(b"anybody"), 3)
    doc.add_posting(stem(b"out"), 4)
    doc.add_posting(stem(b"there"), 5)

    # In-memory database holding the document built above; most of the
    # feature tests below run against it.
    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [t.encode('utf-8') for t in terms]),
        "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE,
                          (b"smoke", b"test", b"tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query(b"smoke"), query1, b"string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [s.encode('utf-8') for s in subqs]),
        "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected (or not wrapped
    # in the first cases):

    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb"))
    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb", xapian.DB_OPEN))

    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb", xapian.DB_BACKEND_STUB))
    expect_exception(
        xapian.DatabaseOpeningError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_OPEN | xapian.DB_BACKEND_STUB))

    expect_exception(
        xapian.DatabaseOpeningError, None, lambda: xapian.Database(
            b"nosuchdir/nosuchdb", xapian.DB_BACKEND_GLASS))
    expect_exception(
        xapian.DatabaseCreateError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_GLASS))

    expect_exception(
        xapian.FeatureUnavailableError, None, lambda: xapian.Database(
            b"nosuchdir/nosuchdb", xapian.DB_BACKEND_CHERT))
    expect_exception(
        xapian.FeatureUnavailableError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_CHERT))

    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"/bin/false", b"")

    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = b" ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, b"is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist(b"there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(b""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, b"there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to(b'n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < b'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" %
                           x.term.decode('utf-8'))

    # Feature test for Document.values
    count = 0
    for term in list(doc.values()):
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data(b"Two")
    doc.add_posting(stem(b"out"), 1)
    doc.add_posting(stem(b"outside"), 1)
    doc.add_posting(stem(b"source"), 2)
    doc.add_value(0, b"yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == b"yes"

    query = xapian.Query(stem(b"out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith(b'a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith(b'a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>", qp.parse_query,
                     b"test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query(b"NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem(b'en'))
    expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR o) OR Zo@2))")

    expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR outside) OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')),
                 '(foo OR bar\\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')),
                 '(foo OR bar\u00a3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'),
                 '(foo OR bar)')

    expect_query(
        qp.parse_query(b"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\u00e9st@1)")

    doc = xapian.Document()
    doc.set_data(b"Unicode with an acc\xe9nt")
    doc.add_posting(stem(b"out\xe9r"), 1)
    expect(doc.get_data(), b"Unicode with an acc\xe9nt")
    term = next(doc.termlist()).term
    expect(term, b"out\xe9r")

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add(b'a')
    expect(stop(b'a'), True)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == b"b"

        def get_description(self):
            return "my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), "my_b_stopper")
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    expect(stop(b'b'), True)
    expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(b'foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()], [(b'bar', 1, [2]), (b'baz', 1, [3]),
                                          (b'foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query(b'12/03/99..12/04/01')
    expect(str(query), 'Query(0 * VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, b'$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b.encode('utf-8'))
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    # FIXME: This doesn't currently work:
    # expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query(b"I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata(b'Foo'), b'')
    db.set_metadata(b'Foo', b'Foo')
    expect(db.get_metadata(b'Foo'), b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.set_metadata, b'',
                     b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query(b'foo'), 5),
        "5 * foo")
 def test_application_details(self):
     """Build a test index from ./data/desktop and check AppDetails fields.

     The index is written to ./data/test.db, reopened through
     StoreDatabase, and the entry "Ubuntu Software Center Test" is then
     inspected through Application.get_details() and AppDetails. The last
     part checks the $kernel / $distro substitutions done by Application.
     """
     db = xapian.WritableDatabase("./data/test.db",
                                  xapian.DB_CREATE_OR_OVERWRITE)
     res = update_from_app_install_data(db,
                                        self.cache,
                                        datadir="./data/desktop")
     self.assertTrue(res)
     db = StoreDatabase("./data/test.db", self.cache)
     db.open(use_axi=False, use_agent=False)
     self.assertEqual(len(db), 5)
     # test details
     app = Application("Ubuntu Software Center Test", "software-center")
     details = app.get_details(db)
     self.assertNotEqual(details, None)
     self.assertEqual(details.component, "main")
     self.assertEqual(details.pkgname, "software-center")
     # get the first document
     for doc in db:
         if doc.get_data() == "Ubuntu Software Center Test":
             appdetails = AppDetails(db, doc=doc)
             break
     else:
         # Without this the test would go on with the wrong document (or
         # with unbound names) and fail with a misleading error.
         self.fail("no document for 'Ubuntu Software Center Test' in db")
     # test get_appname and get_pkgname
     self.assertEqual(db.get_appname(doc), "Ubuntu Software Center Test")
     self.assertEqual(db.get_pkgname(doc), "software-center")
     # test appdetails
     self.assertEqual(appdetails.name, "Ubuntu Software Center Test")
     self.assertEqual(appdetails.pkgname, "software-center")
     # FIXME: add a desktop file with a real channel to test
     #        and monkey-patch/modify the APP_INSTALL_CHANNELS_PATH
     self.assertEqual(appdetails.channelname, None)
     self.assertEqual(appdetails.channelfile, None)
     self.assertEqual(appdetails.component, "main")
     self.assertNotEqual(appdetails.pkg, None)
     # from the fake test/data/appdetails/var/lib/dpkg/status
     self.assertEqual(appdetails.pkg.is_installed, True)
     self.assertTrue(appdetails.pkg_state in (PkgStates.INSTALLED,
                                              PkgStates.UPGRADABLE))
     # FIXME: test description for unavailable pkg
     self.assertTrue(
         appdetails.description.startswith(
             "Ubuntu Software Center lets you"))
     # FIXME: test appdetails.website
     self.assertEqual(appdetails.icon, "softwarecenter")
     # crude, crude
     self.assertTrue(len(appdetails.version) > 2)
     # FIXME: screenshots will only work on ubuntu
     # Raw strings: "\d" in a normal string literal is an invalid escape
     # sequence (the pattern itself is unchanged).
     self.assertTrue(
         re.match(
             r"http://screenshots.ubuntu.com/screenshot-with-version/software-center/[\d.]+",
             appdetails.screenshot))
     self.assertTrue(
         re.match(
             r"http://screenshots.ubuntu.com/thumbnail-with-version/software-center/[\d.]+",
             appdetails.thumbnail))
     # FIXME: add document that has a price
     self.assertEqual(appdetails.price, '')
     self.assertEqual(appdetails.license, "Open source")
     # test lazy history loading for installation date
     self.ensure_installation_date_and_lazy_history_loading(appdetails)
     # test apturl replacements
     # $kernel
     app = Application("", "linux-headers-$kernel",
                       "channel=$distro-partner")
     self.assertEqual(app.pkgname, 'linux-headers-' + os.uname()[2])
     # $distro
     details = app.get_details(db)
     distro = get_distro().get_codename()
     self.assertEqual(app.request, 'channel=' + distro + '-partner')
    def update_xapiandb(self, kwargs):
        """Update the client-side Xapian index stored at XAPIAN_DB_PATH.

        Two modes, selected by ``kwargs["pkgname"]``:

        * empty string -- ask ``self.premoter`` for every app newer than the
          timestamp kept in value slot 2 of the index's version document,
          re-index the apps that already have a document, add a document
          for those that do not, then stamp the version document with the
          current time and commit.
        * anything else -- index the local .deb at ``kwargs["path"]`` under
          that package name, unless a document with that name exists.

        :param kwargs: dict with key "pkgname" and, in single-package mode,
            key "path".
        :returns: None
        """
        database = xapian.WritableDatabase(XAPIAN_DB_PATH, xapian.DB_OPEN)
        enquire = xapian.Enquire(database)
        indexer = xapian.TermGenerator()

        def debug(*args, **print_kwargs):
            # Every diagnostic in this method is gated on the debug switch.
            if Globals.DEBUG_SWITCH:
                print(*args, **print_kwargs)

        def as_text(value):
            # Document data and values may come back as bytes; normalise to
            # str so comparisons with str names behave as intended.
            if isinstance(value, bytes):
                return value.decode(encoding='utf-8')
            return value

        def index_app(doc, app_name, display_name_cn, keywords_for_search):
            # Attach the search terms of one app to ``doc``.
            indexer.set_document(doc)
            doc.add_term(app_name, 10)
            if keywords_for_search != "None":
                keywords = display_name_cn + ";" + keywords_for_search + ";" + app_name
            else:
                keywords = display_name_cn + ";" + app_name
            indexer.index_text(keywords, 10)

            # mmseg is optional and best effort. It is imported here so the
            # segmented terms are added on both the "modify" and the "add"
            # path; before, the add path used the name without importing it.
            try:
                from mmseg.search import seg_txt_2_dict
                for word, _ in seg_txt_2_dict(keywords).items():
                    if word != "none":
                        doc.add_term(word, 10)
            except Exception:
                debug("----No mmseg model---")

        if "" == kwargs["pkgname"]:
            modified_num = 0
            add_num = 0
            xapiandb_update = False

            # Look up the version document. These stay None when the index
            # has none, instead of being left unbound.
            the_latest_update_time = None
            docid_for_xapiandb_version = None
            doc_for_xapiandb_version = None
            enquire.set_query(xapian.Query("the_#ukxapiandb#_version"))
            for match in enquire.get_mset(0, 1):
                doc_for_xapiandb_version = match.document
                docid_for_xapiandb_version = doc_for_xapiandb_version.get_docid()
                if "XAPIANDB_VERSION" == as_text(doc_for_xapiandb_version.get_data()):
                    # valueslot:2 xapiandb update time
                    the_latest_update_time = as_text(
                        doc_for_xapiandb_version.get_value(2))
            if the_latest_update_time is None:
                the_latest_update_time = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime())
                debug("Failed to get the latest update time from client xapiandb,use default time.localtime()")
            reslist = self.premoter.newerapp_for_xapianupdate(the_latest_update_time)

            for app in reslist:
                app_name = str(app["app_name"])
                display_name_cn = str(app["display_name_cn"])
                keywords_for_search = str(app["keywords_for_search"])

                enquire.set_query(xapian.Query(app_name))
                # Count on the writable handle so documents added earlier in
                # this run are included in the result limit.
                matches = enquire.get_mset(0, database.get_doccount())
                found = False
                for match in matches:
                    if as_text(match.document.get_data()) != app_name:
                        continue
                    # Existing document for this app: rebuild its terms.
                    found = True
                    doc = match.document
                    doc.clear_terms()
                    index_app(doc, app_name, display_name_cn, keywords_for_search)
                    database.replace_document(match.docid, doc)
                    xapiandb_update = True
                    modified_num = modified_num + 1

                if not found:
                    # No document yet: create one whose data is the app name.
                    doc = xapian.Document()
                    doc.set_data(app_name)
                    index_app(doc, app_name, display_name_cn, keywords_for_search)
                    database.add_document(doc)
                    add_num = add_num + 1
                    xapiandb_update = True
                    if (Globals.DEBUG_SWITCH):
                        print("App:", doc.get_data(), "  ", "terms:", end=' ')
                        for itr in doc.termlist():
                            print(itr.term, end=' ')
                        print("  ")

            if xapiandb_update:
                try:
                    if doc_for_xapiandb_version is not None:
                        now = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime())
                        doc_for_xapiandb_version.add_value(2, now)
                        database.replace_document(docid_for_xapiandb_version, doc_for_xapiandb_version)
                    database.commit()
                    debug("Xapiandb has updated . %d app modified, %d app add.  Tatal: %d app updated" % (modified_num, add_num, len(reslist)))
                except Exception:
                    # Report the configured path rather than a hard-coded one.
                    debug("The xapian database (%s) is crashed,please remove it and install a new one!" % XAPIAN_DB_PATH)
            debug("update uksc xapiandb over")

        else:
            pkgname = kwargs["pkgname"]
            enquire.set_query(xapian.Query(pkgname))
            for match in enquire.get_mset(0, database.get_doccount()):
                # Decode before comparing: bytes never equal a str name, so
                # without this an already indexed package is indexed again.
                if pkgname == as_text(match.document.get_data()):
                    return

            doc = xapian.Document()
            doc.set_data(pkgname)
            doc.add_term(pkgname, 10)
            debug("debfile path:", kwargs["path"])

            deb = DebFile(kwargs["path"])
            terms = pkgname
            try:
                terms = terms + " " + deb.description
            except Exception:
                # Best effort: index the package name alone.
                debug("Failed to get app description")
            indexer.set_document(doc)
            indexer.index_text(terms)
            database.add_document(doc)
            database.commit()
            if (Globals.DEBUG_SWITCH):
                print("update xapiandb over: ", pkgname, "terms:", end=' ')
                for itr in doc.termlist():
                    print(itr.term, end=' ')
                print(" ")
        else:
            # create a new database
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            try:
                # create the parent directory if it does not exist
                parent_path = os.path.dirname(self.location)
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except IOError, err_msg:
                raise OSError("Indexer: failed to create the parent " \
                        + "directory (%s) of the indexing database: %s" \
                        % (parent_path, str(err_msg)))
            try:
                self.writer = xapian.WritableDatabase(self.location,
                                                      xapian.DB_CREATE_OR_OPEN)
                self.flush()
            except xapian.DatabaseOpeningError, err_msg:
                raise OSError("Indexer: failed to open or create a xapian " \
                        + "database (%s): %s" % (self.location, str(err_msg)))

    def __del__(self):
        """Drop the reader reference and close the writer on destruction."""
        self.reader = None
        self._writer_close()

    def flush(self, optimize=False):
        """force to write the current changes to disk immediately

        @param optimize: ignored for xapian
        @type optimize: bool
        """
## Usage: export XAPIAN_FLUSH_THRESHOLD=200000; python index_alldoc.py
import xapian
import time
# Input directory with the wiki text files and location of the index to build.
DATA_FILEPATH = "/Users/neesergparajuli/Dropbox/Webtext/Data/wiki-pages-text/"
DATABASE_FILEPATH = "/Users/neesergparajuli/Dropbox/Webtext/Data/XxapianDatabase"
# Wall-clock start of the whole run.
start = time.time()
db = xapian.WritableDatabase(DATABASE_FILEPATH, xapian.DB_CREATE_OR_OPEN)

# Term generator configured with the "en" stemmer.
termgenerator = xapian.TermGenerator()
termgenerator.set_stemmer(xapian.Stem("en"))

# Input files are named wiki-001.txt through wiki-109.txt.
for i in range(1, 110):
    st = "wiki-{:03d}.txt".format(i)
    # Start time of this file's processing.
    cyclestart = time.time()
    print(st)
    j = 0
    with open(DATA_FILEPATH + st) as file:

        # Create the database
        for line in file:
            words = line.split(' ')

            # Extract the title from the id: the first space-separated field
            # of the line, with '_' and '-' replaced by spaces.

            id1 = words[0]
            title = id1.split('_')
            title = ' '.join(title)
            title = title.split('-')
            title = ' '.join(title)

            # Check fact number is given and create doc ID
Exemple #26
0
 def __init__(self, root):
     """Remember ``root`` and open the Xapian database stored there.

     The database is opened writable and created if it does not exist yet
     (DB_CREATE_OR_OPEN); the handle is kept on ``self.db``.
     """
     self.root = root
     self.db = xapian.WritableDatabase(self.root, xapian.DB_CREATE_OR_OPEN)