def update_xapiandb(self, kwargs):
    """Update the UKSC xapian search database.

    Behaviour depends on ``kwargs["pkgname"]``:

    * ``""`` -- full refresh: fetch apps newer than the timestamp stored in
      the special "XAPIANDB_VERSION" document, add/replace their documents,
      then bump that timestamp (value slot 2) and commit.
    * non-empty -- index a single local deb file (``kwargs["path"]``) under
      the given package name, unless a document for it already exists.
    """
    database = xapian.WritableDatabase(XAPIAN_DB_PATH, xapian.DB_OPEN)
    DB = xapian.Database(XAPIAN_DB_PATH)
    enquire = xapian.Enquire(database)
    indexer = xapian.TermGenerator()

    if "" == kwargs["pkgname"]:
        modified_num = 0
        add_num = 0
        xapiandb_update = "No"

        # Defaults for the case where the version document is missing:
        # previously these names could be referenced without ever having
        # been assigned (NameError, masked only by the bare except below).
        the_latest_update_time = time.strftime('%Y-%m-%dT%H:%M:%S',
                                               time.localtime())
        docid_for_xapiandb_version = None
        doc_for_xapiandb_version = None

        query_xapiandb_version = xapian.Query("the_#ukxapiandb#_version")
        enquire.set_query(query_xapiandb_version)
        matches = enquire.get_mset(0, 1)
        # Loop variable renamed from `re` so it no longer shadows the
        # standard `re` module used elsewhere in this file.
        for hit in matches:
            docid_for_xapiandb_version = hit.document.get_docid()
            doc_for_xapiandb_version = hit.document
            doc_data = doc_for_xapiandb_version.get_data()
            if isinstance(doc_data, bytes):
                doc_data = doc_data.decode(encoding='utf-8')
            if "XAPIANDB_VERSION" == doc_data:
                # value slot 2 holds the xapiandb update time
                the_latest_update_time = doc_for_xapiandb_version.get_value(2)
                if isinstance(the_latest_update_time, bytes):
                    the_latest_update_time = the_latest_update_time.decode(
                        encoding='utf-8')
            else:
                # Keep the time.localtime() default initialised above.
                if Globals.DEBUG_SWITCH:
                    print(
                        "Failed to get the latest update time from client xapiandb,use default time.localtime()"
                    )

        # Ask the promoter server for every app changed since that time.
        reslist = self.premoter.newerapp_for_xapianupdate(
            the_latest_update_time)

        for app in reslist:
            app_name = str(app["app_name"])
            display_name_cn = str(app["display_name_cn"])
            keywords_for_search = str(app["keywords_for_search"])

            query = xapian.Query(app_name)
            enquire.set_query(query)
            doccount = DB.get_doccount()
            matches = enquire.get_mset(0, doccount)
            if matches.size() != 0:
                # App may already be indexed: replace its terms in place.
                for hit in matches:
                    get_name = hit.document.get_data()
                    if isinstance(get_name, bytes):
                        get_name = get_name.decode(encoding='utf-8')
                    if get_name == app_name:
                        docid = hit.docid
                        doc = hit.document
                        doc.clear_terms()
                        indexer.set_document(doc)
                        doc.add_term(app_name, 10)
                        if keywords_for_search != "None":
                            keywords = display_name_cn + ";" + keywords_for_search + ";" + app_name
                        else:
                            keywords = display_name_cn + ";" + app_name
                        indexer.index_text(keywords, 10)

                        try:
                            # mmseg (Chinese word segmentation) is optional.
                            from mmseg.search import seg_txt_2_dict
                            for word, value in seg_txt_2_dict(
                                    keywords).items():
                                if word != "none":
                                    doc.add_term(word, 10)
                        except Exception:
                            if Globals.DEBUG_SWITCH:
                                print("----No mmseg model---")

                        database.replace_document(docid, doc)
                        xapiandb_update = "Yes"
                        modified_num = modified_num + 1
                    else:
                        continue
            else:
                # Brand-new app: build a fresh document.
                doc = xapian.Document()
                doc.set_data(app_name)
                doc.add_term(app_name, 10)
                indexer.set_document(doc)
                if keywords_for_search != "None":
                    keywords = display_name_cn + ";" + keywords_for_search + ";" + app_name
                else:
                    keywords = display_name_cn + ";" + app_name
                indexer.index_text(keywords, 10)

                try:
                    # BUG FIX: import mmseg here too -- this branch
                    # previously relied on the import in the update branch
                    # having run first, so newly added apps silently lost
                    # their segmented keywords (NameError swallowed below).
                    from mmseg.search import seg_txt_2_dict
                    for word, value in seg_txt_2_dict(keywords).items():
                        if word != "none":
                            doc.add_term(word, 10)
                except Exception:
                    pass
                database.add_document(doc)
                add_num = add_num + 1
                if Globals.DEBUG_SWITCH:
                    print("App:", doc.get_data(), "  ", "terms:", end=' ')
                for itr in doc.termlist():
                    if Globals.DEBUG_SWITCH:
                        print(itr.term, end=' ')
                xapiandb_update = "Yes"
                if Globals.DEBUG_SWITCH:
                    print("  ")

        try:
            if xapiandb_update == "Yes":
                # Record the new update time in the version document and
                # commit everything in one shot.
                now = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime())
                doc_for_xapiandb_version.add_value(2, now)
                database.replace_document(docid_for_xapiandb_version,
                                          doc_for_xapiandb_version)
                database.commit()
                if Globals.DEBUG_SWITCH:
                    print(
                        "Xapiandb has updated . %d app modified, %d app add.  Tatal: %d app updated"
                        % (modified_num, add_num, len(reslist)))
        except Exception:
            if Globals.DEBUG_SWITCH:
                print(
                    "The xapian database (/home/ice_bird/.cache/uksc/xapiandb) is crashed,please remove it and install a new one!"
                )
        if Globals.DEBUG_SWITCH:
            print("update uksc xapiandb over")

    else:
        # Single-package mode: bail out if this pkgname is already indexed.
        appinfo_query = xapian.Query(kwargs["pkgname"])
        enquire.set_query(appinfo_query)
        matches = enquire.get_mset(0, DB.get_doccount())
        for hit in matches:
            doc_for_appinfo = hit.document
            doc_data = doc_for_appinfo.get_data()
            # BUG FIX: decode bytes before comparing, consistent with the
            # handling above -- otherwise the duplicate check never matched
            # on Python 3 and duplicates were re-added.
            if isinstance(doc_data, bytes):
                doc_data = doc_data.decode(encoding='utf-8')
            if kwargs["pkgname"] == doc_data:
                return

        doc = xapian.Document()
        doc.set_data(kwargs["pkgname"])
        doc.add_term(kwargs["pkgname"], 10)
        if Globals.DEBUG_SWITCH:
            print("debfile path:", kwargs["path"])

        deb = DebFile(kwargs["path"])
        terms = kwargs["pkgname"]
        try:
            terms = terms + " " + deb.description
        except Exception:
            if Globals.DEBUG_SWITCH:
                print("Failed to get app description")
        indexer.set_document(doc)
        indexer.index_text(terms)
        database.add_document(doc)
        database.commit()
        if Globals.DEBUG_SWITCH:
            print("update xapiandb over: ",
                  kwargs["pkgname"],
                  "terms:",
                  end=' ')
        for itr in doc.termlist():
            if Globals.DEBUG_SWITCH:
                print(itr.term, end=' ')
        if Globals.DEBUG_SWITCH:
            print(" ")
def make_doc_from_parser(parser, cache):
    """Build a xapian Document from a desktop-file *parser* and an apt *cache*.

    Reads the X-AppInstall-* keys (plus Name/Comment/Icon/Type/Categories)
    from the parser and turns them into prefixed terms and value slots.
    Returns the Document, or None when the entry must be skipped
    (X-AppInstall-Ignore, unsupported architecture, or region restriction).
    """
    # XXX 2012-01-19 michaeln I'm just pulling this code out from
    # index_app_info_from_parser, but it'd be great to further
    # refactor it - it looks quite scary :-)
    doc = xapian.Document()
    # app name is the data
    if parser.has_option_desktop("X-Ubuntu-Software-Center-Name"):
        name = parser.get_desktop("X-Ubuntu-Software-Center-Name")
        untranslated_name = parser.get_desktop("X-Ubuntu-Software-Center-Name",
            translated=False)
    elif parser.has_option_desktop("X-GNOME-FullName"):
        name = parser.get_desktop("X-GNOME-FullName")
        untranslated_name = parser.get_desktop("X-GNOME-FullName",
            translated=False)
    else:
        name = parser.get_desktop("Name")
        untranslated_name = parser.get_desktop("Name", translated=False)

    doc.set_data(name)
    doc.add_value(XapianValues.APPNAME_UNTRANSLATED, untranslated_name)

    # check if we should ignore this file
    if parser.has_option_desktop("X-AppInstall-Ignore"):
        ignore = parser.get_desktop("X-AppInstall-Ignore")
        if ignore.strip().lower() == "true":
            LOG.debug("X-AppInstall-Ignore found for '%s'" % parser.desktopf)
            return
    # architecture
    pkgname_extension = ''
    if parser.has_option_desktop("X-AppInstall-Architectures"):
        arches = parser.get_desktop("X-AppInstall-Architectures")
        doc.add_value(XapianValues.ARCHIVE_ARCH, arches)
        native_archs = get_current_arch() in arches.split(',')
        foreign_archs = list(set(arches.split(',')) &
            set(get_foreign_architectures()))
        # skip apps not available for any arch we run
        if not (native_archs or foreign_archs):
            return
        if not native_archs and foreign_archs:
            pkgname_extension = ':' + foreign_archs[0]
    # package name
    pkgname = parser.get_desktop("X-AppInstall-Package") + pkgname_extension
    doc.add_term("AP" + pkgname)
    if '-' in pkgname:
        # we need this to work around xapian oddness
        doc.add_term(pkgname.replace('-', '_'))
    doc.add_value(XapianValues.PKGNAME, pkgname)
    doc.add_value(XapianValues.DESKTOP_FILE, parser.desktopf)
    # display name
    if "display_name" in axi_values:
        doc.add_value(axi_values["display_name"], name)
    # cataloged_times
    if "catalogedtime" in axi_values:
        if pkgname in cataloged_times:
            doc.add_value(axi_values["catalogedtime"],
                          xapian.sortable_serialise(cataloged_times[pkgname]))
    # pocket (main, restricted, ...)
    if parser.has_option_desktop("X-AppInstall-Section"):
        archive_section = parser.get_desktop("X-AppInstall-Section")
        doc.add_term("AS" + archive_section)
        doc.add_value(XapianValues.ARCHIVE_SECTION, archive_section)
    # section (mail, base, ..)
    if pkgname in cache and cache[pkgname].candidate:
        section = cache[pkgname].section
        doc.add_term("AE" + section)
    # channel (third party stuff)
    if parser.has_option_desktop("X-AppInstall-Channel"):
        archive_channel = parser.get_desktop("X-AppInstall-Channel")
        doc.add_term("AH" + archive_channel)
        doc.add_value(XapianValues.ARCHIVE_CHANNEL, archive_channel)
    # signing key (third party)
    if parser.has_option_desktop("X-AppInstall-Signing-Key-Id"):
        keyid = parser.get_desktop("X-AppInstall-Signing-Key-Id")
        doc.add_value(XapianValues.ARCHIVE_SIGNING_KEY_ID, keyid)
    # license (third party); local renamed so it no longer shadows the
    # `license` builtin
    if parser.has_option_desktop("X-AppInstall-License"):
        license_string = parser.get_desktop("X-AppInstall-License")
        doc.add_value(XapianValues.LICENSE, license_string)
    # date published
    if parser.has_option_desktop("X-AppInstall-Date-Published"):
        date_published = parser.get_desktop("X-AppInstall-Date-Published")
        # BUG FIX: raw string for the regex (invalid escape sequences in a
        # plain string literal)
        if (date_published and
            re.match(r"\d+-\d+-\d+ \d+:\d+:\d+", date_published)):
            # strip the subseconds from the end of the published date string
            date_published = str(date_published).split(".")[0]
            doc.add_value(XapianValues.DATE_PUBLISHED,
                          date_published)
            # we use the date published value for the cataloged time as well
            if "catalogedtime" in axi_values:
                # BUG FIX: log the parsed value itself; the old code called
                # parser.get_desktop("date_published"), which is not a real
                # desktop key
                LOG.debug(
                        ("pkgname: %s, date_published cataloged time is: %s" %
                             (pkgname, date_published)))
                date_published_sec = time.mktime(
                                        time.strptime(date_published,
                                                      "%Y-%m-%d  %H:%M:%S"))
                doc.add_value(axi_values["catalogedtime"],
                              xapian.sortable_serialise(date_published_sec))
    # purchased date
    if parser.has_option_desktop("X-AppInstall-Purchased-Date"):
        date = parser.get_desktop("X-AppInstall-Purchased-Date")
        # strip the subseconds from the end of the date string
        doc.add_value(XapianValues.PURCHASED_DATE, str(date).split(".")[0])
    # deb-line (third party)
    if parser.has_option_desktop("X-AppInstall-Deb-Line"):
        debline = parser.get_desktop("X-AppInstall-Deb-Line")
        doc.add_value(XapianValues.ARCHIVE_DEB_LINE, debline)
    # license key (third party)
    if parser.has_option_desktop("X-AppInstall-License-Key"):
        key = parser.get_desktop("X-AppInstall-License-Key")
        doc.add_value(XapianValues.LICENSE_KEY, key)
    # license keypath (third party)
    if parser.has_option_desktop("X-AppInstall-License-Key-Path"):
        path = parser.get_desktop("X-AppInstall-License-Key-Path")
        doc.add_value(XapianValues.LICENSE_KEY_PATH, path)
    # PPA (third party stuff)
    if parser.has_option_desktop("X-AppInstall-PPA"):
        archive_ppa = parser.get_desktop("X-AppInstall-PPA")
        if archive_ppa:
            doc.add_value(XapianValues.ARCHIVE_PPA, archive_ppa)
            # add archive origin data here so that its available even if
            # the PPA is not (yet) enabled
            doc.add_term("XOO" + "lp-ppa-%s" % archive_ppa.replace("/", "-"))
    # screenshot (for third party)
    if parser.has_option_desktop("X-AppInstall-Screenshot-Url"):
        url = parser.get_desktop("X-AppInstall-Screenshot-Url")
        doc.add_value(XapianValues.SCREENSHOT_URLS, url)
    # thumbnail (for third party)
    if parser.has_option_desktop("X-AppInstall-Thumbnail-Url"):
        url = parser.get_desktop("X-AppInstall-Thumbnail-Url")
        doc.add_value(XapianValues.THUMBNAIL_URL, url)
    # video support (for third party mostly)
    if parser.has_option_desktop("X-AppInstall-Video-Url"):
        url = parser.get_desktop("X-AppInstall-Video-Url")
        doc.add_value(XapianValues.VIDEO_URL, url)
    # icon (for third party)
    if parser.has_option_desktop("X-AppInstall-Icon-Url"):
        url = parser.get_desktop("X-AppInstall-Icon-Url")
        doc.add_value(XapianValues.ICON_URL, url)
        if not parser.has_option_desktop("X-AppInstall-Icon"):
            # prefix pkgname to avoid name clashes
            doc.add_value(XapianValues.ICON, "%s-icon-%s" % (
                    pkgname, os.path.basename(url)))

    # price (pay stuff)
    if parser.has_option_desktop("X-AppInstall-Price"):
        price = parser.get_desktop("X-AppInstall-Price")
        doc.add_value(XapianValues.PRICE, price)
        # since this is a commercial app, indicate it in the component value
        doc.add_value(XapianValues.ARCHIVE_SECTION, "commercial")
    # support url (mainly pay stuff)
    if parser.has_option_desktop("X-AppInstall-Support-Url"):
        url = parser.get_desktop("X-AppInstall-Support-Url")
        doc.add_value(XapianValues.SUPPORT_SITE_URL, url)
    # icon
    if parser.has_option_desktop("Icon"):
        icon = parser.get_desktop("Icon")
        doc.add_value(XapianValues.ICON, icon)
    # write out categories
    for cat in parser.get_desktop_categories():
        doc.add_term("AC" + cat.lower())
    categories_string = ";".join(parser.get_desktop_categories())
    doc.add_value(XapianValues.CATEGORIES, categories_string)
    for mime in parser.get_desktop_mimetypes():
        doc.add_term("AM" + mime.lower())
    # get type (to distinguish between apps and packages); local renamed so
    # it no longer shadows the `type` builtin
    if parser.has_option_desktop("Type"):
        desktop_type = parser.get_desktop("Type")
        doc.add_term("AT" + desktop_type.lower())
    # check gettext domain
    if parser.has_option_desktop("X-Ubuntu-Gettext-Domain"):
        domain = parser.get_desktop("X-Ubuntu-Gettext-Domain")
        doc.add_value(XapianValues.GETTEXT_DOMAIN, domain)
    # Description (software-center extension)
    if parser.has_option_desktop("X-AppInstall-Description"):
        descr = parser.get_desktop("X-AppInstall-Description")
        doc.add_value(XapianValues.SC_DESCRIPTION, descr)
    if parser.has_option_desktop("Supported-Distros"):
        doc.add_value(XapianValues.SC_SUPPORTED_DISTROS,
            json.dumps(parser.get_desktop("Supported-Distros")))
    # version support (for e.g. the scagent)
    if parser.has_option_desktop("X-AppInstall-Version"):
        ver = parser.get_desktop("X-AppInstall-Version")
        doc.add_value(XapianValues.VERSION_INFO, ver)

    # (deb)tags (in addition to the pkgname debtags
    if parser.has_option_desktop("X-AppInstall-Tags"):
        # register tags
        tags_string = parser.get_desktop("X-AppInstall-Tags")
        if tags_string:
            tags = [tag.strip().lower() for tag in tags_string.split(",")]
            for tag in tags:
                doc.add_term("XT" + tag)
            region = get_region_cached()
            if region:
                # ENFORCE region blacklist/whitelist by not registering
                #          the app at all
                countrycode = region["countrycode"].lower()
                if "%s%s" % (REGION_BLACKLIST_TAG, countrycode) in tags:
                    LOG.info("skipping region restricted app: '%s'"
                             " (blacklisted) " % name)
                    return
                # whitelist
                for tag in tags:
                    if (tag.startswith(REGION_WHITELIST_TAG) and not
                        "%s%s" % (REGION_WHITELIST_TAG, countrycode) in tag):
                        LOG.info("skipping region restricted app: '%s'"
                                 " (not whitelisted)" % name)
                        return

    # popcon
    # FIXME: popularity not only based on popcon but also
    #        on archive section, third party app etc
    if parser.has_option_desktop("X-AppInstall-Popcon"):
        popcon = float(parser.get_desktop("X-AppInstall-Popcon"))
        # sort_by_value uses string compare, so we need to pad here
        doc.add_value(XapianValues.POPCON,
                      xapian.sortable_serialise(popcon))
        global popcon_max
        popcon_max = max(popcon_max, popcon)

    # comment goes into the summary data if there is one,
    # other wise we try GenericName and if nothing else,
    # the summary of the package
    if parser.has_option_desktop("Comment"):
        s = parser.get_desktop("Comment")
        doc.add_value(XapianValues.SUMMARY, s)
    elif parser.has_option_desktop("GenericName"):
        s = parser.get_desktop("GenericName")
        if s != name:
            doc.add_value(XapianValues.SUMMARY, s)
    elif pkgname in cache and cache[pkgname].candidate:
        s = cache[pkgname].candidate.summary
        doc.add_value(XapianValues.SUMMARY, s)

    return doc
# Example #3 (score: 0)
def gen_search_index(db_session, namespace):
    """Build/refresh the xapian full-text index for one namespace.

    Indexes every Message with an id greater than the database's last docid:
    the stripped plaintext body becomes the document data, subject/from/to/
    cc/bcc are indexed as both unprefixed and prefixed (XSUBJECT, XFROM, ...)
    terms, and the message's internal date goes into value slot 0 for
    sorting. Finally deletes index entries for messages no longer present.
    """
    log.info("Generating search index for namespace {0}".format(namespace.id))
    dbpath = db_path_for(namespace.id)
    mkdirp(dbpath)
    database = x_.WritableDatabase(dbpath, x_.DB_CREATE_OR_OPEN)

    indexer = x_.TermGenerator()
    stemmer = x_.Stem("english")
    indexer.set_stemmer(stemmer)
    indexer.set_database(database)
    indexer.set_flags(indexer.FLAG_SPELLING)

    # Message ids double as xapian docids, so we can resume incrementally.
    last_docid = database.get_lastdocid()
    msg_query = db_session.query(Message).filter(
            Message.namespace_id == namespace.id,
            Message.id > last_docid).options(joinedload('parts')) \
                    .order_by(Message.id.desc())
    log.info("Have {0} messages to process".format(msg_query.count()))

    # for each message part, create unprocessed documents with date/subject/to/from
    # metadata and the plaintext part, and then process them!
    total = msg_query.count()
    done = 0
    for msg in msg_query.yield_per(1000):
        text = strip_tags(msg.sanitized_body)

        # XXX also index attachments (add a 'type' field or something to
        # differentiate)

        if text is not None:
            doc = x_.Document()
            doc.set_data(text)

            indexer.set_document(doc)

            # NOTE: the integer here is a multiplier on the term frequency
            # (used for calculating relevance). We add terms with and without
            # a field prefix, so documents are returned on a generic search
            # *and* when fields are specifically searched for, e.g. to:[email protected]
            if msg.subject is not None:
                indexer.index_text(msg.subject, 10)
                indexer.index_text(msg.subject, 10, 'XSUBJECT')
            if msg.from_addr is not None:
                from_ = to_indexable(msg.from_addr)
                indexer.index_text(from_, 1)
                indexer.index_text(from_, 1, 'XFROM')
            if msg.to_addr is not None:
                to = ' '.join(
                    [to_indexable(parsed_addr) for parsed_addr in msg.to_addr])
                indexer.index_text(to, 5)
                indexer.index_text(to, 5, 'XTO')
            if msg.cc_addr is not None:
                cc = ' '.join(
                    [to_indexable(parsed_addr) for parsed_addr in msg.cc_addr])
                indexer.index_text(cc, 3)
                indexer.index_text(cc, 3, 'XCC')
            if msg.bcc_addr is not None:
                bcc = ' '.join([
                    to_indexable(parsed_addr) for parsed_addr in msg.bcc_addr
                ])
                indexer.index_text(bcc, 3)
                indexer.index_text(bcc, 3, 'XBCC')
            # "Values" are other data that you can use for e.g. sorting by
            # date
            doc.add_value(
                0,
                x_.sortable_serialise(timegm(msg.internaldate.utctimetuple())))
            database.replace_document(msg.id, doc)

        done += 1
        # BUG FIX: force float arithmetic -- under Python 2 the original
        # done / total * 100 truncated to 0% for every message but the last.
        log.info("Indexed %i of %i (%.2f%%)" %
                 (done, total, done * 100.0 / total))

    # Prune index entries whose messages are gone.
    indexed_msgs = set(database.metadata_keys())
    msgs = {
        id for id, in db_session.query(distinct(Message.id)).filter_by(
            id=namespace.id)
    }
    # BUG FIX: `msgs` was a list and set - list raises TypeError; both sides
    # are sets now. NOTE(review): filter_by(id=namespace.id) compares
    # Message.id against the namespace id, which looks wrong -- presumably
    # namespace_id=namespace.id was intended; confirm against the schema.
    to_delete = indexed_msgs - msgs
    log.info("{0} documents to remove...".format(len(to_delete)))

    for msg_id in to_delete:
        database.delete_document(msg_id)

    database.close()
    log.info("done.")
 def setUp(self):
     """Prepare a PkgMatchDecider for a fixed package list and a blank doc."""
     self.decider = PkgMatchDecider(["gimp", "eog", "inkscape"])
     self.doc = xapian.Document()
# Example #5 (score: 0)
def test_all():
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3, 'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1', 'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print "Unhandled constants: ", res
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError, "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b", "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR, (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected:

    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.Database, "nosuchdir/nosuchdb", xapian.DB_BACKEND_STUB)
    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.WritableDatabase, "nosuchdir/nosuchdb", xapian.DB_OPEN|xapian.DB_BACKEND_STUB)

    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "/bin/false", "")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "/bin/false", "")

    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)

    # Check Xapian::BAD_VALUENO is wrapped suitably.
    enq.set_collapse_key(xapian.BAD_VALUENO)

    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2, "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found", db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1, "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith('a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(), "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True, "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError, "Syntax: <expression> AND <expression>", qp.parse_query, "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(qp.parse_query("NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(qp.parse_query("foo ox", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM ox OR Zox@2))")

    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM outside OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')

    expect_query(qp.parse_query(u"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT Zt\xc3\xa9st@1)")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    term = doc.termlist().next().term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test SimpleStopper initialised from a file.
    try:
        srcdir = os.environ['srcdir']
    except KeyError:
        srcdir = '.'
    stop = xapian.SimpleStopper(srcdir + '/../shortstop.list')
    expect(stop('a'), True)
    expect(stop('am'), False)
    expect(stop('an'), True)
    expect(stop('the'), True)

    expect_exception(xapian.InvalidArgumentError, None, xapian.SimpleStopper, 'nosuchfile')

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer]) for item in doc.termlist()], [('bar', 1, [2]), ('baz', 1, [3]), ('foo', 2, [1, 4])])


    # Check DateRangeProcessor works
    context("checking that DateRangeProcessor works")
    qp = xapian.QueryParser()
    rpdate = xapian.DateRangeProcessor(1, xapian.RP_DATE_PREFER_MDY, 1960)
    qp.add_rangeprocessor(rpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")
    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam2', testfieldprocessor(), False) # Old-style
    qp.add_boolean_prefix('boolspam3', testfieldprocessor(), '')
    qp.add_boolean_prefix('boolspam4', testfieldprocessor(), 'group')
    qp.add_boolean_prefix('boolspam5', testfieldprocessor(), None)
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug fixed in 1.4.4:
    # https://bugs.debian.org/849722
    oqparser.add_boolean_prefix('tag', 'K', '')
    # Make sure other cases also work:
    oqparser.add_boolean_prefix('zag', 'XR', False) # Old-style
    oqparser.add_boolean_prefix('rag', 'XR', None)
    oqparser.add_boolean_prefix('nag', 'XB', '')
    oqparser.add_boolean_prefix('bag', 'XB', 'blergh')
    oqparser.add_boolean_prefix('gag', 'XB', u'blergh')
    oqparser.add_boolean_prefix('jag', 'XB', b'blergh')

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.set_metadata, '', 'Foo')
    expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query('foo'), 5),
                 "5 * foo")
Exemple #6
0
    def _create_document(self, package, old_doc=None):
        """Build and store a Xapian document for *package*.

        The package name and owner get exact-match ``EX__...__EX`` terms,
        and the name is indexed repeatedly to boost its search weight.
        Sub-packages are folded into the same document.  When *old_doc* is
        given, it is deleted after the new document is added (add+delete is
        used instead of replace_document to avoid duplicate documents).
        """
        document = xapian.Document()
        self.indexer.set_document(document)

        name = filter_search_string(package['name'])
        summary = filter_search_string(package['summary'])
        description = filter_search_string(package['description'])
        owner = filter_search_string(package['devel_owner'])

        # Heavily weighted exact-match markers for name and owner.
        self.indexer.index_text_without_positions('EX__' + name + '__EX', 10, '')
        self.indexer.index_text_without_positions('EX__' + owner + '__EX', 10, '')

        # Boost the package name: index it (and, when the name has several
        # underscore-separated parts, each part) twenty times over.  The
        # multi-part check is loop-invariant, so it is hoisted.
        parts = name.split('_')
        has_multiple_parts = len(parts) > 1
        for _ in range(20):
            if has_multiple_parts:
                for piece in parts:
                    self.indexer.index_text_without_positions(piece)
            self.indexer.index_text_without_positions(name, 10, '')

        # The summary counts four times as much as the description.
        for _ in range(4):
            self.indexer.index_text_without_positions(summary)
        self.indexer.index_text_without_positions(description)

        self.index_files_of_interest(document, package)

        for sub_package in package['sub_pkgs']:
            sub_name = filter_search_string(sub_package['name'])
            log.info("       indexing subpackage %s" % sub_package['name'])

            self.indexer.index_text_without_positions(sub_name)
            self.indexer.index_text_without_positions(
                'EX__' + sub_name + '__EX', 10, '')

            self.index_files_of_interest(document, sub_package)

            # Set special sub-package icon if appstream has one.
            sub_package['icon'] = self.icon_cache.get(
                sub_package['name'], self.default_icon)

            # If the parent has a dull icon, promote the sub-package's one.
            if sub_package['icon'] != self.default_icon \
                    and package['icon'] == self.default_icon:
                package['icon'] = sub_package['icon']

            # Drop what we don't want serialized into the document data.
            del sub_package['package']

        # NOTE: provides/requires are not indexed here (only the JSON data
        # below is stored).

        # Drop what we don't want serialized, then store as JSON.
        del package['package']

        document.set_data(json.dumps(package))

        # xapian's replace_document still allocates a fresh document, so we
        # add the new one first and then delete the old to avoid duplicates.
        self.db.add_document(document)
        if old_doc is not None:
            self.db.delete_document(old_doc.get_docid())
        self.db.commit()
Exemple #7
0
    def update(self,
               documents=None,
               after_index=None,
               per_page=10000,
               commit_each=False):
        """
        Update the database with the documents.
        There are some default value and terms in a document:
         * Values:
           1. Used to store the ID of the document
           2. Store the model of the object (in the string format, like
              "project.app.model")
           3. Store the indexer descriptor (module path)
           4..10. Free

         * Terms
           UID: Used to store the ID of the document, so we can replace
                the document by the ID

        :param documents: iterable of model instances to index; when None,
            every object of the model is (re)indexed.
        :param after_index: optional callback invoked with each object after
            it has been written to the index.
        :param per_page: number of objects processed per transaction page.
        :param commit_each: when True, commit after every object instead of
            once per page.
        """
        # Open Xapian Database for writing.
        database = self._db.open(write=True)

        # Default to reindexing the whole model when no explicit document
        # set is given.
        if documents is None:
            update_queue = self._model.objects.all()
        else:
            update_queue = documents

        commiter = Commiter.create(commit_each)(
            database.begin_transaction,
            database.commit_transaction,
            database.cancel_transaction)

        # Process the queue page by page so large querysets need not be
        # held in memory all at once.
        for page in paginate(update_queue, per_page):
            try:
                commiter.begin_page()

                for obj in page.object_list:
                    commiter.begin_object()

                    try:
                        # Objects rejected by the trigger are removed from
                        # the index instead of being updated.
                        if not self.trigger(obj):
                            self.delete(obj.pk, database)
                            continue

                        doc = xapian.Document()

                        # Add default terms and values.  The UID is computed
                        # once and reused both as a term and as the key for
                        # replace_document below.
                        uid = self._create_uid(obj)
                        doc.add_term(uid)
                        self._insert_meta_values(doc, obj)

                        generator = xapian.TermGenerator()
                        generator.set_database(database)
                        generator.set_document(doc)
                        generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)

                        # Configure stemming/stopping per object language,
                        # when one is available.
                        stemming_lang = self._get_stem_language(obj)
                        if stemming_lang:
                            stemmer = self.get_stemmer(stemming_lang)
                            generator.set_stemmer(stemmer)
                            stopper = self.get_stopper(stemming_lang)
                            if stopper:
                                generator.set_stopper(stopper)

                        # Get a weight for the object
                        obj_weight = self._get_object_weight(obj)
                        # Index fields
                        self._do_index_fields(doc, generator, obj, obj_weight)

                        database.replace_document(uid, doc)
                        if after_index:
                            after_index(obj)

                        commiter.commit_object()
                    except Exception:
                        commiter.cancel_object()
                        raise

                commiter.commit_page()
            except Exception:
                commiter.cancel_page()
                raise

        database.flush()
    def put_data(self, key, data):
        """Index *data* under *key*, replacing any existing document.

        Extracts a sort date from the first of ``meta.date``,
        ``image.created`` or ``annex.added``, then — when the record has a
        git branch — adds date/path/branch terms plus prefixed terms for
        every recognized field (per the ``terms`` module tables) and marks
        the document ``XSok``; records without a branch are marked
        ``XSdropped``.  The raw record is stored as JSON, with *key* in
        value slot 0 and the encoded sort date in slot 1.
        """

        try:
            data['_date'] = first_of(data, 'meta.date', 'image.created',
                                     'annex.added')
            if isinstance(data['_date'], (list, tuple)):
                data['_date'] = data['_date'][0]
        except KeyError:
            data['_date'] = ''

        logger.debug("Sort key: %r", data['_date'])
        sortvalue = encode_sortable_date(data['_date'])

        doc = xapian.Document()
        self.term_generator.set_document(doc)

        git = data.get('git', {})

        if git.get('branch'):

            # add the sort date terms: full date (D), year (Y) and the bare
            # year for free-text matching
            d = term_date(data['_date'])
            doc.add_term('D' + d, 0)
            doc.add_term('Y' + d[:4], 0)
            doc.add_term(d[:4], 0)

            # index the filename (prefixed F and unprefixed) and each path
            # component (prefix P) of every branch path
            for branch, p in git.get('branch', {}).items():
                folder, filename = os.path.split(p)
                name, _ = os.path.splitext(filename)
                self.term_generator.index_text(name, 0, 'F')
                self.term_generator.index_text(name)
                self.term_generator.increase_termpos()
                for t in folder.split(os.sep):
                    if t:
                        doc.add_term("P" + t.lower(), 0)

            # walk every non-private, non-empty section of the record
            for section in data:

                if section[0] == '_': continue

                if data[section] is None: continue

                for field, values in data[section].items():

                    # NOTE(review): `prefix` is assigned but never used below
                    prefix = None

                    # handle arrays and straight values
                    # NOTE(review): list(dict) keeps only the keys — confirm
                    # that is the intent for dict-valued fields
                    if isinstance(values, (dict, )):
                        values = list(values)
                    if not isinstance(values, (list, tuple)):
                        values = [values]

                    # handle prefixed unstemmed boolean terms
                    if field in terms.PREFIXED_UNSTEMMED_BOOLEAN_TERMS:
                        field = terms.PREFIXED_UNSTEMMED_BOOLEAN_TERMS[field]

                        for value in values:
                            doc.add_term(field + value.lower(), 0)

                            # some terms should be added to the full text index
                            if field in terms.BOOLEAN_UNPREFIXED_STEMMED:
                                self.term_generator.index_text(value)
                                self.term_generator.increase_termpos()
                        continue

                    # handle prefixed unstemmed terms (D-prefixed fields get
                    # their value normalized to a date term first)
                    if field in terms.PREFIXED_UNSTEMMED_TERMS:
                        field = terms.PREFIXED_UNSTEMMED_TERMS[field]

                        for value in values:
                            if field[0] == 'D':
                                value = term_date(value)
                            doc.add_term(field + value.lower(), 0)
                        continue

                    # handle free terms (indexed both prefixed and plain)
                    if field in terms.STEMMED_TERMS:

                        for value in values:
                            self.term_generator.index_text(
                                value, 1, terms.STEMMED_TERMS[field])
                            self.term_generator.index_text(value)
                            self.term_generator.increase_termpos()
            doc.add_term('XSok')
        else:
            doc.add_term('XSdropped')

        doc.set_data(json.dumps(data))
        doc.add_value(0, key)
        doc.add_value(1, sortvalue)

        # unique ID term used as the replace_document key
        idterm = "QK{0}".format(key)
        doc.add_boolean_term(idterm)

        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("Data: %r", data)
            logger.debug("Terms: %r", [x.term for x in doc.termlist()])

        self.db.replace_document(idterm, doc)
Exemple #9
0
def index_app_info_from_parser(parser, db, cache):
    """Index one desktop-file entry (read via *parser*) into Xapian *db*.

    Reads standard desktop keys and ``X-AppInstall-*`` extensions from
    *parser*, builds a xapian.Document with prefixed search terms and value
    slots, and adds it to *db*.  *cache* maps package names to package
    objects with ``candidate``/``section`` attributes — presumably a
    python-apt cache; confirm against callers.  Mutates the module-level
    ``seen`` set and ``popcon_max``.  Returns early without adding a
    document when the entry is marked ignored or matches no usable
    architecture.
    """
    term_generator = xapian.TermGenerator()
    term_generator.set_database(db)
    try:
        # this tests if we have spelling suggestions (there must be
        # a better way?!?) - this is needed as inmemory does not have
        # spelling corrections, but it allows setting the flag and will
        # raise a exception much later
        db.add_spelling("test")
        db.remove_spelling("test")
        # this enables the flag for it (we only reach this line if
        # the db supports spelling suggestions)
        term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
    except xapian.UnimplementedError:
        pass
    doc = xapian.Document()
    term_generator.set_document(doc)
    # app name is the data; prefer the software-center override, then the
    # GNOME full name, then the plain desktop Name
    if parser.has_option_desktop("X-Ubuntu-Software-Center-Name"):
        name = parser.get_desktop("X-Ubuntu-Software-Center-Name")
        untranslated_name = parser.get_desktop("X-Ubuntu-Software-Center-Name",
                                               translated=False)
    elif parser.has_option_desktop("X-GNOME-FullName"):
        name = parser.get_desktop("X-GNOME-FullName")
        untranslated_name = parser.get_desktop("X-GNOME-FullName",
                                               translated=False)
    else:
        name = parser.get_desktop("Name")
        untranslated_name = parser.get_desktop("Name", translated=False)
    if name in seen:
        LOG.debug("duplicated name '%s' (%s)" % (name, parser.desktopf))
    LOG.debug("indexing app '%s'" % name)
    seen.add(name)
    doc.set_data(name)
    index_name(doc, name, term_generator)
    doc.add_value(XapianValues.APPNAME_UNTRANSLATED, untranslated_name)

    # check if we should ignore this file
    if parser.has_option_desktop("X-AppInstall-Ignore"):
        ignore = parser.get_desktop("X-AppInstall-Ignore")
        if ignore.strip().lower() == "true":
            LOG.debug("X-AppInstall-Ignore found for '%s'" % parser.desktopf)
            return
    # architecture
    pkgname_extension = ''
    if parser.has_option_desktop("X-AppInstall-Architectures"):
        arches = parser.get_desktop("X-AppInstall-Architectures")
        doc.add_value(XapianValues.ARCHIVE_ARCH, arches)
        native_archs = get_current_arch() in arches.split(',')
        foreign_archs = list(
            set(arches.split(',')) & set(get_foreign_architectures()))
        if not (native_archs or foreign_archs): return
        if not native_archs and foreign_archs:
            pkgname_extension = ':' + foreign_archs[0]
    # package name
    pkgname = parser.get_desktop("X-AppInstall-Package") + pkgname_extension
    doc.add_term("AP" + pkgname)
    if '-' in pkgname:
        # we need this to work around xapian oddness
        doc.add_term(pkgname.replace('-', '_'))
    doc.add_value(XapianValues.PKGNAME, pkgname)
    doc.add_value(XapianValues.DESKTOP_FILE, parser.desktopf)
    # display name
    if "display_name" in axi_values:
        doc.add_value(axi_values["display_name"], name)
    # cataloged_times
    if "catalogedtime" in axi_values:
        if pkgname in cataloged_times:
            doc.add_value(axi_values["catalogedtime"],
                          xapian.sortable_serialise(cataloged_times[pkgname]))
        else:
            # also catalog apps not found in axi (e.g. for-purchase apps)
            doc.add_value(axi_values["catalogedtime"],
                          xapian.sortable_serialise(time.time()))
    # pocket (main, restricted, ...)
    if parser.has_option_desktop("X-AppInstall-Section"):
        archive_section = parser.get_desktop("X-AppInstall-Section")
        doc.add_term("AS" + archive_section)
        doc.add_value(XapianValues.ARCHIVE_SECTION, archive_section)
    # section (mail, base, ..)
    if pkgname in cache and cache[pkgname].candidate:
        section = cache[pkgname].section
        doc.add_term("AE" + section)
    # channel (third party stuff)
    if parser.has_option_desktop("X-AppInstall-Channel"):
        archive_channel = parser.get_desktop("X-AppInstall-Channel")
        doc.add_term("AH" + archive_channel)
        doc.add_value(XapianValues.ARCHIVE_CHANNEL, archive_channel)
    # signing key (third party)
    if parser.has_option_desktop("X-AppInstall-Signing-Key-Id"):
        keyid = parser.get_desktop("X-AppInstall-Signing-Key-Id")
        doc.add_value(XapianValues.ARCHIVE_SIGNING_KEY_ID, keyid)
    # license (third party)
    if parser.has_option_desktop("X-AppInstall-License"):
        license = parser.get_desktop("X-AppInstall-License")
        doc.add_value(XapianValues.LICENSE, license)
    # purchased date
    if parser.has_option_desktop("X-AppInstall-Purchased-Date"):
        date = parser.get_desktop("X-AppInstall-Purchased-Date")
        # strip the subseconds from the end of the date string
        doc.add_value(XapianValues.PURCHASED_DATE, str(date).split(".")[0])
    # deb-line (third party)
    if parser.has_option_desktop("X-AppInstall-Deb-Line"):
        debline = parser.get_desktop("X-AppInstall-Deb-Line")
        doc.add_value(XapianValues.ARCHIVE_DEB_LINE, debline)
    # license key (third party)
    if parser.has_option_desktop("X-AppInstall-License-Key"):
        key = parser.get_desktop("X-AppInstall-License-Key")
        doc.add_value(XapianValues.LICENSE_KEY, key)
    # license keypath (third party)
    if parser.has_option_desktop("X-AppInstall-License-Key-Path"):
        path = parser.get_desktop("X-AppInstall-License-Key-Path")
        doc.add_value(XapianValues.LICENSE_KEY_PATH, path)
    # PPA (third party stuff)
    if parser.has_option_desktop("X-AppInstall-PPA"):
        archive_ppa = parser.get_desktop("X-AppInstall-PPA")
        doc.add_value(XapianValues.ARCHIVE_PPA, archive_ppa)
        # add archive origin data here so that its available even if
        # the PPA is not (yet) enabled
        doc.add_term("XOO" + "lp-ppa-%s" % archive_ppa.replace("/", "-"))
    # screenshot (for third party)
    if parser.has_option_desktop("X-AppInstall-Screenshot-Url"):
        url = parser.get_desktop("X-AppInstall-Screenshot-Url")
        doc.add_value(XapianValues.SCREENSHOT_URL, url)
    # thumbnail (for third party)
    if parser.has_option_desktop("X-AppInstall-Thumbnail-Url"):
        url = parser.get_desktop("X-AppInstall-Thumbnail-Url")
        doc.add_value(XapianValues.THUMBNAIL_URL, url)
    # video support (for third party mostly)
    if parser.has_option_desktop("X-AppInstall-Video-Url"):
        url = parser.get_desktop("X-AppInstall-Video-Url")
        doc.add_value(XapianValues.VIDEO_URL, url)
    # icon (for third party)
    if parser.has_option_desktop("X-AppInstall-Icon-Url"):
        url = parser.get_desktop("X-AppInstall-Icon-Url")
        doc.add_value(XapianValues.ICON_URL, url)
        if not parser.has_option_desktop("X-AppInstall-Icon"):
            doc.add_value(XapianValues.ICON, os.path.basename(url))
    # price (pay stuff)
    if parser.has_option_desktop("X-AppInstall-Price"):
        price = parser.get_desktop("X-AppInstall-Price")
        doc.add_value(XapianValues.PRICE, price)
        # since this is a commercial app, indicate it in the component value
        doc.add_value(XapianValues.ARCHIVE_SECTION, "commercial")
    # icon
    if parser.has_option_desktop("Icon"):
        icon = parser.get_desktop("Icon")
        doc.add_value(XapianValues.ICON, icon)
    # write out categories
    for cat in parser.get_desktop_categories():
        doc.add_term("AC" + cat.lower())
    categories_string = ";".join(parser.get_desktop_categories())
    doc.add_value(XapianValues.CATEGORIES, categories_string)
    for mime in parser.get_desktop_mimetypes():
        doc.add_term("AM" + mime.lower())
    # get type (to distinguish between apps and packages)
    if parser.has_option_desktop("Type"):
        type = parser.get_desktop("Type")
        doc.add_term("AT" + type.lower())
    # check gettext domain
    if parser.has_option_desktop("X-Ubuntu-Gettext-Domain"):
        domain = parser.get_desktop("X-Ubuntu-Gettext-Domain")
        doc.add_value(XapianValues.GETTEXT_DOMAIN, domain)
    # Description (software-center extension)
    if parser.has_option_desktop("X-AppInstall-Description"):
        descr = parser.get_desktop("X-AppInstall-Description")
        doc.add_value(XapianValues.SC_DESCRIPTION, descr)
    # popcon
    # FIXME: popularity not only based on popcon but also
    #        on archive section, third party app etc
    if parser.has_option_desktop("X-AppInstall-Popcon"):
        popcon = float(parser.get_desktop("X-AppInstall-Popcon"))
        # sort_by_value uses string compare, so we need to pad here
        doc.add_value(XapianValues.POPCON, xapian.sortable_serialise(popcon))
        global popcon_max
        popcon_max = max(popcon_max, popcon)

    # comment goes into the summary data if there is one,
    # other wise we try GenericName and if nothing else,
    # the summary of the package
    if parser.has_option_desktop("Comment"):
        s = parser.get_desktop("Comment")
        doc.add_value(XapianValues.SUMMARY, s)
    elif parser.has_option_desktop("GenericName"):
        s = parser.get_desktop("GenericName")
        if s != name:
            doc.add_value(XapianValues.SUMMARY, s)
    elif pkgname in cache and cache[pkgname].candidate:
        s = cache[pkgname].candidate.summary
        doc.add_value(XapianValues.SUMMARY, s)

    # add packagename as meta-data too
    term_generator.index_text_without_positions(pkgname, WEIGHT_APT_PKGNAME)

    # now add search data from the desktop file, each key weighted by the
    # matching WEIGHT_DESKTOP_* module constant when one exists
    for key in ["GenericName", "Comment", "X-AppInstall-Description"]:
        if not parser.has_option_desktop(key):
            continue
        s = parser.get_desktop(key)
        # we need the ascii_upper here for e.g. turkish locales, see
        # bug #581207
        k = "WEIGHT_DESKTOP_" + ascii_upper(key.replace(" ", ""))
        if k in globals():
            w = globals()[k]
        else:
            LOG.debug("WEIGHT %s not found" % k)
            w = 1
        term_generator.index_text_without_positions(s, w)
    # add data from the apt cache
    if pkgname in cache and cache[pkgname].candidate:
        s = cache[pkgname].candidate.summary
        term_generator.index_text_without_positions(s, WEIGHT_APT_SUMMARY)
        s = cache[pkgname].candidate.description
        term_generator.index_text_without_positions(s, WEIGHT_APT_DESCRIPTION)
        for origin in cache[pkgname].candidate.origins:
            doc.add_term("XOA" + origin.archive)
            doc.add_term("XOC" + origin.component)
            doc.add_term("XOL" + origin.label)
            doc.add_term("XOO" + origin.origin)
            doc.add_term("XOS" + origin.site)

    # add our keywords (with high priority)
    if parser.has_option_desktop("X-AppInstall-Keywords"):
        keywords = parser.get_desktop("X-AppInstall-Keywords")
        for s in keywords.split(";"):
            if s:
                term_generator.index_text_without_positions(
                    s, WEIGHT_DESKTOP_KEYWORD)
    # now add it
    db.add_document(doc)
Exemple #10
0
    def make_doc(self, cache):
        """Build a Xapian document from the desktop info."""
        doc = xapian.Document()
        # app name is the data
        name = self._set_doc_from_key(doc, AppInfoFields.NAME)
        assert name is not None
        doc.set_data(name)
        self._set_doc_from_key(doc,
                               AppInfoFields.NAME_UNTRANSLATED,
                               translated=False)

        # check if we should ignore this file
        if self.is_ignored:
            LOG.debug("%r.make_doc: %r is ignored.", self.__class__.__name__,
                      self.desktopf)
            return

        # architecture
        pkgname_extension = ''
        arches = self._set_doc_from_key(doc, AppInfoFields.ARCH)
        if arches:
            native_archs = get_current_arch() in arches.split(',')
            foreign_archs = list(
                set(arches.split(',')) & set(get_foreign_architectures()))
            if not (native_archs or foreign_archs):
                return
            if not native_archs and foreign_archs:
                pkgname_extension = ':' + foreign_archs[0]

        # package name
        pkgname = self._set_doc_from_key(doc,
                                         AppInfoFields.PACKAGE,
                                         pkgname_extension=pkgname_extension)
        doc.add_value(XapianValues.DESKTOP_FILE, self.desktopf)

        # display name
        display_name = axi_values.get("display_name")
        if display_name is not None:
            doc.add_value(display_name, name)

        # cataloged_times
        catalogedtime = axi_values.get("catalogedtime")
        if catalogedtime is not None and pkgname in cataloged_times:
            doc.add_value(catalogedtime,
                          xapian.sortable_serialise(cataloged_times[pkgname]))

        # section (mail, base, ..)
        if pkgname in cache and cache[pkgname].candidate:
            section = cache[pkgname].section
            doc.add_term("AE" + section)

        fields = (
            AppInfoFields.CHANNEL,  # channel (third party stuff)
            AppInfoFields.DEB_LINE,  # deb-line (third party)
            AppInfoFields.DESCRIPTION,  # description software-center extension
            AppInfoFields.GETTEXT_DOMAIN,  # check gettext domain
            AppInfoFields.ICON,  # icon
            AppInfoFields.LICENSE,  # license (third party)
            AppInfoFields.LICENSE_KEY,  # license key (third party)
            AppInfoFields.LICENSE_KEY_PATH,  # license keypath (third party)
            AppInfoFields.PPA,  # PPA (third party stuff)
            AppInfoFields.PURCHASED_DATE,  # purchased date
            AppInfoFields.SCREENSHOT_URLS,  # screenshot (for third party)
            AppInfoFields.SECTION,  # pocket (main, restricted, ...)
            AppInfoFields.SIGNING_KEY_ID,  # signing key (third party)
            AppInfoFields.SUPPORT_URL,  # support url (mainly pay stuff)
            AppInfoFields.SUPPORTED_DISTROS,  # supported distros
            AppInfoFields.THUMBNAIL_URL,  # thumbnail (for third party)
            AppInfoFields.VERSION,  # version support (for e.g. the scagent)
            AppInfoFields.VIDEO_URL,  # video support (for third party mostly)
            AppInfoFields.WEBSITE,  # homepage url (developer website)
        )
        for field in fields:
            self._set_doc_from_key(doc, field)

        # date published
        date_published_str = self._set_doc_from_key(
            doc, AppInfoFields.DATE_PUBLISHED)
        # we use the date published value for the cataloged time as well
        if date_published_str is not None:
            LOG.debug("pkgname: %s, date_published cataloged time is: %s",
                      pkgname, date_published_str)
            date_published = time.mktime(
                time.strptime(date_published_str, "%Y-%m-%d  %H:%M:%S"))
            # a value for our own DB
            doc.add_value(XapianValues.DB_CATALOGED_TIME,
                          xapian.sortable_serialise(date_published))
            if "catalogedtime" in axi_values:
                # compat with a-x-i
                doc.add_value(axi_values["catalogedtime"],
                              xapian.sortable_serialise(date_published))

        # icon (for third party)
        url = self._set_doc_from_key(doc, AppInfoFields.ICON_URL)
        if url and self.get_value(AppInfoFields.ICON) is None:
            # prefix pkgname to avoid name clashes
            doc.add_value(XapianValues.ICON,
                          "%s-icon-%s" % (pkgname, os.path.basename(url)))

        # price (pay stuff)
        price = self._set_doc_from_key(doc, AppInfoFields.PRICE)
        if price:
            # this is a commercial app, indicate it in the component value
            doc.add_value(XapianValues.ARCHIVE_SECTION, "commercial")
            # this is hard-coded to US dollar for now, but if the server
            # ever changes we can update
            doc.add_value(XapianValues.CURRENCY, "US$")

        # add donwload size as string (its send as int)
        download_size = self.get_value(AppInfoFields.DOWNLOAD_SIZE)
        if download_size is not None:
            doc.add_value(XapianValues.DOWNLOAD_SIZE,
                          xapian.sortable_serialise((download_size)))

        # write out categories
        for cat in self.get_categories():
            doc.add_term("AC" + cat.lower())
        categories_string = ";".join(self.get_categories())
        doc.add_value(XapianValues.CATEGORIES, categories_string)

        # mimetypes
        for mime in self.get_mimetypes():
            doc.add_term("AM" + mime.lower())

        # get type (to distinguish between apps and packages)
        app_type = self.get_value(AppInfoFields.TYPE)
        if app_type:
            doc.add_term("AT" + app_type.lower())

        # (deb)tags (in addition to the pkgname debtags)
        tags_string = self.get_value(AppInfoFields.TAGS)
        if tags_string:
            # convert to list and register
            tags = [tag.strip().lower() for tag in tags_string.split(",")]
            for tag in tags:
                doc.add_term("XT" + tag)
            # ENFORCE region blacklist/whitelist by not registering
            #          the app at all
            region = get_region_cached()
            if region:
                countrycode = region["countrycode"].lower()
                blacklist = [
                    t.replace(REGION_BLACKLIST_TAG, "") for t in tags
                    if t.startswith(REGION_BLACKLIST_TAG)
                ]
                whitelist = [
                    t.replace(REGION_WHITELIST_TAG, "") for t in tags
                    if t.startswith(REGION_WHITELIST_TAG)
                ]

                if countrycode in blacklist:
                    if countrycode in whitelist:
                        LOG.debug(
                            "%r.make_doc: %r black AND whitelisted for "
                            "region %r. Treating as blacklisted.",
                            self.__class__.__name__, name, countrycode)

                    LOG.debug(
                        "%r.make_doc: skipping region restricted app %r "
                        "(blacklisted)", self.__class__.__name__, name)
                    return

                if len(whitelist) > 0 and countrycode not in whitelist:
                    LOG.debug(
                        "%r.make_doc: skipping region restricted "
                        "app %r (region not whitelisted)",
                        self.__class__.__name__, name)
                    return

        # popcon
        # FIXME: popularity not only based on popcon but also
        #        on archive section, third party app etc
        popcon = self._set_doc_from_key(doc, AppInfoFields.POPCON)
        if popcon is not None:
            global popcon_max
            popcon_max = max(popcon_max, popcon)

        # comment goes into the summary data if there is one,
        # otherwise we try GenericName and if nothing else,
        # the summary of the candidate package
        summary = self._set_doc_from_key(doc, AppInfoFields.SUMMARY, name=name)
        if summary is None and pkgname in cache and cache[pkgname].candidate:
            summary = cache[pkgname].candidate.summary
            doc.add_value(XapianValues.SUMMARY, summary)

        return doc
Exemple #11
0
    def __init__(self, path, popcon_dir, axi_path, tags_filter):
        """
        Set initial attributes and build a xapian index of popcon submissions.

        path: destination directory for the new index (wiped and recreated).
        popcon_dir: directory holding popcon submission files.
        axi_path: apt-xapian-index database used to validate package names
            and look up their tags.
        tags_filter: file listing valid tags, one per line ('#' = comment).

        Raises Error if the popcon dir is empty or the index cannot be
        created.
        """
        self.axi = xapian.Database(axi_path)
        self.path = os.path.expanduser(path)
        self.popcon_dir = os.path.expanduser(popcon_dir)
        self.valid_pkgs = axi_get_pkgs(self.axi)
        # lazy %-args: formatting is skipped when debug logging is disabled
        logging.debug("Considering %d valid packages", len(self.valid_pkgs))
        with open(tags_filter) as valid_tags:
            self.valid_tags = [
                line.strip() for line in valid_tags if not line.startswith("#")
            ]
        logging.debug("Considering %d valid tags", len(self.valid_tags))
        if not os.path.exists(self.popcon_dir):
            os.makedirs(self.popcon_dir)
        if not os.listdir(self.popcon_dir):
            logging.critical("Popcon dir seems to be empty.")
            raise Error

        # set up a fresh index directory (keyword form instead of the
        # opaque positional `1` for ignore_errors)
        shutil.rmtree(self.path, ignore_errors=True)
        os.makedirs(self.path)
        try:
            logging.info("Indexing popcon submissions from '%s'",
                         self.popcon_dir)
            logging.info("Creating new xapian index at '%s'", self.path)
            xapian.WritableDatabase.__init__(self, self.path,
                                             xapian.DB_CREATE_OR_OVERWRITE)
        except xapian.DatabaseError as e:
            logging.critical("Could not create popcon xapian index.")
            logging.critical(str(e))
            raise Error

        # build new index
        doc_count = 0
        for root, dirs, files in os.walk(self.popcon_dir):
            for popcon_file in files:
                submission = PopconSubmission(os.path.join(root, popcon_file))
                submission_pkgs = submission.get_filtered(self.valid_pkgs)
                if len(submission_pkgs) < 10:
                    # too few known packages: not representative, skip it
                    logging.debug("Low profile popcon submission '%s' (%d)",
                                  submission.user_id, len(submission_pkgs))
                else:
                    doc = xapian.Document()
                    doc.set_data(submission.user_id)
                    doc.add_term("ID" + submission.user_id)
                    doc.add_term("ARCH" + submission.arch)
                    logging.debug("Parsing popcon submission '%s'",
                                  submission.user_id)
                    for pkg, freq in submission_pkgs.items():
                        tags = axi_search_pkg_tags(self.axi, pkg)
                        # if the package was found in axi
                        if tags:
                            doc.add_term("XP" + pkg, freq)
                            # if the package has tags associated with it
                            if not tags == "notags":
                                for tag in tags:
                                    # NOTE(review): lstrip("XT") strips any
                                    # run of leading 'X'/'T' characters, not
                                    # the literal "XT" prefix; safe while tag
                                    # bodies are lowercase -- confirm format
                                    if tag.lstrip("XT") in self.valid_tags:
                                        doc.add_term(tag, freq)
                    doc_id = self.add_document(doc)
                    doc_count += 1
                    logging.debug("Popcon Xapian: Indexing doc %d", doc_id)
                # keep memory bounded while walking many submissions
                gc.collect()
        # flush database changes to disk
        try:
            self.commit()
        except AttributeError:
            # commit() does not exist in old xapian bindings; fall back to
            # the deprecated flush().  (Narrowed from a bare `except:` so
            # real database errors are no longer silently swallowed.)
            self.flush()
Exemple #12
0
    def index(self, document, commit=False):
        """Build a xapian.Document from a pre-parsed tuple and store it.

        `document` is an 8-tuple: (id, values, terms, texts, data,
        default_language, default_spelling, default_positions).  Returns
        the result of `self.replace()` for the assembled document.
        """
        database = self.database
        document_id, document_values, document_terms, document_texts, document_data, default_language, default_spelling, default_positions = document

        # from here on `document` is rebound to the xapian document being built
        document = xapian.Document()

        if document_data:
            document.set_data(document_data)

        # store named values into their mapped slots; names with no slot
        # mapping are logged and skipped
        for name, value in (document_values or {}).items():
            name = name.strip()
            slot = get_slot(name)
            if slot:
                value = serialise_value(value)[0]
                if value:
                    document.add_value(slot, value)
            else:
                self.log.warning("Ignored document value name (%r)", name)

        if isinstance(document_id, basestring):
            document.add_value(get_slot('ID'), document_id)
            document_id = prefixed(document_id, DOCUMENT_ID_TERM_PREFIX)
            document.add_boolean_term(document_id)  # Make sure document_id is also a term (otherwise it doesn't replace an existing document)

        # explicit terms: each entry is either a bare term or a
        # (terms, weight, prefix, position) tuple with trailing items optional
        for terms in document_terms or ():
            if isinstance(terms, (tuple, list)):
                terms, weight, prefix, position = (list(terms) + [None] * 4)[:4]
            else:
                weight = prefix = position = None
            if not terms:
                continue

            weight = 1 if weight is None else weight
            prefix = '' if prefix is None else prefix

            # NOTE: `terms` is deliberately rebound by this loop --
            # find_terms yields (term, field_name, remainder) and the
            # boolean branch below consumes the remainder whole
            for term, field_name, terms in find_terms(terms, None):
                # a not-all-lowercase field name / prefix marks a boolean
                # (filter) term
                if field_name:
                    boolean = not field_name.islower()
                    term_prefix = get_prefix(field_name, DOCUMENT_CUSTOM_TERM_PREFIX)
                else:
                    boolean = not prefix.islower()
                    term_prefix = prefix
                if boolean:
                    term = terms
                for term in serialise_value(term):
                    if term:
                        if not boolean:
                            term = term.lower()
                        if position is None:
                            document.add_term(prefixed(term, term_prefix), weight)
                        else:
                            document.add_posting(prefixed(term, term_prefix), position, weight)
                if boolean:
                    # the boolean term consumed the whole remainder
                    break

        # free text: each entry is either a bare string or a
        # (text, weight, prefix, language, spelling, positions) tuple
        for text in document_texts or ():
            if isinstance(text, (tuple, list)):
                text, weight, prefix, language, spelling, positions = (list(text) + [None] * 6)[:6]
            else:
                weight = prefix = language = spelling = positions = None
            if not text:
                continue

            weight = 1 if weight is None else weight
            prefix = '' if prefix is None else prefix
            language = default_language if language is None else language
            positions = default_positions if positions is None else positions
            spelling = default_spelling if spelling is None else spelling

            term_generator = xapian.TermGenerator()
            term_generator.set_document(document)
            if spelling:
                # spelling data must be written into the database itself
                term_generator.set_database(database)
                term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
            if language:
                term_generator.set_stemmer(xapian.Stem(language))
            if positions:
                index_text = term_generator.index_text
            else:
                index_text = term_generator.index_text_without_positions
            index_text(normalize(text), weight, prefix.upper())

        return self.replace(document_id, document, commit=commit)
Exemple #13
0
def test_value_mods():
    """Test handling of modifications to values.

    """
    dbpath = 'db_test_value_mods'
    db = xapian.chert_open(dbpath, xapian.DB_CREATE_OR_OVERWRITE)
    random.seed(42)
    doccount = 1000
    expected = {}

    # Populate every document with a single value in slot 1.
    for docnum in range(1, doccount):
        value = 'val%d' % docnum
        newdoc = xapian.Document()
        newdoc.add_value(1, value)
        db.add_document(newdoc)
        expected[docnum] = value
    db.commit()
    check_vals(db, expected)

    # Overwrite one value (regression test: this failed with the initial
    # implementation of streaming values).
    replacement = xapian.Document()
    replacement.add_value(1, 'newval0')
    db.replace_document(2, replacement)
    expected[2] = 'newval0'
    db.commit()
    check_vals(db, expected)

    # Apply a batch of random modifications; every fifth step clears the
    # value instead of setting a new one.
    for step in range(1, doccount * 2):
        target = random.randint(1, doccount)
        moddoc = xapian.Document()

        if step % 5 == 0:
            value = ''
        else:
            value = 'newval%d' % step
            moddoc.add_value(1, value)
        db.replace_document(target, moddoc)
        expected[target] = value

    # Values must match both before and after committing.
    check_vals(db, expected)
    db.commit()
    check_vals(db, expected)

    # Clear the remaining non-empty values, in a random order.
    remaining = [docid for docid, value in expected.items() if value != '']
    random.shuffle(remaining)
    for docid in remaining:
        db.replace_document(docid, xapian.Document())
        expected[docid] = ''
    check_vals(db, expected)
    db.commit()
    check_vals(db, expected)

    # A closed database must refuse further access.
    db.close()
    expect_exception(xapian.DatabaseError, "Database has been closed",
                     check_vals, db, expected)
    shutil.rmtree(dbpath)
Exemple #14
0
def test_postingsource():
    """Simple test of the PostingSource class.

    """
    class OddPostingSource(xapian.PostingSource):
        # Yields the odd document ids 1, 3, 5, ... up to `limit`.
        def __init__(self, limit):
            xapian.PostingSource.__init__(self)
            self.limit = limit

        def init(self, db):
            self.current = -1

        def get_termfreq_min(self):
            return 0

        def get_termfreq_est(self):
            return int(self.limit / 2)

        def get_termfreq_max(self):
            return self.limit

        def next(self, minweight):
            self.current += 2

        def at_end(self):
            return self.current > self.limit

        def get_docid(self):
            return self.current

    dbpath = 'db_test_postingsource'
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OVERWRITE)
    for _ in range(10):
        db.add_document(xapian.Document())

    # Dance through several scopes to check that the posting source is not
    # dereferenced too early in any of them.
    def build_enquire(db):
        def make_query():
            # the source itself goes out of scope immediately
            source = OddPostingSource(10)
            return xapian.Query(xapian.Query.OP_OR, [xapian.Query(source)])

        def first_enquire():
            # the query object goes out of scope once this returns
            enq = xapian.Enquire(db)
            enq.set_query(make_query())
            return enq

        def second_enquire():
            # pull the query back out of one enquire and into another
            inner = first_enquire()
            enq = xapian.Enquire(db)
            enq.set_query(inner.get_query())
            return enq

        return second_enquire()

    enquire = build_enquire(db)
    mset = enquire.get_mset(0, 10)

    expect([item.docid for item in mset], [1, 3, 5, 7, 9])

    db.close()
    shutil.rmtree(dbpath)
Exemple #15
0
    def _build_index(self, filepath, recreate=False):
        """
            Build (or reopen) a xapian full-text index for a text corpus.

            Input:
                - filepath: corpus path; supports .pdf, .gz, .bz2 and .txt
                  files, or a directory containing such files.  The index is
                  stored next to it at `filepath + ".index"`.
                - recreate: bool, True will force recreate db, default is False
        """
        cached_index = filepath + ".index"

        if os.path.exists(cached_index):
            if recreate:
                shutil.rmtree(cached_index)
        else:
            # no cached index yet, so we must build one
            recreate = True

        stemmer = xapian.Stem("english")

        if not recreate:
            database = xapian.Database(cached_index)
        else:
            database = xapian.WritableDatabase(cached_index, xapian.DB_CREATE_OR_OPEN)
            indexer = xapian.TermGenerator()
            indexer.set_stemmer(stemmer)

            if os.path.isdir(filepath):
                filepaths = glob.glob(os.path.join(filepath, "*.*"))
            else:
                filepaths = [filepath]

            def _add_chunk(sents):
                # index one chunk of tab-joined sentences as a document
                combined = "\t".join(sents)
                doc = xapian.Document()
                doc.set_data(combined)
                indexer.set_document(doc)
                indexer.index_text(combined)
                database.add_document(doc)

            for filepath in filepaths:
                ext = os.path.splitext(filepath)[-1]
                open_func = open
                if ext == ".pdf":
                    # convert the pdf to text once, then index the text file
                    filepath2 = filepath + ".txt"
                    if not os.path.exists(filepath2):
                        subprocess.Popen(('pdftotext', filepath, filepath2)).wait()
                    filepath = filepath2
                elif ext == ".bz2":
                    import bz2
                    open_func = bz2.open
                elif ext == ".gz":
                    import gzip
                    open_func = gzip.open
                elif ext != ".txt":
                    # bugfix: plain .txt files used to be skipped here even
                    # though the docstring promises .txt support; now only
                    # genuinely unsupported extensions are skipped
                    continue

                with open_func(filepath, mode="rt", encoding="utf-8") as f:
                    for line in tqdm(f, desc="Building index for " + filepath, unit=" lines"):
                        line = line.strip()
                        if len(line) < 1:
                            continue
                        sent_combined = []
                        sent_len = 0
                        for sent in nltk.sent_tokenize(line):
                            sent = sent.strip()
                            tokens = wordpunct_tokenize(sent)
                            # flush the pending chunk before it would exceed
                            # half of the maximum sequence length
                            if sent_len > 0 and sent_len + len(tokens) > self.max_seq_len / 2:
                                _add_chunk(sent_combined)
                                sent_combined = []
                                sent_len = 0
                            sent_len += len(tokens)
                            sent_combined.append(sent)
                        # flush whatever is left at end of line
                        if sent_len > 0:
                            _add_chunk(sent_combined)

        self.parser = xapian.QueryParser()
        self.parser.set_stemmer(stemmer)
        self.parser.set_database(database)
        self.parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
        self.enquire = xapian.Enquire(database)
def test_replication_concurrency():
    """Test concurrent replication and modification

    """

    builddir = os.environ['abs_builddir']
    dbsdir = os.path.join(builddir, 'dbs_replication')
    if not os.path.isdir(dbsdir):
        os.makedirs(dbsdir)

    masterpath = os.path.join(dbsdir, 'master')
    firstpath = os.path.join(dbsdir, 'first')
    secondpath = os.path.join(dbsdir, 'second')
    slavepath = os.path.join(dbsdir, 'slave')
    if os.path.isdir(masterpath):
        shutil.rmtree(masterpath)
    if os.path.isdir(slavepath):
        shutil.rmtree(slavepath)
    port = 7876

    expect_exception(
        xapian.DatabaseOpeningError,
        "Couldn't stat '" + dbsdir + "/slave' (No such file or directory)",
        xapian.Database, slavepath)

    clientp = None
    serverp = subprocess.Popen((
        '../../xapian-core/bin/xapian-replicate-server',
        dbsdir,
        '--port=7876',
    ), )

    doccount1 = 10000
    doccount2 = 1000

    starttime = time.time()
    if not os.path.isdir(firstpath):
        firstdb = xapian.WritableDatabase(firstpath,
                                          xapian.DB_CREATE_OR_OVERWRITE)
        # Make an initial, large database
        print
        print "Building initial database ..."
        for num in xrange(1, doccount1):
            doc = xapian.Document()
            val = 'val%d' % num
            doc.add_value(1, val)
            firstdb.add_document(doc)
            if num % 100000 == 0:
                print "%d documents..." % num
        firstdb.set_metadata('dbname', '1')
        firstdb.commit()
        print "built"

    # The secondary database gets modified during the test, so needs to be
    # cleared now.
    shutil.rmtree(secondpath)
    if not os.path.isdir(secondpath):
        seconddb = xapian.WritableDatabase(secondpath,
                                           xapian.DB_CREATE_OR_OVERWRITE)
        # Make second, small database
        print
        print "Building secondary database ..."
        for num in xrange(1, doccount2):
            doc = xapian.Document()
            val = 'val%d' % num
            doc.add_value(1, val)
            seconddb.add_document(doc)
            if num % 100000 == 0:
                print "%d documents..." % num
        seconddb.set_metadata('dbname', '2')
        seconddb.commit()
        print "built"

    if time.time() - starttime < 1:
        time.sleep(1)  # Give server time to start

    try:
        set_master(masterpath, firstpath)
        clientp = subprocess.Popen((
            '../../xapian-core/bin/xapian-replicate',
            '--host=127.0.0.1',
            '--master=master',
            os.path.join(dbsdir, 'slave'),
            '--interval=0',
            '--port=7876',
            '-r 0',
        ), )
        time.sleep(1)  # Give client time to start
        expect(xapian.Database(slavepath).get_metadata('dbname'), '1')

        for count in xrange(10):
            # Test that swapping between databases doesn't confuse replication.
            for count2 in xrange(2):
                set_master(masterpath, secondpath)
                time.sleep(0.1)
                set_master(masterpath, firstpath)
                time.sleep(0.1)

            # Test making changes to the database.
            set_master(masterpath, secondpath)
            masterdb = xapian.WritableDatabase(masterpath, xapian.DB_OPEN)
            print "making 100 changes"
            for num in xrange(100):
                masterdb.set_metadata('num%d' % num, str(num + count))
                masterdb.commit()
            print "changes done"
            masterdb.close()

            # Allow time for the replication client to catch up with the
            # changes.
            time.sleep(2)
            expect(xapian.Database(slavepath).get_metadata('dbname'), '2')
            expect(
                xapian.Database(slavepath).get_metadata('num99'),
                str(99 + count))

    finally:
        if clientp is not None:
            os.kill(clientp.pid, 9)
            clientp.wait()
        os.kill(serverp.pid, 9)
        serverp.wait()
Exemple #17
0
        if ".git" in dirpath or "__xdb__" in dirpath:
            continue
        for filename in filenames:
            cursor = os.path.join(dirpath, filename)

            # skip non-plain files
            with open(cursor, 'rb') as cursor_file:
                cursor_content = cursor_file.read()
                if is_binary_string(cursor_content):
                    spinner.print("D: skip non-plain file {}".format(cursor))
                    continue
            #print("I: {:04d} : processing {}".format(counter_indexed, cursor))
            spinner.spin()

            try:
                doc = xapian.Document()
                with open(cursor, 'r') as cursor_file:
                    cursor_document = cursor_file.read()

                doc.set_data(cursor_document)

                indexer.set_document(doc)
                indexer.index_text(cursor_content)

                xdb.add_document(doc)

                counter_indexed = counter_indexed + 1
                pathlist_indexed.append(cursor)
            except:
                spinner.print("W: skip problematic file {}".format(cursor))
                pass
    def update(self, index, iterable):
        """
        Updates the `index` with any objects in `iterable` by adding/updating
        the database as needed.

        Required arguments:
            `index` -- The `SearchIndex` to process
            `iterable` -- An iterable of model instances to index

        For each object in `iterable`, a document is created containing all
        of the terms extracted from `index.full_prepare(obj)` with field
        prefixes, and 'as-is' as needed.  Also, if the field type is 'text'
        it will be stemmed and stored with the 'Z' prefix as well.

        eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtest`

        Each document also contains an extra term in the format:

        `XCONTENTTYPE<app_name>.<model_name>`

        As well as a unique identifier in the format:

        `Q<app_name>.<model_name>.<pk>`

        eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`

        This is useful for querying for a specific document corresponding to
        a model instance.

        The document also contains a pickled version of the object itself and
        the document ID in the document data field.

        Finally, we also store field values to be used for sorting data.  We
        store these in the document value slots (position zero is reserved
        for the document ID).  All values are stored as unicode strings with
        conversion of float, int, double, values being done by Xapian itself
        through the use of the :method:xapian.sortable_serialise method.
        """
        database = self._database(writable=True)
        try:
            for obj in iterable:
                document = xapian.Document()

                # one term generator per object, attached to the shared
                # database so spelling data (if enabled) lands in the index
                term_generator = xapian.TermGenerator()
                term_generator.set_database(database)
                term_generator.set_stemmer(xapian.Stem(self.language))
                if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                    term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
                term_generator.set_document(document)

                document_id = DOCUMENT_ID_TERM_PREFIX + get_identifier(obj)
                data = index.full_prepare(obj)
                weights = index.get_field_weights()
                for field in self.schema:
                    if field['field_name'] in data.keys():
                        prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
                        value = data[field['field_name']]
                        try:
                            weight = int(weights[field['field_name']])
                        except KeyError:
                            # fields without an explicit weight default to 1
                            weight = 1
                        if field['type'] == 'text':
                            if field['multi_valued'] == 'false':
                                term = _marshal_term(value)
                                # index both unprefixed and prefixed forms;
                                # the term generator adds stemmed Z-terms
                                term_generator.index_text(term, weight)
                                term_generator.index_text(term, weight, prefix)
                                # single-word values are also added verbatim
                                if len(term.split()) == 1:
                                    document.add_term(term, weight)
                                    document.add_term(prefix + term, weight)
                                document.add_value(field['column'], _marshal_value(value))
                            else:
                                for term in value:
                                    term = _marshal_term(term)
                                    term_generator.index_text(term, weight)
                                    term_generator.index_text(term, weight, prefix)
                                    if len(term.split()) == 1:
                                        document.add_term(term, weight)
                                        document.add_term(prefix + term, weight)
                        else:
                            if field['multi_valued'] == 'false':
                                term = _marshal_term(value)
                                if len(term.split()) == 1:
                                    document.add_term(term, weight)
                                    document.add_term(prefix + term, weight)
                                    # NOTE(review): unlike the 'text' branch,
                                    # add_value only runs for single-word
                                    # terms here, so multi-word non-text
                                    # values are never stored for sorting --
                                    # confirm this is intended
                                    document.add_value(field['column'], _marshal_value(value))
                            else:
                                for term in value:
                                    term = _marshal_term(term)
                                    if len(term.split()) == 1:
                                        document.add_term(term, weight)
                                        document.add_term(prefix + term, weight)

                # document data: pickled (app_label, module_name, pk, data)
                document.set_data(pickle.dumps(
                    (obj._meta.app_label, obj._meta.module_name, obj.pk, data),
                    pickle.HIGHEST_PROTOCOL
                ))
                document.add_term(document_id)
                document.add_term(
                    DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
                    (obj._meta.app_label, obj._meta.module_name)
                )
                # keyed on the unique ID term so existing docs are replaced
                database.replace_document(document_id, document)

        except UnicodeDecodeError:
            sys.stderr.write('Chunk failed.\n')
            pass

        finally:
            database = None
Exemple #19
0
    def index(self,
              fieldname,
              value,
              search_default=False,
              store_facet=True,
              spelling=False,
              weight=1,
              isdocid=False):
        """Index a field value.

        `fieldname` is the field to index.
        `value` is the value to index. This can be a string, int, float, or
            datetime object. Flax will attempt to index each appropriately.
            This should either be a unicode object or a UTF-8 string.
        `search_default` specifies whether the value is also indexed without
            a prefix, so default (unfielded) searches will find it.
        `store_facet` specifies whether to store facet values (filter fields
            only) and is True by default.
        `spelling` specifies whether to add spellings to the database.
        `weight` allows the WDF to be set (1 by default)
        `isdocid` uses this field value as a docid (filter fields only).
            False by default.

        """

        # empty/zero values are not indexed at all
        if not value:
            return

        # only one field value may serve as the document id
        if isdocid and self._docid:
            raise IndexingError, 'docid has already been set'

        # normalise unicode input to UTF-8 bytes
        if isinstance(value, unicode):
            value = value.encode('utf-8', 'ignore')

        prefix, valnum, isfilter = self._fieldmap[fieldname]

        # a term generator is only needed for free-text style indexing
        if not isfilter or search_default or spelling:
            termgen = xapian.TermGenerator()
            if self._stemmer:
                termgen.set_stemmer(self._stemmer)

        if isfilter:
            if isinstance(value, basestring):
                # the ':' separates the prefix from values that start with
                # an uppercase letter (presumably to keep them distinct from
                # the prefix characters -- xapian convention; confirm)
                term = u'%s%s%s' % (prefix, ':' if value[0].isupper() else '',
                                    value.decode('utf-8', 'ignore'))
                term = term.encode('utf-8', 'ignore')
                # NOTE(review): value was already encoded to bytes above, so
                # this check appears to be dead code -- confirm and remove
                if isinstance(value, unicode):
                    value = value.encode('utf-8', 'ignore')

                self._doc.add_term(term)

                if store_facet:
                    if _multivalues:
                        # serialised list lets several facet values share
                        # one value slot
                        self._facets.setdefault(
                            valnum,
                            xapian.StringListSerialiser()).append(value)
                    else:
                        if self._facets.get(valnum):
                            raise IndexingError, \
                                'facet value already set for "%s" field' % fieldname
                        self._facets[valnum] = value

                if isdocid:
                    self._docid = term

            elif isinstance(value, float) or isinstance(value, int):
                # numbers are stored sortably in the value slot
                self._doc.add_value(valnum, xapian.sortable_serialise(value))
                # FIXME - helper terms?
                # FIXME - numeric facets

                if isdocid:
                    self._docid = '%s%s' % (prefix, value)

            elif isinstance(value, datetime):
                # year / year-month / year-month-day drilldown terms
                self._doc.add_term('%s%04d' % (prefix, value.year))
                self._doc.add_term('%s%04d%02d' %
                                   (prefix, value.year, value.month))
                self._doc.add_term(
                    '%s%04d%02d%02d' %
                    (prefix, value.year, value.month, value.day))
                #                self._doc.add_value(valnum, '%04d%02d%02d%02d%02d%02d' % (
                #                    value.year, value.month, value.day,
                #                    value.hour, value.minute, value.second))
                # stored as a sortable epoch timestamp for range searches
                self._doc.add_value(
                    valnum,
                    xapian.sortable_serialise(time.mktime(value.timetuple())))

                if isdocid:
                    raise IndexingError, 'cannot use date as docid'
        else:
            # non-filter (free-text) field: run through the term generator
            if isinstance(value, str):
                termgen.set_document(self._doc)
                termgen.index_text(value, weight, prefix)
            else:
                raise IndexingError, 'non-filter field requires string value'

            if isdocid:
                raise IndexingError, 'cannot use non-filter field as docid'

        # spelling only works for prefix-less terms
        if search_default or spelling:
            if search_default:
                termgen.set_document(self._doc)
            else:
                termgen.set_document(xapian.Document())  # dummy document

            if spelling:
                if self.database is None:
                    raise IndexingError, 'spelling requires document.database to be set'
                termgen.set_database(self.database)
                termgen.set_flags(termgen.FLAG_SPELLING)

            termgen.index_text(value)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function

import xapian


stem = xapian.Stem('english')

db = xapian.inmemory_open()

# Add two identical documents: each word is stemmed and posted at its
# 1-based position ("there" occurs at positions 2 and 5, so its
# within-document frequency is 2).  The duplicated hand-built document of
# the original is replaced by a loop.
for _ in range(2):
    doc = xapian.Document()
    for position, word in enumerate(
            ("is", "there", "anybody", "out", "there"), start=1):
        doc.add_posting(stem(word), position)
    db.add_document(doc)

db.commit()

# Dump the term dictionary.  Every term occurs in both documents, so each
# termfreq (number of documents containing the term) is 2.
# NOTE: print() replaces the original Python-2-only print statement; with
# the __future__ import at the top this runs on both Python 2 and 3.
for term in db.allterms():
    print(term.term, term.termfreq)
"""
    anybodi 2
    is 2
Exemple #21
0
def test_all():
    """Smoke-test the Python bindings for the Xapian search library.

    Exercises version reporting, Document/Query/Enquire wrappers, the
    iterator protocols on databases and result sets, subclassing of
    MatchDecider/ExpandDecider/Stopper, QueryParser behaviour, value-range
    processors and database metadata.

    NOTE(review): relies on test-harness helpers (``expect``,
    ``expect_exception``, ``expect_query``, ``context``, ``TestFail``)
    defined elsewhere in this file — presumably standard xapian test
    scaffolding; confirm against the rest of the suite.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        return xapian.cvar

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError, "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem(b"english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    # Document data is an opaque byte string and may contain NUL bytes.
    doc = xapian.Document()
    doc.set_data(b"a\0b")
    if doc.get_data() == b"a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), b"a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data(b"is there anybody out there?")
    doc.add_term(b"XYzzy")
    doc.add_posting(stem(b"is"), 1)
    doc.add_posting(stem(b"there"), 2)
    doc.add_posting(stem(b"anybody"), 3)
    doc.add_posting(stem(b"out"), 4)
    doc.add_posting(stem(b"there"), 5)

    # Build an in-memory database holding the single document above.
    db = xapian.inmemory_open()
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [t.encode('utf-8') for t in terms]),
        "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE,
                          (b"smoke", b"test", b"tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query(b"smoke"), query1, b"string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [s.encode('utf-8') for s in subqs]),
        "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected:

    expect_exception(xapian.DatabaseOpeningError, None, xapian.open_stub,
                     b"nosuchdir/nosuchdb")
    expect_exception(xapian.DatabaseOpeningError, None, xapian.open_stub,
                     b"nosuchdir/nosuchdb", xapian.DB_OPEN)

    expect_exception(xapian.DatabaseOpeningError, None, xapian.brass_open,
                     b"nosuchdir/nosuchdb")
    expect_exception(xapian.DatabaseCreateError, None, xapian.brass_open,
                     b"nosuchdir/nosuchdb", xapian.DB_CREATE)

    expect_exception(xapian.DatabaseOpeningError, None, xapian.chert_open,
                     b"nosuchdir/nosuchdb")
    expect_exception(xapian.DatabaseCreateError, None, xapian.chert_open,
                     b"nosuchdir/nosuchdb", xapian.DB_CREATE)

    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"/bin/false", b"")

    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = b" ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, b"is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist(b"there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(b""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, b"there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to(b'n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < b'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" %
                           x.term.decode('utf-8'))

    # Feature test for Document.values
    count = 0
    for term in list(doc.values()):
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data(b"Two")
    doc.add_posting(stem(b"out"), 1)
    doc.add_posting(stem(b"outside"), 1)
    doc.add_posting(stem(b"source"), 2)
    doc.add_value(0, b"yes")
    db.add_document(doc)

    # Only documents whose value slot 0 equals b"yes" pass the decider.
    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == b"yes"

    query = xapian.Query(stem(b"out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith(b'a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith(b'a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>", qp.parse_query,
                     b"test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query(b"NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem(b'en'))
    expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((out@2 SYNONYM outsid@2) OR Zo@2))")

    expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND Zoutsid@2)")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')),
                 '(foo OR bar\\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')),
                 '(foo OR bar\u00a3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'),
                 '(foo OR bar)')

    expect_query(
        qp.parse_query(b"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\u00e9st@1)")

    doc = xapian.Document()
    doc.set_data(b"Unicode with an acc\xe9nt")
    doc.add_posting(stem(b"out\xe9r"), 1)
    expect(doc.get_data(), b"Unicode with an acc\xe9nt")
    term = next(doc.termlist()).term
    expect(term, b"out\xe9r")

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add(b'a')
    expect(stop(b'a'), True)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == b"b"

        def get_description(self):
            return "my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), "my_b_stopper")
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    expect(stop(b'b'), True)
    expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(b'foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()], [(b'bar', 1, [2]), (b'baz', 1, [3]),
                                          (b'foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query(b'12/03/99..12/04/01')
    expect(str(query), 'Query(0 * VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, b'$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b.encode('utf-8'))
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query(b"I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata(b'Foo'), b'')
    db.set_metadata(b'Foo', b'Foo')
    expect(db.get_metadata(b'Foo'), b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.set_metadata, b'',
                     b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query(b'foo'), 5),
        "5 * foo")
Exemple #22
0
    def update(self,
               documents=None,
               after_index=None,
               per_page=10000,
               commit_each=False):
        """
        Update the database with the documents.
        There are some default value and terms in a document:
         * Values:
           1. Used to store the ID of the document
           2. Store the model of the object (in the string format, like
              "project.app.model")
           3. Store the indexer descriptor (module path)
           4..10. Free

         * Terms
           UID: Used to store the ID of the document, so we can replace
                the document by the ID

        :param documents: iterable of objects to (re)index; when ``None``
            every object of ``self._model`` is reindexed.
        :param after_index: optional callback invoked with each object
            after it has been written to the index.
        :param per_page: number of objects fetched and indexed per batch.
        :param commit_each: when True, commit after every object instead
            of once per page.
        """
        # Open Xapian Database for writing.
        database = self._db.open(write=True)

        # If doesnt have any document at all, reindex the whole table.
        if documents is None:
            update_queue = self._model.objects.all()
        else:
            update_queue = documents

        # The commiter controls transaction granularity (per object vs.
        # per page) via the three callbacks supplied here.
        commiter = Commiter.create(commit_each)(
            lambda: database.begin_transaction(flush=True),
            database.commit_transaction, database.cancel_transaction)

        # Process the queue in pages to bound memory usage.
        for page in paginate(update_queue, per_page):
            try:
                commiter.begin_page()

                for obj in page.object_list:
                    commiter.begin_object()

                    try:
                        # Objects rejected by the trigger are removed from
                        # the index instead of being (re)indexed.
                        if not self.trigger(obj):
                            self.delete(obj.pk, database)
                            continue

                        doc = xapian.Document()

                        # Add default terms and values.  Compute the UID
                        # once and reuse it both as the term and as the
                        # docid for replace_document() below (the original
                        # called _create_uid() a second time here).
                        uid = self._create_uid(obj)
                        doc.add_term(uid)
                        self._insert_meta_values(doc, obj)

                        generator = xapian.TermGenerator()
                        generator.set_database(database)
                        generator.set_document(doc)
                        generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)

                        # Configure language-specific stemming/stopping
                        # only when the object declares a stem language.
                        stemming_lang = self._get_stem_language(obj)
                        if stemming_lang:
                            stemmer = self.get_stemmer(stemming_lang)
                            generator.set_stemmer(stemmer)

                            stopper = self.get_stopper(stemming_lang)
                            if stopper:
                                generator.set_stopper(stopper)

                        # Get a weight for the object
                        obj_weight = self._get_object_weight(obj)
                        # Index fields
                        self._do_index_fields(doc, generator, obj, obj_weight)

                        # Replace-by-UID makes the update idempotent: an
                        # existing document with the same UID is replaced.
                        database.replace_document(uid, doc)
                        if after_index:
                            after_index(obj)

                        commiter.commit_object()
                    except Exception:
                        # Roll back this object, then re-raise so the page
                        # handler can cancel the page transaction too.
                        commiter.cancel_object()
                        raise

                commiter.commit_page()
            except Exception:
                commiter.cancel_page()
                raise