def update_xapiandb(self, kwargs):
    database = xapian.WritableDatabase(XAPIAN_DB_PATH, xapian.DB_OPEN)
    DB = xapian.Database(XAPIAN_DB_PATH)
    enquire = xapian.Enquire(database)
    indexer = xapian.TermGenerator()
    if "" == kwargs["pkgname"]:
        modified_num = 0
        add_num = 0
        xapiandb_update = "No"
        query_xapiandb_version = xapian.Query("the_#ukxapiandb#_version")
        enquire.set_query(query_xapiandb_version)
        matches = enquire.get_mset(0, 1)
        for match in matches:
            docid_for_xapiandb_version = match.document.get_docid()
            doc_for_xapiandb_version = match.document
            doc_data = doc_for_xapiandb_version.get_data()
            if isinstance(doc_data, bytes):
                doc_data = doc_data.decode(encoding='utf-8')
            if "XAPIANDB_VERSION" == doc_data:
                # value slot 2 holds the xapiandb update time
                the_latest_update_time = doc_for_xapiandb_version.get_value(2)
                if isinstance(the_latest_update_time, bytes):
                    the_latest_update_time = the_latest_update_time.decode(
                        encoding='utf-8')
            else:
                the_latest_update_time = time.strftime('%Y-%m-%dT%H:%M:%S',
                                                       time.localtime())
                if Globals.DEBUG_SWITCH:
                    print("Failed to get the latest update time from client "
                          "xapiandb, using default time.localtime()")
        reslist = self.premoter.newerapp_for_xapianupdate(
            the_latest_update_time)
        for app in reslist:
            app_name = str(app["app_name"])
            display_name_cn = str(app["display_name_cn"])
            keywords_for_search = str(app["keywords_for_search"])
            query = xapian.Query(app_name)
            enquire.set_query(query)
            doccount = DB.get_doccount()
            matches = enquire.get_mset(0, doccount)
            if matches.size() != 0:
                for match in matches:
                    get_name = match.document.get_data()
                    if isinstance(get_name, bytes):
                        get_name = get_name.decode(encoding='utf-8')
                    if get_name == app_name:
                        docid = match.docid
                        doc = match.document
                        doc.clear_terms()
                        indexer.set_document(doc)
                        doc.add_term(app_name, 10)
                        if keywords_for_search != "None":
                            keywords = (display_name_cn + ";" +
                                        keywords_for_search + ";" + app_name)
                        else:
                            keywords = display_name_cn + ";" + app_name
                        indexer.index_text(keywords, 10)
                        try:
                            from mmseg.search import seg_txt_search, \
                                seg_txt_2_dict
                            for word, value in seg_txt_2_dict(
                                    keywords).items():
                                if word != "none":
                                    doc.add_term(word, 10)
                        except ImportError:
                            if Globals.DEBUG_SWITCH:
                                print("----No mmseg module----")
                        database.replace_document(docid, doc)
                        xapiandb_update = "Yes"
                        modified_num = modified_num + 1
                    else:
                        continue
            else:
                doc = xapian.Document()
                doc.set_data(app_name)
                doc.add_term(app_name, 10)
                indexer.set_document(doc)
                if keywords_for_search != "None":
                    keywords = (display_name_cn + ";" +
                                keywords_for_search + ";" + app_name)
                else:
                    keywords = display_name_cn + ";" + app_name
                indexer.index_text(keywords, 10)
                try:
                    for word, value in seg_txt_2_dict(keywords).items():
                        if word != "none":
                            doc.add_term(word, 10)
                except Exception:
                    pass
                database.add_document(doc)
                add_num = add_num + 1
                if Globals.DEBUG_SWITCH:
                    print("App:", doc.get_data(), " ", "terms:", end=' ')
                    for itr in doc.termlist():
                        print(itr.term, end=' ')
                xapiandb_update = "Yes"
                if Globals.DEBUG_SWITCH:
                    print(" ")
        try:
            if xapiandb_update == "Yes":
                now = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime())
                doc_for_xapiandb_version.add_value(2, now)
                database.replace_document(docid_for_xapiandb_version,
                                          doc_for_xapiandb_version)
                database.commit()
                if Globals.DEBUG_SWITCH:
                    print("Xapiandb has been updated: %d apps modified, "
                          "%d apps added. Total: %d apps updated"
                          % (modified_num, add_num, len(reslist)))
        except Exception:
            if Globals.DEBUG_SWITCH:
                print("The xapian database (~/.cache/uksc/xapiandb) is "
                      "corrupted, please remove it and install a new one!")
        if Globals.DEBUG_SWITCH:
            print("update uksc xapiandb over")
    else:
        appinfo_query = xapian.Query(kwargs["pkgname"])
        enquire.set_query(appinfo_query)
        matches = enquire.get_mset(0, DB.get_doccount())
        for match in matches:
            doc_for_appinfo = match.document
            doc_data = doc_for_appinfo.get_data()
            if kwargs["pkgname"] == doc_data:
                return
        doc = xapian.Document()
        doc.set_data(kwargs["pkgname"])
        doc.add_term(kwargs["pkgname"], 10)
        if Globals.DEBUG_SWITCH:
            print("debfile path:", kwargs["path"])
        deb = DebFile(kwargs["path"])
        terms = kwargs["pkgname"]
        try:
            terms = terms + " " + deb.description
        except Exception:
            if Globals.DEBUG_SWITCH:
                print("Failed to get app description")
        indexer.set_document(doc)
        indexer.index_text(terms)
        database.add_document(doc)
        database.commit()
        if Globals.DEBUG_SWITCH:
            print("update xapiandb over: ", kwargs["pkgname"],
                  "terms:", end=' ')
            for itr in doc.termlist():
                print(itr.term, end=' ')
            print(" ")
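
# A minimal read-only sketch (not part of the original module) showing how
# the database updated above might be queried to sanity-check the result.
# It assumes the same XAPIAN_DB_PATH constant used by update_xapiandb();
# the function name is illustrative.
def sketch_query_xapiandb(term, limit=10):
    db = xapian.Database(XAPIAN_DB_PATH)  # open read-only
    enquire = xapian.Enquire(db)
    enquire.set_query(xapian.Query(term))
    # return the stored data (the app name) of the top matches
    return [match.document.get_data() for match in enquire.get_mset(0, limit)]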
def make_doc_from_parser(parser, cache):
    # XXX 2012-01-19 michaeln I'm just pulling this code out from
    # index_app_info_from_parser, but it'd be great to further
    # refactor it - it looks quite scary :-)
    doc = xapian.Document()
    # app name is the data
    if parser.has_option_desktop("X-Ubuntu-Software-Center-Name"):
        name = parser.get_desktop("X-Ubuntu-Software-Center-Name")
        untranslated_name = parser.get_desktop(
            "X-Ubuntu-Software-Center-Name", translated=False)
    elif parser.has_option_desktop("X-GNOME-FullName"):
        name = parser.get_desktop("X-GNOME-FullName")
        untranslated_name = parser.get_desktop("X-GNOME-FullName",
                                               translated=False)
    else:
        name = parser.get_desktop("Name")
        untranslated_name = parser.get_desktop("Name", translated=False)
    doc.set_data(name)
    doc.add_value(XapianValues.APPNAME_UNTRANSLATED, untranslated_name)

    # check if we should ignore this file
    if parser.has_option_desktop("X-AppInstall-Ignore"):
        ignore = parser.get_desktop("X-AppInstall-Ignore")
        if ignore.strip().lower() == "true":
            LOG.debug("X-AppInstall-Ignore found for '%s'" % parser.desktopf)
            return

    # architecture
    pkgname_extension = ''
    if parser.has_option_desktop("X-AppInstall-Architectures"):
        arches = parser.get_desktop("X-AppInstall-Architectures")
        doc.add_value(XapianValues.ARCHIVE_ARCH, arches)
        native_archs = get_current_arch() in arches.split(',')
        foreign_archs = list(set(arches.split(',')) &
                             set(get_foreign_architectures()))
        if not (native_archs or foreign_archs):
            return
        if not native_archs and foreign_archs:
            pkgname_extension = ':' + foreign_archs[0]

    # package name
    pkgname = parser.get_desktop("X-AppInstall-Package") + pkgname_extension
    doc.add_term("AP" + pkgname)
    if '-' in pkgname:
        # we need this to work around xapian oddness
        doc.add_term(pkgname.replace('-', '_'))
    doc.add_value(XapianValues.PKGNAME, pkgname)
    doc.add_value(XapianValues.DESKTOP_FILE, parser.desktopf)

    # display name
    if "display_name" in axi_values:
        doc.add_value(axi_values["display_name"], name)

    # cataloged_times
    if "catalogedtime" in axi_values:
        if pkgname in cataloged_times:
            doc.add_value(axi_values["catalogedtime"],
                          xapian.sortable_serialise(cataloged_times[pkgname]))

    # pocket (main, restricted, ...)
    if parser.has_option_desktop("X-AppInstall-Section"):
        archive_section = parser.get_desktop("X-AppInstall-Section")
        doc.add_term("AS" + archive_section)
        doc.add_value(XapianValues.ARCHIVE_SECTION, archive_section)

    # section (mail, base, ..)
    if pkgname in cache and cache[pkgname].candidate:
        section = cache[pkgname].section
        doc.add_term("AE" + section)

    # channel (third party stuff)
    if parser.has_option_desktop("X-AppInstall-Channel"):
        archive_channel = parser.get_desktop("X-AppInstall-Channel")
        doc.add_term("AH" + archive_channel)
        doc.add_value(XapianValues.ARCHIVE_CHANNEL, archive_channel)

    # signing key (third party)
    if parser.has_option_desktop("X-AppInstall-Signing-Key-Id"):
        keyid = parser.get_desktop("X-AppInstall-Signing-Key-Id")
        doc.add_value(XapianValues.ARCHIVE_SIGNING_KEY_ID, keyid)

    # license (third party)
    if parser.has_option_desktop("X-AppInstall-License"):
        license = parser.get_desktop("X-AppInstall-License")
        doc.add_value(XapianValues.LICENSE, license)

    # date published
    if parser.has_option_desktop("X-AppInstall-Date-Published"):
        date_published = parser.get_desktop("X-AppInstall-Date-Published")
        if (date_published and
                re.match(r"\d+-\d+-\d+ \d+:\d+:\d+", date_published)):
            # strip the subseconds from the end of the published date string
            date_published = str(date_published).split(".")[0]
            doc.add_value(XapianValues.DATE_PUBLISHED, date_published)
            # we use the date published value for the cataloged time as well
            if "catalogedtime" in axi_values:
                LOG.debug("pkgname: %s, date_published cataloged time is: %s"
                          % (pkgname, date_published))
                date_published_sec = time.mktime(
                    time.strptime(date_published, "%Y-%m-%d %H:%M:%S"))
                doc.add_value(axi_values["catalogedtime"],
                              xapian.sortable_serialise(date_published_sec))

    # purchased date
    if parser.has_option_desktop("X-AppInstall-Purchased-Date"):
        date = parser.get_desktop("X-AppInstall-Purchased-Date")
        # strip the subseconds from the end of the date string
        doc.add_value(XapianValues.PURCHASED_DATE, str(date).split(".")[0])

    # deb-line (third party)
    if parser.has_option_desktop("X-AppInstall-Deb-Line"):
        debline = parser.get_desktop("X-AppInstall-Deb-Line")
        doc.add_value(XapianValues.ARCHIVE_DEB_LINE, debline)

    # license key (third party)
    if parser.has_option_desktop("X-AppInstall-License-Key"):
        key = parser.get_desktop("X-AppInstall-License-Key")
        doc.add_value(XapianValues.LICENSE_KEY, key)

    # license keypath (third party)
    if parser.has_option_desktop("X-AppInstall-License-Key-Path"):
        path = parser.get_desktop("X-AppInstall-License-Key-Path")
        doc.add_value(XapianValues.LICENSE_KEY_PATH, path)

    # PPA (third party stuff)
    if parser.has_option_desktop("X-AppInstall-PPA"):
        archive_ppa = parser.get_desktop("X-AppInstall-PPA")
        if archive_ppa:
            doc.add_value(XapianValues.ARCHIVE_PPA, archive_ppa)
            # add archive origin data here so that its available even if
            # the PPA is not (yet) enabled
            doc.add_term("XOO" + "lp-ppa-%s" % archive_ppa.replace("/", "-"))

    # screenshot (for third party)
    if parser.has_option_desktop("X-AppInstall-Screenshot-Url"):
        url = parser.get_desktop("X-AppInstall-Screenshot-Url")
        doc.add_value(XapianValues.SCREENSHOT_URLS, url)

    # thumbnail (for third party)
    if parser.has_option_desktop("X-AppInstall-Thumbnail-Url"):
        url = parser.get_desktop("X-AppInstall-Thumbnail-Url")
        doc.add_value(XapianValues.THUMBNAIL_URL, url)

    # video support (for third party mostly)
    if parser.has_option_desktop("X-AppInstall-Video-Url"):
        url = parser.get_desktop("X-AppInstall-Video-Url")
        doc.add_value(XapianValues.VIDEO_URL, url)

    # icon (for third party)
    if parser.has_option_desktop("X-AppInstall-Icon-Url"):
        url = parser.get_desktop("X-AppInstall-Icon-Url")
        doc.add_value(XapianValues.ICON_URL, url)
        if not parser.has_option_desktop("X-AppInstall-Icon"):
            # prefix pkgname to avoid name clashes
            doc.add_value(XapianValues.ICON,
                          "%s-icon-%s" % (pkgname, os.path.basename(url)))

    # price (pay stuff)
    if parser.has_option_desktop("X-AppInstall-Price"):
        price = parser.get_desktop("X-AppInstall-Price")
        doc.add_value(XapianValues.PRICE, price)
        # since this is a commercial app, indicate it in the component value
        doc.add_value(XapianValues.ARCHIVE_SECTION, "commercial")

    # support url (mainly pay stuff)
    if parser.has_option_desktop("X-AppInstall-Support-Url"):
        url = parser.get_desktop("X-AppInstall-Support-Url")
        doc.add_value(XapianValues.SUPPORT_SITE_URL, url)

    # icon
    if parser.has_option_desktop("Icon"):
        icon = parser.get_desktop("Icon")
        doc.add_value(XapianValues.ICON, icon)

    # write out categories
    for cat in parser.get_desktop_categories():
        doc.add_term("AC" + cat.lower())
    categories_string = ";".join(parser.get_desktop_categories())
    doc.add_value(XapianValues.CATEGORIES, categories_string)

    for mime in parser.get_desktop_mimetypes():
        doc.add_term("AM" + mime.lower())

    # get type (to distinguish between apps and packages)
    if parser.has_option_desktop("Type"):
        type = parser.get_desktop("Type")
        doc.add_term("AT" + type.lower())

    # check gettext domain
    if parser.has_option_desktop("X-Ubuntu-Gettext-Domain"):
        domain = parser.get_desktop("X-Ubuntu-Gettext-Domain")
        doc.add_value(XapianValues.GETTEXT_DOMAIN, domain)

    # Description (software-center extension)
    if parser.has_option_desktop("X-AppInstall-Description"):
        descr = parser.get_desktop("X-AppInstall-Description")
        doc.add_value(XapianValues.SC_DESCRIPTION, descr)

    if parser.has_option_desktop("Supported-Distros"):
        doc.add_value(XapianValues.SC_SUPPORTED_DISTROS,
                      json.dumps(parser.get_desktop("Supported-Distros")))

    # version support (for e.g. the scagent)
    if parser.has_option_desktop("X-AppInstall-Version"):
        ver = parser.get_desktop("X-AppInstall-Version")
        doc.add_value(XapianValues.VERSION_INFO, ver)

    # (deb)tags (in addition to the pkgname debtags)
    if parser.has_option_desktop("X-AppInstall-Tags"):
        # register tags
        tags_string = parser.get_desktop("X-AppInstall-Tags")
        if tags_string:
            tags = [tag.strip().lower() for tag in tags_string.split(",")]
            for tag in tags:
                doc.add_term("XT" + tag)
            region = get_region_cached()
            if region:
                # ENFORCE region blacklist/whitelist by not registering
                # the app at all
                countrycode = region["countrycode"].lower()
                if "%s%s" % (REGION_BLACKLIST_TAG, countrycode) in tags:
                    LOG.info("skipping region restricted app: '%s'"
                             " (blacklisted)" % name)
                    return
                # whitelist
                for tag in tags:
                    if (tag.startswith(REGION_WHITELIST_TAG) and
                            not "%s%s" % (REGION_WHITELIST_TAG,
                                          countrycode) in tag):
                        LOG.info("skipping region restricted app: '%s'"
                                 " (not whitelisted)" % name)
                        return

    # popcon
    # FIXME: popularity not only based on popcon but also
    #        on archive section, third party app etc
    if parser.has_option_desktop("X-AppInstall-Popcon"):
        popcon = float(parser.get_desktop("X-AppInstall-Popcon"))
        # sort_by_value uses string compare, so we need to pad here
        doc.add_value(XapianValues.POPCON,
                      xapian.sortable_serialise(popcon))
        global popcon_max
        popcon_max = max(popcon_max, popcon)

    # comment goes into the summary data if there is one,
    # otherwise we try GenericName and if nothing else,
    # the summary of the package
    if parser.has_option_desktop("Comment"):
        s = parser.get_desktop("Comment")
        doc.add_value(XapianValues.SUMMARY, s)
    elif parser.has_option_desktop("GenericName"):
        s = parser.get_desktop("GenericName")
        if s != name:
            doc.add_value(XapianValues.SUMMARY, s)
    elif pkgname in cache and cache[pkgname].candidate:
        s = cache[pkgname].candidate.summary
        doc.add_value(XapianValues.SUMMARY, s)

    return doc
def gen_search_index(db_session, namespace):
    log.info("Generating search index for namespace {0}".format(namespace.id))
    dbpath = db_path_for(namespace.id)
    mkdirp(dbpath)
    database = x_.WritableDatabase(dbpath, x_.DB_CREATE_OR_OPEN)
    indexer = x_.TermGenerator()
    stemmer = x_.Stem("english")
    indexer.set_stemmer(stemmer)
    indexer.set_database(database)
    indexer.set_flags(indexer.FLAG_SPELLING)

    last_docid = database.get_lastdocid()
    msg_query = db_session.query(Message).filter(
        Message.namespace_id == namespace.id,
        Message.id > last_docid).options(joinedload('parts')) \
        .order_by(Message.id.desc())
    log.info("Have {0} messages to process".format(msg_query.count()))

    # for each message part, create unprocessed documents with
    # date/subject/to/from metadata and the plaintext part, and then
    # process them!
    total = msg_query.count()
    done = 0
    for msg in msg_query.yield_per(1000):
        text = strip_tags(msg.sanitized_body)
        # XXX also index attachments (add a 'type' field or something to
        # differentiate)
        if text is not None:
            doc = x_.Document()
            doc.set_data(text)
            indexer.set_document(doc)
            # NOTE: the integer here is a multiplier on the term frequency
            # (used for calculating relevance). We add terms with and without
            # a field prefix, so documents are returned on a generic search
            # *and* when fields are specifically searched for, e.g.
            # to:[email protected]
            if msg.subject is not None:
                indexer.index_text(msg.subject, 10)
                indexer.index_text(msg.subject, 10, 'XSUBJECT')
            if msg.from_addr is not None:
                from_ = to_indexable(msg.from_addr)
                indexer.index_text(from_, 1)
                indexer.index_text(from_, 1, 'XFROM')
            if msg.to_addr is not None:
                to = ' '.join([to_indexable(parsed_addr)
                               for parsed_addr in msg.to_addr])
                indexer.index_text(to, 5)
                indexer.index_text(to, 5, 'XTO')
            if msg.cc_addr is not None:
                cc = ' '.join([to_indexable(parsed_addr)
                               for parsed_addr in msg.cc_addr])
                indexer.index_text(cc, 3)
                indexer.index_text(cc, 3, 'XCC')
            if msg.bcc_addr is not None:
                bcc = ' '.join([to_indexable(parsed_addr)
                                for parsed_addr in msg.bcc_addr])
                indexer.index_text(bcc, 3)
                indexer.index_text(bcc, 3, 'XBCC')
            # "Values" are other data that you can use for e.g. sorting by
            # date
            doc.add_value(0, x_.sortable_serialise(
                timegm(msg.internaldate.utctimetuple())))
            database.replace_document(msg.id, doc)
        done += 1
        # use float arithmetic so the percentage isn't truncated to 0
        log.info("Indexed %i of %i (%.2f%%)" % (done, total,
                                                done * 100.0 / total))

    # both sides must be sets for the set difference below to work
    indexed_msgs = {k for k in database.metadata_keys()}
    msgs = {id for id, in db_session.query(distinct(Message.id)).filter_by(
        id=namespace.id)}
    to_delete = indexed_msgs - msgs
    log.info("{0} documents to remove...".format(len(to_delete)))
    for msg_id in to_delete:
        database.delete_document(msg_id)

    database.close()
    log.info("done.")
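
# Hedged usage sketch for the index built by gen_search_index(): map
# user-facing fields to the prefixes used above (XSUBJECT, XFROM, XTO, ...)
# so that "subject:meeting" works alongside free-text search. db_path_for
# and the prefix names come from the function above; the function name and
# everything else here are illustrative assumptions.
def sketch_search_namespace(namespace_id, querystring, limit=10):
    db = x_.Database(db_path_for(namespace_id))
    qp = x_.QueryParser()
    qp.set_database(db)
    qp.set_stemmer(x_.Stem("english"))
    qp.set_stemming_strategy(x_.QueryParser.STEM_SOME)
    for field, prefix in (("subject", "XSUBJECT"), ("from", "XFROM"),
                          ("to", "XTO"), ("cc", "XCC"), ("bcc", "XBCC")):
        qp.add_prefix(field, prefix)
    enquire = x_.Enquire(db)
    enquire.set_query(qp.parse_query(querystring))
    # return the stored plaintext bodies of the top matches
    return [m.document.get_data() for m in enquire.get_mset(0, limit)]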
def setUp(self):
    pkgs_list = ["gimp", "eog", "inkscape"]
    self.decider = PkgMatchDecider(pkgs_list)
    self.doc = xapian.Document()
def test_all():
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print("Unhandled constants: ", res)
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError,
                     "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(query2,
                 "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected:
    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.Database, "nosuchdir/nosuchdb",
                     xapian.DB_BACKEND_STUB)
    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.WritableDatabase, "nosuchdir/nosuchdb",
                     xapian.DB_OPEN | xapian.DB_BACKEND_STUB)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "/bin/false", "")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "/bin/false", "")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:
    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)

    # Check Xapian::BAD_VALUENO is wrapped suitably.
    enq.set_collapse_key(xapian.BAD_VALUENO)

    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return not term.startswith('a')

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True,
           "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>",
                     qp.parse_query, "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(qp.parse_query("NOT test",
                                qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(qp.parse_query("foo ox", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM ox OR Zox@2))")
    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM outside OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')
    expect_query(qp.parse_query(u"NOT t\xe9st",
                                qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT Zt\xc3\xa9st@1)")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    term = next(doc.termlist()).term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test SimpleStopper initialised from a file.
    try:
        srcdir = os.environ['srcdir']
    except KeyError:
        srcdir = '.'
    stop = xapian.SimpleStopper(srcdir + '/../shortstop.list')
    expect(stop('a'), True)
    expect(stop('am'), False)
    expect(stop('an'), True)
    expect(stop('the'), True)

    expect_exception(xapian.InvalidArgumentError, None,
                     xapian.SimpleStopper, 'nosuchfile')

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()],
           [('bar', 1, [2]), ('baz', 1, [3]), ('foo', 2, [1, 4])])

    # Check DateRangeProcessor works
    context("checking that DateRangeProcessor works")
    qp = xapian.QueryParser()
    rpdate = xapian.DateRangeProcessor(1, xapian.RP_DATE_PREFER_MDY, 1960)
    qp.add_rangeprocessor(rpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam2', testfieldprocessor(), False)  # Old-style
    qp.add_boolean_prefix('boolspam3', testfieldprocessor(), '')
    qp.add_boolean_prefix('boolspam4', testfieldprocessor(), 'group')
    qp.add_boolean_prefix('boolspam5', testfieldprocessor(), None)
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug fixed in 1.4.4:
    # https://bugs.debian.org/849722
    oqparser.add_boolean_prefix('tag', 'K', '')
    # Make sure other cases also work:
    oqparser.add_boolean_prefix('zag', 'XR', False)  # Old-style
    oqparser.add_boolean_prefix('rag', 'XR', None)
    oqparser.add_boolean_prefix('nag', 'XB', '')
    oqparser.add_boolean_prefix('bag', 'XB', 'blergh')
    oqparser.add_boolean_prefix('gag', 'XB', u'blergh')
    oqparser.add_boolean_prefix('jag', 'XB', b'blergh')

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.set_metadata, '', 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(xapian.Query(xapian.Query.OP_SCALE_WEIGHT,
                              xapian.Query('foo'), 5),
                 "5 * foo")
def _create_document(self, package, old_doc=None):
    doc = xapian.Document()
    self.indexer.set_document(doc)

    filtered_name = filter_search_string(package['name'])
    filtered_summary = filter_search_string(package['summary'])
    filtered_description = filter_search_string(package['description'])
    filtered_owner = filter_search_string(package['devel_owner'])

    self.indexer.index_text_without_positions(
        'EX__' + filtered_name + '__EX', 10, '')
    self.indexer.index_text_without_positions(
        'EX__' + filtered_owner + '__EX', 10, '')

    name_parts = filtered_name.split('_')
    for i in range(20):
        if len(name_parts) > 1:
            for part in name_parts:
                self.indexer.index_text_without_positions(part)
        self.indexer.index_text_without_positions(filtered_name, 10, '')

    for i in range(4):
        self.indexer.index_text_without_positions(filtered_summary)

    self.indexer.index_text_without_positions(filtered_description)

    self.index_files_of_interest(doc, package)

    for sub_package in package['sub_pkgs']:
        filtered_sub_package_name = filter_search_string(sub_package['name'])
        log.info("  indexing subpackage %s" % sub_package['name'])

        self.indexer.index_text_without_positions(filtered_sub_package_name)
        self.indexer.index_text_without_positions(
            'EX__' + filtered_sub_package_name + '__EX', 10, '')

        self.index_files_of_interest(doc, sub_package)

        # Set special sub-package icon if appstream has one
        sub_package['icon'] = self.icon_cache.get(sub_package['name'],
                                                  self.default_icon)

        # If the parent has a dull icon, give it ours!
        if sub_package['icon'] != self.default_icon \
                and package['icon'] == self.default_icon:
            package['icon'] = sub_package['icon']

        # remove anything we don't want to store
        del sub_package['package']

    # @@: Right now we're only indexing the first part of the
    # provides/requires, and not boolean comparison or version
    # for requires in package.requires:
    #     print requires[0]
    #     doc.fields.append(xappy.Field('requires', requires[0]))
    # for provides in package.provides:
    #     doc.fields.append(xappy.Field('provides', provides[0]))

    # remove anything we don't want to store and then store data in
    # json format
    del package['package']
    doc.set_data(json.dumps(package))

    # It seems that xapian db.replace_document still creates a new
    # document. In order to avoid duplicating the document we are
    # using add_document and then delete the old document.
    self.db.add_document(doc)
    if old_doc is not None:
        self.db.delete_document(old_doc.get_docid())
    self.db.commit()
def update(self, documents=None, after_index=None, per_page=10000,
           commit_each=False):
    """
    Update the database with the documents.
    There are some default values and terms in a document:
     * Values:
       1. Used to store the ID of the document
       2. Store the model of the object (in the string format, like
          "project.app.model")
       3. Store the indexer descriptor (module path)
       4..10. Free

     * Terms
       UID: Used to store the ID of the document, so we can replace
            the document by the ID
    """
    # Open Xapian Database
    database = self._db.open(write=True)

    # If we don't get an explicit document list, update everything
    if documents is None:
        update_queue = self._model.objects.all()
    else:
        update_queue = documents

    commiter = Commiter.create(commit_each)(
        # lambda: database.begin_transaction(flush=True),
        database.begin_transaction,
        database.commit_transaction,
        database.cancel_transaction)

    # Get each document received
    for page in paginate(update_queue, per_page):
        try:
            commiter.begin_page()

            for obj in page.object_list:
                commiter.begin_object()

                try:
                    if not self.trigger(obj):
                        self.delete(obj.pk, database)
                        continue

                    doc = xapian.Document()

                    # Add default terms and values
                    uid = self._create_uid(obj)
                    doc.add_term(self._create_uid(obj))
                    self._insert_meta_values(doc, obj)

                    generator = xapian.TermGenerator()
                    generator.set_database(database)
                    generator.set_document(doc)
                    generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)

                    stemming_lang = self._get_stem_language(obj)
                    if stemming_lang:
                        stemmer = self.get_stemmer(stemming_lang)
                        generator.set_stemmer(stemmer)

                        stopper = self.get_stopper(stemming_lang)
                        if stopper:
                            generator.set_stopper(stopper)

                    # Get a weight for the object
                    obj_weight = self._get_object_weight(obj)
                    # Index fields
                    self._do_index_fields(doc, generator, obj, obj_weight)

                    database.replace_document(uid, doc)
                    if after_index:
                        after_index(obj)

                    commiter.commit_object()
                except Exception:
                    commiter.cancel_object()
                    raise

            commiter.commit_page()
        except Exception:
            commiter.cancel_page()
            raise

    database.flush()
def put_data(self, key, data):
    try:
        data['_date'] = first_of(data, 'meta.date', 'image.created',
                                 'annex.added')
        if isinstance(data['_date'], (list, tuple)):
            data['_date'] = data['_date'][0]
    except KeyError:
        data['_date'] = ''
    logger.debug("Sort key: %r", data['_date'])
    sortvalue = encode_sortable_date(data['_date'])

    doc = xapian.Document()
    self.term_generator.set_document(doc)

    git = data.get('git', {})
    if git.get('branch'):
        # add the sort date
        d = term_date(data['_date'])
        doc.add_term('D' + d, 0)
        doc.add_term('Y' + d[:4], 0)
        doc.add_term(d[:4], 0)
        for branch, p in git.get('branch', {}).items():
            folder, filename = os.path.split(p)
            name, _ = os.path.splitext(filename)
            self.term_generator.index_text(name, 0, 'F')
            self.term_generator.index_text(name)
            self.term_generator.increase_termpos()
            for t in folder.split(os.sep):
                if t:
                    doc.add_term("P" + t.lower(), 0)

    for section in data:
        if section[0] == '_':
            continue
        if data[section] is None:
            continue
        for field, values in data[section].items():
            prefix = None
            # handle arrays and straight values
            if isinstance(values, (dict, )):
                values = list(values)
            if not isinstance(values, (list, tuple)):
                values = [values]
            # handle prefixed unstemmed boolean terms
            if field in terms.PREFIXED_UNSTEMMED_BOOLEAN_TERMS:
                field = terms.PREFIXED_UNSTEMMED_BOOLEAN_TERMS[field]
                for value in values:
                    doc.add_term(field + value.lower(), 0)
                    # some terms should be added to the full text index
                    if field in terms.BOOLEAN_UNPREFIXED_STEMMED:
                        self.term_generator.index_text(value)
                        self.term_generator.increase_termpos()
                continue
            # handle prefixed unstemmed terms
            if field in terms.PREFIXED_UNSTEMMED_TERMS:
                field = terms.PREFIXED_UNSTEMMED_TERMS[field]
                for value in values:
                    if field[0] == 'D':
                        value = term_date(value)
                    doc.add_term(field + value.lower(), 0)
                continue
            # handle free terms
            if field in terms.STEMMED_TERMS:
                for value in values:
                    self.term_generator.index_text(
                        value, 1, terms.STEMMED_TERMS[field])
                    self.term_generator.index_text(value)
                    self.term_generator.increase_termpos()
                doc.add_term('XSok')
            else:
                doc.add_term('XSdropped')

    doc.set_data(json.dumps(data))
    doc.add_value(0, key)
    doc.add_value(1, sortvalue)

    idterm = "QK{0}".format(key)
    doc.add_boolean_term(idterm)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("Data: %r", data)
        logger.debug("Terms: %r", [x.term for x in doc.termlist()])
    self.db.replace_document(idterm, doc)
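
# Hedged companion sketch (not part of the original class): fetch a record
# back via the "QK<key>" boolean id term that put_data() registers. It
# assumes the same self.db handle; the method name is illustrative.
def sketch_get_data(self, key):
    idterm = "QK{0}".format(key)
    # postlist(term) yields the docids carrying the boolean id term
    for item in self.db.postlist(idterm):
        doc = self.db.get_document(item.docid)
        return json.loads(doc.get_data())
    return None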
def index_app_info_from_parser(parser, db, cache):
    term_generator = xapian.TermGenerator()
    term_generator.set_database(db)
    try:
        # this tests if we have spelling suggestions (there must be
        # a better way?!?) - this is needed as inmemory does not have
        # spelling corrections, but it allows setting the flag and will
        # raise a exception much later
        db.add_spelling("test")
        db.remove_spelling("test")
        # this enables the flag for it (we only reach this line if
        # the db supports spelling suggestions)
        term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
    except xapian.UnimplementedError:
        pass
    doc = xapian.Document()
    term_generator.set_document(doc)

    # app name is the data
    if parser.has_option_desktop("X-Ubuntu-Software-Center-Name"):
        name = parser.get_desktop("X-Ubuntu-Software-Center-Name")
        untranslated_name = parser.get_desktop(
            "X-Ubuntu-Software-Center-Name", translated=False)
    elif parser.has_option_desktop("X-GNOME-FullName"):
        name = parser.get_desktop("X-GNOME-FullName")
        untranslated_name = parser.get_desktop("X-GNOME-FullName",
                                               translated=False)
    else:
        name = parser.get_desktop("Name")
        untranslated_name = parser.get_desktop("Name", translated=False)
    if name in seen:
        LOG.debug("duplicated name '%s' (%s)" % (name, parser.desktopf))
    LOG.debug("indexing app '%s'" % name)
    seen.add(name)
    doc.set_data(name)
    index_name(doc, name, term_generator)
    doc.add_value(XapianValues.APPNAME_UNTRANSLATED, untranslated_name)

    # check if we should ignore this file
    if parser.has_option_desktop("X-AppInstall-Ignore"):
        ignore = parser.get_desktop("X-AppInstall-Ignore")
        if ignore.strip().lower() == "true":
            LOG.debug("X-AppInstall-Ignore found for '%s'" % parser.desktopf)
            return

    # architecture
    pkgname_extension = ''
    if parser.has_option_desktop("X-AppInstall-Architectures"):
        arches = parser.get_desktop("X-AppInstall-Architectures")
        doc.add_value(XapianValues.ARCHIVE_ARCH, arches)
        native_archs = get_current_arch() in arches.split(',')
        foreign_archs = list(
            set(arches.split(',')) & set(get_foreign_architectures()))
        if not (native_archs or foreign_archs):
            return
        if not native_archs and foreign_archs:
            pkgname_extension = ':' + foreign_archs[0]

    # package name
    pkgname = parser.get_desktop("X-AppInstall-Package") + pkgname_extension
    doc.add_term("AP" + pkgname)
    if '-' in pkgname:
        # we need this to work around xapian oddness
        doc.add_term(pkgname.replace('-', '_'))
    doc.add_value(XapianValues.PKGNAME, pkgname)
    doc.add_value(XapianValues.DESKTOP_FILE, parser.desktopf)

    # display name
    if "display_name" in axi_values:
        doc.add_value(axi_values["display_name"], name)

    # cataloged_times
    if "catalogedtime" in axi_values:
        if pkgname in cataloged_times:
            doc.add_value(axi_values["catalogedtime"],
                          xapian.sortable_serialise(cataloged_times[pkgname]))
        else:
            # also catalog apps not found in axi (e.g. for-purchase apps)
            doc.add_value(axi_values["catalogedtime"],
                          xapian.sortable_serialise(time.time()))

    # pocket (main, restricted, ...)
    if parser.has_option_desktop("X-AppInstall-Section"):
        archive_section = parser.get_desktop("X-AppInstall-Section")
        doc.add_term("AS" + archive_section)
        doc.add_value(XapianValues.ARCHIVE_SECTION, archive_section)

    # section (mail, base, ..)
    if pkgname in cache and cache[pkgname].candidate:
        section = cache[pkgname].section
        doc.add_term("AE" + section)

    # channel (third party stuff)
    if parser.has_option_desktop("X-AppInstall-Channel"):
        archive_channel = parser.get_desktop("X-AppInstall-Channel")
        doc.add_term("AH" + archive_channel)
        doc.add_value(XapianValues.ARCHIVE_CHANNEL, archive_channel)

    # signing key (third party)
    if parser.has_option_desktop("X-AppInstall-Signing-Key-Id"):
        keyid = parser.get_desktop("X-AppInstall-Signing-Key-Id")
        doc.add_value(XapianValues.ARCHIVE_SIGNING_KEY_ID, keyid)

    # license (third party)
    if parser.has_option_desktop("X-AppInstall-License"):
        license = parser.get_desktop("X-AppInstall-License")
        doc.add_value(XapianValues.LICENSE, license)

    # purchased date
    if parser.has_option_desktop("X-AppInstall-Purchased-Date"):
        date = parser.get_desktop("X-AppInstall-Purchased-Date")
        # strip the subseconds from the end of the date string
        doc.add_value(XapianValues.PURCHASED_DATE, str(date).split(".")[0])

    # deb-line (third party)
    if parser.has_option_desktop("X-AppInstall-Deb-Line"):
        debline = parser.get_desktop("X-AppInstall-Deb-Line")
        doc.add_value(XapianValues.ARCHIVE_DEB_LINE, debline)

    # license key (third party)
    if parser.has_option_desktop("X-AppInstall-License-Key"):
        key = parser.get_desktop("X-AppInstall-License-Key")
        doc.add_value(XapianValues.LICENSE_KEY, key)

    # license keypath (third party)
    if parser.has_option_desktop("X-AppInstall-License-Key-Path"):
        path = parser.get_desktop("X-AppInstall-License-Key-Path")
        doc.add_value(XapianValues.LICENSE_KEY_PATH, path)

    # PPA (third party stuff)
    if parser.has_option_desktop("X-AppInstall-PPA"):
        archive_ppa = parser.get_desktop("X-AppInstall-PPA")
        doc.add_value(XapianValues.ARCHIVE_PPA, archive_ppa)
        # add archive origin data here so that its available even if
        # the PPA is not (yet) enabled
        doc.add_term("XOO" + "lp-ppa-%s" % archive_ppa.replace("/", "-"))

    # screenshot (for third party)
    if parser.has_option_desktop("X-AppInstall-Screenshot-Url"):
        url = parser.get_desktop("X-AppInstall-Screenshot-Url")
        doc.add_value(XapianValues.SCREENSHOT_URL, url)

    # thumbnail (for third party)
    if parser.has_option_desktop("X-AppInstall-Thumbnail-Url"):
        url = parser.get_desktop("X-AppInstall-Thumbnail-Url")
        doc.add_value(XapianValues.THUMBNAIL_URL, url)

    # video support (for third party mostly)
    if parser.has_option_desktop("X-AppInstall-Video-Url"):
        url = parser.get_desktop("X-AppInstall-Video-Url")
        doc.add_value(XapianValues.VIDEO_URL, url)

    # icon (for third party)
    if parser.has_option_desktop("X-AppInstall-Icon-Url"):
        url = parser.get_desktop("X-AppInstall-Icon-Url")
        doc.add_value(XapianValues.ICON_URL, url)
        if not parser.has_option_desktop("X-AppInstall-Icon"):
            doc.add_value(XapianValues.ICON, os.path.basename(url))

    # price (pay stuff)
    if parser.has_option_desktop("X-AppInstall-Price"):
        price = parser.get_desktop("X-AppInstall-Price")
        doc.add_value(XapianValues.PRICE, price)
        # since this is a commercial app, indicate it in the component value
        doc.add_value(XapianValues.ARCHIVE_SECTION, "commercial")

    # icon
    if parser.has_option_desktop("Icon"):
        icon = parser.get_desktop("Icon")
        doc.add_value(XapianValues.ICON, icon)

    # write out categories
    for cat in parser.get_desktop_categories():
        doc.add_term("AC" + cat.lower())
    categories_string = ";".join(parser.get_desktop_categories())
    doc.add_value(XapianValues.CATEGORIES, categories_string)

    for mime in parser.get_desktop_mimetypes():
        doc.add_term("AM" + mime.lower())

    # get type (to distinguish between apps and packages)
    if parser.has_option_desktop("Type"):
        type = parser.get_desktop("Type")
        doc.add_term("AT" + type.lower())

    # check gettext domain
    if parser.has_option_desktop("X-Ubuntu-Gettext-Domain"):
        domain = parser.get_desktop("X-Ubuntu-Gettext-Domain")
        doc.add_value(XapianValues.GETTEXT_DOMAIN, domain)

    # Description (software-center extension)
    if parser.has_option_desktop("X-AppInstall-Description"):
        descr = parser.get_desktop("X-AppInstall-Description")
        doc.add_value(XapianValues.SC_DESCRIPTION, descr)

    # popcon
    # FIXME: popularity not only based on popcon but also
    #        on archive section, third party app etc
    if parser.has_option_desktop("X-AppInstall-Popcon"):
        popcon = float(parser.get_desktop("X-AppInstall-Popcon"))
        # sort_by_value uses string compare, so we need to pad here
        doc.add_value(XapianValues.POPCON,
                      xapian.sortable_serialise(popcon))
        global popcon_max
        popcon_max = max(popcon_max, popcon)

    # comment goes into the summary data if there is one,
    # otherwise we try GenericName and if nothing else,
    # the summary of the package
    if parser.has_option_desktop("Comment"):
        s = parser.get_desktop("Comment")
        doc.add_value(XapianValues.SUMMARY, s)
    elif parser.has_option_desktop("GenericName"):
        s = parser.get_desktop("GenericName")
        if s != name:
            doc.add_value(XapianValues.SUMMARY, s)
    elif pkgname in cache and cache[pkgname].candidate:
        s = cache[pkgname].candidate.summary
        doc.add_value(XapianValues.SUMMARY, s)

    # add packagename as meta-data too
    term_generator.index_text_without_positions(pkgname, WEIGHT_APT_PKGNAME)

    # now add search data from the desktop file
    for key in ["GenericName", "Comment", "X-AppInstall-Description"]:
        if not parser.has_option_desktop(key):
            continue
        s = parser.get_desktop(key)
        # we need the ascii_upper here for e.g. turkish locales, see
        # bug #581207
        k = "WEIGHT_DESKTOP_" + ascii_upper(key.replace(" ", ""))
        if k in globals():
            w = globals()[k]
        else:
            LOG.debug("WEIGHT %s not found" % k)
            w = 1
        term_generator.index_text_without_positions(s, w)

    # add data from the apt cache
    if pkgname in cache and cache[pkgname].candidate:
        s = cache[pkgname].candidate.summary
        term_generator.index_text_without_positions(s, WEIGHT_APT_SUMMARY)
        s = cache[pkgname].candidate.description
        term_generator.index_text_without_positions(s,
                                                    WEIGHT_APT_DESCRIPTION)
        for origin in cache[pkgname].candidate.origins:
            doc.add_term("XOA" + origin.archive)
            doc.add_term("XOC" + origin.component)
            doc.add_term("XOL" + origin.label)
            doc.add_term("XOO" + origin.origin)
            doc.add_term("XOS" + origin.site)

    # add our keywords (with high priority)
    if parser.has_option_desktop("X-AppInstall-Keywords"):
        keywords = parser.get_desktop("X-AppInstall-Keywords")
        for s in keywords.split(";"):
            if s:
                term_generator.index_text_without_positions(
                    s, WEIGHT_DESKTOP_KEYWORD)

    # now add it
    db.add_document(doc)
def make_doc(self, cache):
    """Build a Xapian document from the desktop info."""
    doc = xapian.Document()
    # app name is the data
    name = self._set_doc_from_key(doc, AppInfoFields.NAME)
    assert name is not None
    doc.set_data(name)
    self._set_doc_from_key(doc, AppInfoFields.NAME_UNTRANSLATED,
                           translated=False)

    # check if we should ignore this file
    if self.is_ignored:
        LOG.debug("%r.make_doc: %r is ignored.",
                  self.__class__.__name__, self.desktopf)
        return

    # architecture
    pkgname_extension = ''
    arches = self._set_doc_from_key(doc, AppInfoFields.ARCH)
    if arches:
        native_archs = get_current_arch() in arches.split(',')
        foreign_archs = list(
            set(arches.split(',')) & set(get_foreign_architectures()))
        if not (native_archs or foreign_archs):
            return
        if not native_archs and foreign_archs:
            pkgname_extension = ':' + foreign_archs[0]

    # package name
    pkgname = self._set_doc_from_key(doc, AppInfoFields.PACKAGE,
                                     pkgname_extension=pkgname_extension)
    doc.add_value(XapianValues.DESKTOP_FILE, self.desktopf)

    # display name
    display_name = axi_values.get("display_name")
    if display_name is not None:
        doc.add_value(display_name, name)

    # cataloged_times
    catalogedtime = axi_values.get("catalogedtime")
    if catalogedtime is not None and pkgname in cataloged_times:
        doc.add_value(catalogedtime,
                      xapian.sortable_serialise(cataloged_times[pkgname]))

    # section (mail, base, ..)
    if pkgname in cache and cache[pkgname].candidate:
        section = cache[pkgname].section
        doc.add_term("AE" + section)

    fields = (
        AppInfoFields.CHANNEL,            # channel (third party stuff)
        AppInfoFields.DEB_LINE,           # deb-line (third party)
        AppInfoFields.DESCRIPTION,        # description (sc extension)
        AppInfoFields.GETTEXT_DOMAIN,     # check gettext domain
        AppInfoFields.ICON,               # icon
        AppInfoFields.LICENSE,            # license (third party)
        AppInfoFields.LICENSE_KEY,        # license key (third party)
        AppInfoFields.LICENSE_KEY_PATH,   # license keypath (third party)
        AppInfoFields.PPA,                # PPA (third party stuff)
        AppInfoFields.PURCHASED_DATE,     # purchased date
        AppInfoFields.SCREENSHOT_URLS,    # screenshot (for third party)
        AppInfoFields.SECTION,            # pocket (main, restricted, ...)
        AppInfoFields.SIGNING_KEY_ID,     # signing key (third party)
        AppInfoFields.SUPPORT_URL,        # support url (mainly pay stuff)
        AppInfoFields.SUPPORTED_DISTROS,  # supported distros
        AppInfoFields.THUMBNAIL_URL,      # thumbnail (for third party)
        AppInfoFields.VERSION,            # version support (for e.g. the
                                          # scagent)
        AppInfoFields.VIDEO_URL,          # video support (third party mostly)
        AppInfoFields.WEBSITE,            # homepage url (developer website)
    )
    for field in fields:
        self._set_doc_from_key(doc, field)

    # date published
    date_published_str = self._set_doc_from_key(
        doc, AppInfoFields.DATE_PUBLISHED)
    # we use the date published value for the cataloged time as well
    if date_published_str is not None:
        LOG.debug("pkgname: %s, date_published cataloged time is: %s",
                  pkgname, date_published_str)
        date_published = time.mktime(
            time.strptime(date_published_str, "%Y-%m-%d %H:%M:%S"))
        # a value for our own DB
        doc.add_value(XapianValues.DB_CATALOGED_TIME,
                      xapian.sortable_serialise(date_published))
        if "catalogedtime" in axi_values:
            # compat with a-x-i
            doc.add_value(axi_values["catalogedtime"],
                          xapian.sortable_serialise(date_published))

    # icon (for third party)
    url = self._set_doc_from_key(doc, AppInfoFields.ICON_URL)
    if url and self.get_value(AppInfoFields.ICON) is None:
        # prefix pkgname to avoid name clashes
        doc.add_value(XapianValues.ICON,
                      "%s-icon-%s" % (pkgname, os.path.basename(url)))

    # price (pay stuff)
    price = self._set_doc_from_key(doc, AppInfoFields.PRICE)
    if price:
        # this is a commercial app, indicate it in the component value
        doc.add_value(XapianValues.ARCHIVE_SECTION, "commercial")
        # this is hard-coded to US dollar for now, but if the server
        # ever changes we can update
        doc.add_value(XapianValues.CURRENCY, "US$")

    # add download size as string (it's sent as int)
    download_size = self.get_value(AppInfoFields.DOWNLOAD_SIZE)
    if download_size is not None:
        doc.add_value(XapianValues.DOWNLOAD_SIZE,
                      xapian.sortable_serialise(download_size))

    # write out categories
    for cat in self.get_categories():
        doc.add_term("AC" + cat.lower())
    categories_string = ";".join(self.get_categories())
    doc.add_value(XapianValues.CATEGORIES, categories_string)

    # mimetypes
    for mime in self.get_mimetypes():
        doc.add_term("AM" + mime.lower())

    # get type (to distinguish between apps and packages)
    app_type = self.get_value(AppInfoFields.TYPE)
    if app_type:
        doc.add_term("AT" + app_type.lower())

    # (deb)tags (in addition to the pkgname debtags)
    tags_string = self.get_value(AppInfoFields.TAGS)
    if tags_string:
        # convert to list and register
        tags = [tag.strip().lower() for tag in tags_string.split(",")]
        for tag in tags:
            doc.add_term("XT" + tag)
        # ENFORCE region blacklist/whitelist by not registering
        # the app at all
        region = get_region_cached()
        if region:
            countrycode = region["countrycode"].lower()
            blacklist = [t.replace(REGION_BLACKLIST_TAG, "") for t in tags
                         if t.startswith(REGION_BLACKLIST_TAG)]
            whitelist = [t.replace(REGION_WHITELIST_TAG, "") for t in tags
                         if t.startswith(REGION_WHITELIST_TAG)]

            if countrycode in blacklist:
                if countrycode in whitelist:
                    LOG.debug("%r.make_doc: %r black AND whitelisted for "
                              "region %r. Treating as blacklisted.",
                              self.__class__.__name__, name, countrycode)
                LOG.debug("%r.make_doc: skipping region restricted app %r "
                          "(blacklisted)", self.__class__.__name__, name)
                return

            if len(whitelist) > 0 and countrycode not in whitelist:
                LOG.debug("%r.make_doc: skipping region restricted "
                          "app %r (region not whitelisted)",
                          self.__class__.__name__, name)
                return

    # popcon
    # FIXME: popularity not only based on popcon but also
    #        on archive section, third party app etc
    popcon = self._set_doc_from_key(doc, AppInfoFields.POPCON)
    if popcon is not None:
        global popcon_max
        popcon_max = max(popcon_max, popcon)

    # comment goes into the summary data if there is one,
    # otherwise we try GenericName and if nothing else,
    # the summary of the candidate package
    summary = self._set_doc_from_key(doc, AppInfoFields.SUMMARY, name=name)
    if summary is None and pkgname in cache and cache[pkgname].candidate:
        summary = cache[pkgname].candidate.summary
        doc.add_value(XapianValues.SUMMARY, summary)

    return doc
def __init__(self, path, popcon_dir, axi_path, tags_filter):
    """
    Set initial attributes.
    """
    self.axi = xapian.Database(axi_path)
    self.path = os.path.expanduser(path)
    self.popcon_dir = os.path.expanduser(popcon_dir)
    self.valid_pkgs = axi_get_pkgs(self.axi)
    logging.debug("Considering %d valid packages" % len(self.valid_pkgs))
    with open(tags_filter) as valid_tags:
        self.valid_tags = [line.strip() for line in valid_tags
                           if not line.startswith("#")]
    logging.debug("Considering %d valid tags" % len(self.valid_tags))
    if not os.path.exists(self.popcon_dir):
        os.makedirs(self.popcon_dir)
    if not os.listdir(self.popcon_dir):
        logging.critical("Popcon dir seems to be empty.")
        raise Error

    # set up directory
    shutil.rmtree(self.path, ignore_errors=True)
    os.makedirs(self.path)

    try:
        logging.info("Indexing popcon submissions from '%s'" %
                     self.popcon_dir)
        logging.info("Creating new xapian index at '%s'" % self.path)
        xapian.WritableDatabase.__init__(self, self.path,
                                         xapian.DB_CREATE_OR_OVERWRITE)
    except xapian.DatabaseError as e:
        logging.critical("Could not create popcon xapian index.")
        logging.critical(str(e))
        raise Error

    # build new index
    doc_count = 0
    for root, dirs, files in os.walk(self.popcon_dir):
        for popcon_file in files:
            submission = PopconSubmission(os.path.join(root, popcon_file))
            doc = xapian.Document()
            submission_pkgs = submission.get_filtered(self.valid_pkgs)
            if len(submission_pkgs) < 10:
                logging.debug("Low profile popcon submission '%s' (%d)" %
                              (submission.user_id, len(submission_pkgs)))
            else:
                doc.set_data(submission.user_id)
                doc.add_term("ID" + submission.user_id)
                doc.add_term("ARCH" + submission.arch)
                logging.debug("Parsing popcon submission '%s'" %
                              submission.user_id)
                for pkg, freq in submission_pkgs.items():
                    tags = axi_search_pkg_tags(self.axi, pkg)
                    # if the package was found in axi
                    if tags:
                        doc.add_term("XP" + pkg, freq)
                        # if the package has tags associated with it
                        if not tags == "notags":
                            for tag in tags:
                                if tag.lstrip("XT") in self.valid_tags:
                                    doc.add_term(tag, freq)
                doc_id = self.add_document(doc)
                doc_count += 1
                logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
    # python garbage collector
    gc.collect()
    # flush to disk database changes
    try:
        self.commit()
    except AttributeError:
        # commit() is missing on old lib versions; fall back to the
        # deprecated flush() for compatibility
        self.flush()
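
# A small usage sketch (an assumption, not from the original project):
# count how many indexed popcon submissions install a given package, via
# the "XP" prefix the indexer above attaches to package terms. `index` is
# the built index, opened as a xapian.Database (or the class above, which
# subclasses xapian.WritableDatabase).
def sketch_popcon_installs(index, pkg):
    # get_termfreq() returns how many documents (submissions) contain a term
    return index.get_termfreq("XP" + pkg)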
def index(self, document, commit=False):
    database = self.database

    (document_id, document_values, document_terms, document_texts,
     document_data, default_language, default_spelling,
     default_positions) = document

    document = xapian.Document()

    if document_data:
        document.set_data(document_data)

    for name, value in (document_values or {}).items():
        name = name.strip()
        slot = get_slot(name)
        if slot:
            value = serialise_value(value)[0]
            if value:
                document.add_value(slot, value)
        else:
            self.log.warning("Ignored document value name (%r)", name)

    if isinstance(document_id, basestring):
        document.add_value(get_slot('ID'), document_id)
        document_id = prefixed(document_id, DOCUMENT_ID_TERM_PREFIX)
        # Make sure document_id is also a term (otherwise it doesn't
        # replace an existing document)
        document.add_boolean_term(document_id)

    for terms in document_terms or ():
        if isinstance(terms, (tuple, list)):
            terms, weight, prefix, position = (list(terms) + [None] * 4)[:4]
        else:
            weight = prefix = position = None
        if not terms:
            continue

        weight = 1 if weight is None else weight
        prefix = '' if prefix is None else prefix

        for term, field_name, terms in find_terms(terms, None):
            if field_name:
                boolean = not field_name.islower()
                term_prefix = get_prefix(field_name,
                                         DOCUMENT_CUSTOM_TERM_PREFIX)
            else:
                boolean = not prefix.islower()
                term_prefix = prefix
            if boolean:
                term = terms
            for term in serialise_value(term):
                if term:
                    if not boolean:
                        term = term.lower()
                    if position is None:
                        document.add_term(prefixed(term, term_prefix),
                                          weight)
                    else:
                        document.add_posting(prefixed(term, term_prefix),
                                             position, weight)
            if boolean:
                break

    for text in document_texts or ():
        if isinstance(text, (tuple, list)):
            text, weight, prefix, language, spelling, positions = \
                (list(text) + [None] * 6)[:6]
        else:
            weight = prefix = language = spelling = positions = None
        if not text:
            continue

        weight = 1 if weight is None else weight
        prefix = '' if prefix is None else prefix
        language = default_language if language is None else language
        positions = default_positions if positions is None else positions
        spelling = default_spelling if spelling is None else spelling

        term_generator = xapian.TermGenerator()
        term_generator.set_document(document)
        if spelling:
            term_generator.set_database(database)
            term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
        if language:
            term_generator.set_stemmer(xapian.Stem(language))
        if positions:
            index_text = term_generator.index_text
        else:
            index_text = term_generator.index_text_without_positions
        index_text(normalize(text), weight, prefix.upper())

    return self.replace(document_id, document, commit=commit)
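
# Hedged example of the 8-tuple layout index() unpacks above. The field
# names and payload are made up for illustration; only the tuple positions
# match the code, and whether a value or prefix is accepted depends on the
# schema behind get_slot()/get_prefix().
example_document = (
    "doc-1",                                  # document_id
    {"title": "some searchable text"},        # document_values
    [("tag", 1, "XT", None)],                 # document_terms
    [("some searchable text", 1, "", "english", True, True)],
                                              # document_texts
    '{"title": "some searchable text"}',      # document_data (payload)
    "english",                                # default_language
    True,                                     # default_spelling
    True,                                     # default_positions
)
# indexer.index(example_document, commit=True)  # `indexer` is hypothetical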
def test_value_mods():
    """Test handling of modifications to values."""
    dbpath = 'db_test_value_mods'
    db = xapian.chert_open(dbpath, xapian.DB_CREATE_OR_OVERWRITE)
    random.seed(42)
    doccount = 1000
    vals = {}

    # Add a value to all the documents
    for num in range(1, doccount):
        doc = xapian.Document()
        val = 'val%d' % num
        doc.add_value(1, val)
        db.add_document(doc)
        vals[num] = val
    db.commit()
    check_vals(db, vals)

    # Modify one of the values (this is a regression test which failed with
    # the initial implementation of streaming values).
    doc = xapian.Document()
    val = 'newval0'
    doc.add_value(1, val)
    db.replace_document(2, doc)
    vals[2] = val
    db.commit()
    check_vals(db, vals)

    # Do some random modifications.
    for count in range(1, doccount * 2):
        docid = random.randint(1, doccount)
        doc = xapian.Document()
        if count % 5 == 0:
            val = ''
        else:
            val = 'newval%d' % count
        doc.add_value(1, val)
        db.replace_document(docid, doc)
        vals[docid] = val

    # Check the values before and after modification.
    check_vals(db, vals)
    db.commit()
    check_vals(db, vals)

    # Delete all the values which are non-empty, in a random order.
    keys = [key for key, val in vals.items() if val != '']
    random.shuffle(keys)
    for key in keys:
        doc = xapian.Document()
        db.replace_document(key, doc)
        vals[key] = ''
    check_vals(db, vals)
    db.commit()
    check_vals(db, vals)

    db.close()
    expect_exception(xapian.DatabaseError, "Database has been closed",
                     check_vals, db, vals)
    shutil.rmtree(dbpath)
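# The test depends on a check_vals() helper that is not shown here; a minimal
# stand-in consistent with how it is called might look like this (an
# assumption, not the original -- byte/str normalisation may be needed
# depending on the Python version):
def check_vals(db, vals):
    """Check that value slot 1 of each docid matches the `vals` dict."""
    for docid, val in vals.items():
        got = db.get_document(docid).get_value(1)
        if got != val:
            raise TestFail("Docid %d: got value %r, expected %r"
                           % (docid, got, val))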
def test_postingsource():
    """Simple test of the PostingSource class."""
    class OddPostingSource(xapian.PostingSource):
        def __init__(self, max):
            xapian.PostingSource.__init__(self)
            self.max = max

        def init(self, db):
            self.current = -1

        def get_termfreq_min(self):
            return 0

        def get_termfreq_est(self):
            return int(self.max / 2)

        def get_termfreq_max(self):
            return self.max

        def next(self, minweight):
            self.current += 2

        def at_end(self):
            return self.current > self.max

        def get_docid(self):
            return self.current

    dbpath = 'db_test_postingsource'
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OVERWRITE)
    for id in range(10):
        doc = xapian.Document()
        db.add_document(doc)

    # Do a dance to check that the posting source doesn't get dereferenced
    # too soon in various cases.
    def mkenq(db):
        # First - check that it's kept when the source goes out of scope.
        def mkquery():
            source = OddPostingSource(10)
            return xapian.Query(xapian.Query.OP_OR, [xapian.Query(source)])

        # Check that it's kept when the query goes out of scope.
        def submkenq():
            query = mkquery()
            enquire = xapian.Enquire(db)
            enquire.set_query(query)
            return enquire

        # Check it's kept when the query is retrieved from enquire and put
        # into a new enquire.
        def submkenq2():
            enq1 = submkenq()
            enquire = xapian.Enquire(db)
            enquire.set_query(enq1.get_query())
            return enquire

        return submkenq2()

    enquire = mkenq(db)
    mset = enquire.get_mset(0, 10)
    expect([item.docid for item in mset], [1, 3, 5, 7, 9])

    db.close()
    shutil.rmtree(dbpath)
def _build_index(self, filepath, recreate=False):
    """Save text to a Xapian index.

    Input:
        - filepath: text file path; supports .pdf, .gz, .bz2, and .txt
          files, or a directory which contains those files
        - recreate: bool, True forces recreating the db, default is False
    """
    cached_index = filepath + ".index"
    if os.path.exists(cached_index):
        if recreate:
            shutil.rmtree(cached_index)
    else:
        recreate = True

    stemmer = xapian.Stem("english")
    if not recreate:
        database = xapian.Database(cached_index)
    else:
        database = xapian.WritableDatabase(cached_index,
                                           xapian.DB_CREATE_OR_OPEN)

    # Only (re)build the index when we opened the database writable; the
    # original ran this loop unconditionally, which would fail against the
    # read-only Database opened above.
    if recreate:
        indexer = xapian.TermGenerator()
        indexer.set_stemmer(stemmer)

        if os.path.isdir(filepath):
            filepaths = glob.glob(os.path.join(filepath, "*.*"))
        else:
            filepaths = [filepath]

        for filepath in filepaths:
            ext = os.path.splitext(filepath)[-1]
            open_func = open
            if ext == ".pdf":
                filepath2 = filepath + ".txt"
                if not os.path.exists(filepath2):
                    subprocess.Popen(('pdftotext', filepath, filepath2)).wait()
                filepath = filepath2
            elif ext == ".bz2":
                import bz2
                open_func = bz2.open
            elif ext == ".gz":
                import gzip
                open_func = gzip.open
            elif ext != ".txt":
                # skip anything we don't know how to read (the original
                # chain also skipped plain .txt files, contradicting the
                # docstring, so .txt is let through here)
                continue

            with open_func(filepath, mode="rt", encoding="utf-8") as f:
                for l in tqdm(f, desc="Building index for " + filepath,
                              unit=" lines"):
                    l = l.strip()
                    if len(l) < 1:
                        continue
                    sent_combined = []
                    sent_len = 0
                    for sent in nltk.sent_tokenize(l):
                        sent = sent.strip()
                        tokens = wordpunct_tokenize(sent)
                        # flush the buffer once it would exceed half the
                        # maximum sequence length
                        if sent_len > 0 and \
                                sent_len + len(tokens) > self.max_seq_len / 2:
                            combined = "\t".join(sent_combined)
                            doc = xapian.Document()
                            doc.set_data(combined)
                            indexer.set_document(doc)
                            indexer.index_text(combined)
                            database.add_document(doc)
                            sent_combined = []
                            sent_len = 0
                        sent_len += len(tokens)
                        sent_combined.append(sent)
                    # index whatever is left in the buffer
                    if sent_len > 0:
                        combined = "\t".join(sent_combined)
                        doc = xapian.Document()
                        doc.set_data(combined)
                        indexer.set_document(doc)
                        indexer.index_text(combined)
                        database.add_document(doc)

    self.parser = xapian.QueryParser()
    self.parser.set_stemmer(stemmer)
    self.parser.set_database(database)
    self.parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    self.enquire = xapian.Enquire(database)
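# Hypothetical companion search method for the index built above, using the
# parser and enquire objects the method leaves behind (a sketch, not part of
# the original class):
def _search(self, text, count=5):
    query = self.parser.parse_query(text)
    self.enquire.set_query(query)
    return [m.document.get_data() for m in self.enquire.get_mset(0, count)]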
def test_replication_concurrency():
    """Test concurrent replication and modification"""
    builddir = os.environ['abs_builddir']
    dbsdir = os.path.join(builddir, 'dbs_replication')
    if not os.path.isdir(dbsdir):
        os.makedirs(dbsdir)
    masterpath = os.path.join(dbsdir, 'master')
    firstpath = os.path.join(dbsdir, 'first')
    secondpath = os.path.join(dbsdir, 'second')
    slavepath = os.path.join(dbsdir, 'slave')
    if os.path.isdir(masterpath):
        shutil.rmtree(masterpath)
    if os.path.isdir(slavepath):
        shutil.rmtree(slavepath)
    port = 7876

    expect_exception(
        xapian.DatabaseOpeningError,
        "Couldn't stat '" + dbsdir + "/slave' (No such file or directory)",
        xapian.Database, slavepath)

    clientp = None
    serverp = subprocess.Popen((
        '../../xapian-core/bin/xapian-replicate-server',
        dbsdir,
        '--port=7876',
    ))

    doccount1 = 10000
    doccount2 = 1000

    starttime = time.time()
    if not os.path.isdir(firstpath):
        firstdb = xapian.WritableDatabase(firstpath,
                                          xapian.DB_CREATE_OR_OVERWRITE)
        # Make an initial, large database
        print
        print "Building initial database ..."
        for num in xrange(1, doccount1):
            doc = xapian.Document()
            val = 'val%d' % num
            doc.add_value(1, val)
            firstdb.add_document(doc)
            if num % 100000 == 0:
                print "%d documents..." % num
        firstdb.set_metadata('dbname', '1')
        firstdb.commit()
        print "built"

    # The secondary database gets modified during the test, so needs to be
    # cleared now (ignore_errors so a missing directory doesn't abort the
    # first run).
    shutil.rmtree(secondpath, True)
    if not os.path.isdir(secondpath):
        seconddb = xapian.WritableDatabase(secondpath,
                                           xapian.DB_CREATE_OR_OVERWRITE)
        # Make second, small database
        print
        print "Building secondary database ..."
        for num in xrange(1, doccount2):
            doc = xapian.Document()
            val = 'val%d' % num
            doc.add_value(1, val)
            seconddb.add_document(doc)
            if num % 100000 == 0:
                print "%d documents..." % num
        seconddb.set_metadata('dbname', '2')
        seconddb.commit()
        print "built"

    if time.time() - starttime < 1:
        time.sleep(1)  # Give server time to start

    try:
        set_master(masterpath, firstpath)
        clientp = subprocess.Popen((
            '../../xapian-core/bin/xapian-replicate',
            '--host=127.0.0.1',
            '--master=master',
            os.path.join(dbsdir, 'slave'),
            '--interval=0',
            '--port=7876',
            '-r 0',
        ))
        time.sleep(1)  # Give client time to start
        expect(xapian.Database(slavepath).get_metadata('dbname'), '1')

        for count in xrange(10):
            # Test that swapping between databases doesn't confuse
            # replication.
            for count2 in xrange(2):
                set_master(masterpath, secondpath)
                time.sleep(0.1)
                set_master(masterpath, firstpath)
                time.sleep(0.1)

            # Test making changes to the database.
            set_master(masterpath, secondpath)
            masterdb = xapian.WritableDatabase(masterpath, xapian.DB_OPEN)
            print "making 100 changes"
            for num in xrange(100):
                masterdb.set_metadata('num%d' % num, str(num + count))
                masterdb.commit()
            print "changes done"
            masterdb.close()

            # Allow time for the replication client to catch up with the
            # changes.
            time.sleep(2)
            expect(xapian.Database(slavepath).get_metadata('dbname'), '2')
            expect(xapian.Database(slavepath).get_metadata('num99'),
                   str(99 + count))
    finally:
        if clientp is not None:
            os.kill(clientp.pid, 9)
            clientp.wait()
        os.kill(serverp.pid, 9)
        serverp.wait()
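# The test calls a set_master() helper that is not shown. One plausible
# implementation (an assumption -- the original may well differ) publishes a
# copy of the chosen database under the 'master' name served by the
# replication server:
def set_master(masterpath, sourcepath):
    if os.path.isdir(masterpath):
        shutil.rmtree(masterpath)
    shutil.copytree(sourcepath, masterpath)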
if ".git" in dirpath or "__xdb__" in dirpath: continue for filename in filenames: cursor = os.path.join(dirpath, filename) # skip non-plain files with open(cursor, 'rb') as cursor_file: cursor_content = cursor_file.read() if is_binary_string(cursor_content): spinner.print("D: skip non-plain file {}".format(cursor)) continue #print("I: {:04d} : processing {}".format(counter_indexed, cursor)) spinner.spin() try: doc = xapian.Document() with open(cursor, 'r') as cursor_file: cursor_document = cursor_file.read() doc.set_data(cursor_document) indexer.set_document(doc) indexer.index_text(cursor_content) xdb.add_document(doc) counter_indexed = counter_indexed + 1 pathlist_indexed.append(cursor) except: spinner.print("W: skip problematic file {}".format(cursor)) pass
def update(self, index, iterable):
    """
    Updates the `index` with any objects in `iterable` by adding/updating
    the database as needed.

    Required arguments:
        `index` -- The `SearchIndex` to process
        `iterable` -- An iterable of model instances to index

    For each object in `iterable`, a document is created containing all
    of the terms extracted from `index.full_prepare(obj)` with field
    prefixes, and 'as-is' as needed. Also, if the field type is 'text' it
    will be stemmed and stored with the 'Z' prefix as well.

    eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtest`

    Each document also contains an extra term in the format:

        `XCONTENTTYPE<app_name>.<model_name>`

    as well as a unique identifier in the format:

        `Q<app_name>.<model_name>.<pk>`

    eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`

    This is useful for querying for a specific document corresponding to
    a model instance.

    The document also contains a pickled version of the object itself and
    the document ID in the document data field.

    Finally, we also store field values to be used for sorting data. We
    store these in the document value slots (position zero is reserved
    for the document ID). All values are stored as unicode strings, with
    conversion of float, int, and double values being done by Xapian
    itself through the use of the :method:`xapian.sortable_serialise`
    method.
    """
    database = self._database(writable=True)
    try:
        for obj in iterable:
            document = xapian.Document()

            term_generator = xapian.TermGenerator()
            term_generator.set_database(database)
            term_generator.set_stemmer(xapian.Stem(self.language))
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
            term_generator.set_document(document)

            document_id = DOCUMENT_ID_TERM_PREFIX + get_identifier(obj)
            data = index.full_prepare(obj)
            weights = index.get_field_weights()
            for field in self.schema:
                if field['field_name'] in data.keys():
                    prefix = DOCUMENT_CUSTOM_TERM_PREFIX + \
                        field['field_name'].upper()
                    value = data[field['field_name']]
                    try:
                        weight = int(weights[field['field_name']])
                    except KeyError:
                        weight = 1
                    if field['type'] == 'text':
                        if field['multi_valued'] == 'false':
                            term = _marshal_term(value)
                            term_generator.index_text(term, weight)
                            term_generator.index_text(term, weight, prefix)
                            if len(term.split()) == 1:
                                document.add_term(term, weight)
                                document.add_term(prefix + term, weight)
                            document.add_value(field['column'],
                                               _marshal_value(value))
                        else:
                            for term in value:
                                term = _marshal_term(term)
                                term_generator.index_text(term, weight)
                                term_generator.index_text(term, weight, prefix)
                                if len(term.split()) == 1:
                                    document.add_term(term, weight)
                                    document.add_term(prefix + term, weight)
                    else:
                        if field['multi_valued'] == 'false':
                            term = _marshal_term(value)
                            if len(term.split()) == 1:
                                document.add_term(term, weight)
                                document.add_term(prefix + term, weight)
                            document.add_value(field['column'],
                                               _marshal_value(value))
                        else:
                            for term in value:
                                term = _marshal_term(term)
                                if len(term.split()) == 1:
                                    document.add_term(term, weight)
                                    document.add_term(prefix + term, weight)

            document.set_data(pickle.dumps(
                (obj._meta.app_label, obj._meta.module_name, obj.pk, data),
                pickle.HIGHEST_PROTOCOL
            ))
            document.add_term(document_id)
            document.add_term(
                DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
                (obj._meta.app_label, obj._meta.module_name)
            )

            database.replace_document(document_id, document)
    except UnicodeDecodeError:
        sys.stderr.write('Chunk failed.\n')
    finally:
        database = None
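# Read-side sketch implied by the pickled document data above (an assumption
# mirroring update(), not an excerpt from the backend): search results can be
# unpickled straight back out of the document data field.
import pickle
import xapian

def iter_results(database, query, limit=10):
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    for match in enquire.get_mset(0, limit):
        # inverse of document.set_data(pickle.dumps((...))) in update()
        app_label, module_name, pk, data = \
            pickle.loads(match.document.get_data())
        yield app_label, module_name, pk, data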
def index(self, fieldname, value, search_default=False, store_facet=True,
          spelling=False, weight=1, isdocid=False):
    """Index a field value.

    `fieldname` is the field to index.

    `value` is the value to index. This can be a string, int, float, or
    datetime object; Flax will attempt to index each appropriately.
    Strings should either be unicode objects or UTF-8 encoded.

    `search_default` specifies whether to also index the value as
    prefix-less default-search text.

    `store_facet` specifies whether to store facet values (filter fields
    only) and is True by default.

    `spelling` specifies whether to add spellings to the database.

    `weight` allows the WDF to be set (1 by default).

    `isdocid` uses this field value as a docid (filter fields only).
    False by default.
    """
    if not value:
        return
    if isdocid and self._docid:
        raise IndexingError, 'docid has already been set'
    if isinstance(value, unicode):
        value = value.encode('utf-8', 'ignore')

    prefix, valnum, isfilter = self._fieldmap[fieldname]
    if not isfilter or search_default or spelling:
        termgen = xapian.TermGenerator()
        if self._stemmer:
            termgen.set_stemmer(self._stemmer)

    if isfilter:
        if isinstance(value, basestring):
            term = u'%s%s%s' % (prefix, ':' if value[0].isupper() else '',
                                value.decode('utf-8', 'ignore'))
            term = term.encode('utf-8', 'ignore')
            self._doc.add_term(term)
            if store_facet:
                if _multivalues:
                    self._facets.setdefault(
                        valnum, xapian.StringListSerialiser()).append(value)
                else:
                    if self._facets.get(valnum):
                        raise IndexingError, \
                            'facet value already set for "%s" field' % fieldname
                    self._facets[valnum] = value
            if isdocid:
                self._docid = term
        elif isinstance(value, float) or isinstance(value, int):
            self._doc.add_value(valnum, xapian.sortable_serialise(value))
            # FIXME - helper terms?
            # FIXME - numeric facets
            if isdocid:
                self._docid = '%s%s' % (prefix, value)
        elif isinstance(value, datetime):
            self._doc.add_term('%s%04d' % (prefix, value.year))
            self._doc.add_term('%s%04d%02d' % (prefix, value.year,
                                               value.month))
            self._doc.add_term('%s%04d%02d%02d' % (prefix, value.year,
                                                   value.month, value.day))
            # self._doc.add_value(valnum, '%04d%02d%02d%02d%02d%02d' % (
            #     value.year, value.month, value.day,
            #     value.hour, value.minute, value.second))
            self._doc.add_value(valnum, xapian.sortable_serialise(
                time.mktime(value.timetuple())))
            if isdocid:
                raise IndexingError, 'cannot use date as docid'
    else:
        if isinstance(value, str):
            termgen.set_document(self._doc)
            termgen.index_text(value, weight, prefix)
        else:
            raise IndexingError, 'non-filter field requires string value'
        if isdocid:
            raise IndexingError, 'cannot use non-filter field as docid'

    # spelling only works for prefix-less terms
    if search_default or spelling:
        if search_default:
            termgen.set_document(self._doc)
        else:
            termgen.set_document(xapian.Document())  # dummy document
        if spelling:
            if self.database is None:
                raise IndexingError, \
                    'spelling requires document.database to be set'
            termgen.set_database(self.database)
            termgen.set_flags(termgen.FLAG_SPELLING)
        termgen.index_text(value)
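# Hypothetical calls against the indexer above, one per branch of index()
# (assumes `doc` is an instance of the containing document class; field
# names and values are purely illustrative):
from datetime import datetime

doc.index('title', u'An introduction to Xapian', search_default=True)
doc.index('category', u'Tutorial')              # filter field -> boolean term
doc.index('price', 12.5)                        # numeric -> sortable value
doc.index('published', datetime(2009, 6, 1))    # date -> Y / YM / YMD terms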
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xapian

stem = xapian.Stem('english')
db = xapian.inmemory_open()

doc = xapian.Document()
doc.add_posting(stem("is"), 1)
doc.add_posting(stem("there"), 2)
doc.add_posting(stem("anybody"), 3)
doc.add_posting(stem("out"), 4)
doc.add_posting(stem("there"), 5)
db.add_document(doc)

doc1 = xapian.Document()
doc1.add_posting(stem("is"), 1)
doc1.add_posting(stem("there"), 2)
doc1.add_posting(stem("anybody"), 3)
doc1.add_posting(stem("out"), 4)
doc1.add_posting(stem("there"), 5)
db.add_document(doc1)

db.commit()

for term in db.allterms():
    print term.term, term.termfreq

"""
anybodi 2
is 2
out 2
there 2
"""
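# Because add_posting() stored positions, positional queries work against
# this database; a small follow-on check (a sketch using the same objects):
enquire = xapian.Enquire(db)
enquire.set_query(xapian.Query(xapian.Query.OP_PHRASE,
                               [stem("out"), stem("there")]))
print enquire.get_mset(0, 10).size()  # 2: the phrase occurs in both documents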
def test_all():
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of
    # -1 objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        return xapian.cvar

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError, "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem(b"english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    doc.set_data(b"a\0b")
    if doc.get_data() == b"a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), b"a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data(b"is there anybody out there?")
    doc.add_term(b"XYzzy")
    doc.add_posting(stem(b"is"), 1)
    doc.add_posting(stem(b"there"), 2)
    doc.add_posting(stem(b"anybody"), 3)
    doc.add_posting(stem(b"out"), 4)
    doc.add_posting(stem(b"there"), 5)

    db = xapian.inmemory_open()
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR,
                              [t.encode('utf-8') for t in terms]),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE,
                          (b"smoke", b"test", b"tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query(b"smoke"), query1, b"string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(query2,
                 "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR,
                              [s.encode('utf-8') for s in subqs]),
                 "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected:
    expect_exception(xapian.DatabaseOpeningError, None,
                     xapian.open_stub, b"nosuchdir/nosuchdb")
    expect_exception(xapian.DatabaseOpeningError, None,
                     xapian.open_stub, b"nosuchdir/nosuchdb", xapian.DB_OPEN)
    expect_exception(xapian.DatabaseOpeningError, None,
                     xapian.brass_open, b"nosuchdir/nosuchdb")
    expect_exception(xapian.DatabaseCreateError, None,
                     xapian.brass_open, b"nosuchdir/nosuchdb",
                     xapian.DB_CREATE)
    expect_exception(xapian.DatabaseOpeningError, None,
                     xapian.chert_open, b"nosuchdir/nosuchdb")
    expect_exception(xapian.DatabaseCreateError, None,
                     xapian.chert_open, b"nosuchdir/nosuchdb",
                     xapian.DB_CREATE)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, b"127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, b"127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:
    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = b" ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, b"is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist(b"there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(b""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, b"there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to(b'n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < b'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" %
                           x.term.decode('utf-8'))

    # Feature test for Document.values
    count = 0
    for term in list(doc.values()):
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data(b"Two")
    doc.add_posting(stem(b"out"), 1)
    doc.add_posting(stem(b"outside"), 1)
    doc.add_posting(stem(b"source"), 2)
    doc.add_value(0, b"yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == b"yes"

    query = xapian.Query(stem(b"out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith(b'a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            1.0, testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith(b'a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            1.0, None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True,
           "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>",
                     qp.parse_query, b"test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(qp.parse_query(b"NOT test",
                                qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(<alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem(b'en'))
    expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((out@2 SYNONYM outsid@2) OR Zo@2))")
    expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND Zoutsid@2)")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')),
                 '(foo OR bar\\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')),
                 '(foo OR bar\u00a3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'),
                 '(foo OR bar)')
    expect_query(qp.parse_query(b"NOT t\xe9st",
                                qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(<alldocuments> AND_NOT Zt\u00e9st@1)")

    doc = xapian.Document()
    doc.set_data(b"Unicode with an acc\xe9nt")
    doc.add_posting(stem(b"out\xe9r"), 1)
    expect(doc.get_data(), b"Unicode with an acc\xe9nt")
    term = next(doc.termlist()).term
    expect(term, b"out\xe9r")

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    stop.add(b'a')
    expect(stop(b'a'), True)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == b"b"

        def get_description(self):
            return "my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), "my_b_stopper")
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    expect(stop(b'b'), True)
    expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(b'foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()],
           [(b'bar', 1, [2]), (b'baz', 1, [3]), (b'foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query(b'12/03/99..12/04/01')
    expect(str(query), 'Query(0 * VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, b'$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b.encode('utf-8'))
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Regression tests copied from PHP (probably always worked in python,
    # but let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)
    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query(b"I like tea")
    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata(b'Foo'), b'')
    db.set_metadata(b'Foo', b'Foo')
    expect(db.get_metadata(b'Foo'), b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, b'')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.set_metadata, b'', b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, b'')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(xapian.Query(xapian.Query.OP_SCALE_WEIGHT,
                              xapian.Query(b'foo'), 5),
                 "5 * foo")
def update(self, documents=None, after_index=None, per_page=10000,
           commit_each=False):
    """
    Update the database with the documents.

    There are some default values and terms in a document:

    * Values:
        1. Used to store the ID of the document
        2. Store the model of the object (in string format, like
           "project.app.model")
        3. Store the indexer descriptor (module path)
        4..10. Free

    * Terms:
        UID: Used to store the ID of the document, so we can replace
        the document by its ID.
    """
    # Open Xapian Database
    database = self._db.open(write=True)

    # If we weren't given any documents, index the whole queryset.
    if documents is None:
        update_queue = self._model.objects.all()
    else:
        update_queue = documents

    commiter = Commiter.create(commit_each)(
        lambda: database.begin_transaction(flush=True),
        database.commit_transaction,
        database.cancel_transaction
    )

    # Get each document received
    for page in paginate(update_queue, per_page):
        try:
            commiter.begin_page()
            for obj in page.object_list:
                commiter.begin_object()
                try:
                    if not self.trigger(obj):
                        self.delete(obj.pk, database)
                        continue

                    doc = xapian.Document()
                    # Add default terms and values
                    uid = self._create_uid(obj)
                    doc.add_term(self._create_uid(obj))
                    self._insert_meta_values(doc, obj)

                    generator = xapian.TermGenerator()
                    generator.set_database(database)
                    generator.set_document(doc)
                    generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)

                    stemming_lang = self._get_stem_language(obj)
                    if stemming_lang:
                        stemmer = self.get_stemmer(stemming_lang)
                        generator.set_stemmer(stemmer)
                        stopper = self.get_stopper(stemming_lang)
                        if stopper:
                            generator.set_stopper(stopper)

                    # Get a weight for the object
                    obj_weight = self._get_object_weight(obj)
                    # Index fields
                    self._do_index_fields(doc, generator, obj, obj_weight)

                    database.replace_document(uid, doc)
                    if after_index:
                        after_index(obj)
                    commiter.commit_object()
                except Exception:
                    commiter.cancel_object()
                    raise
            commiter.commit_page()
        except Exception:
            commiter.cancel_page()
            raise
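# Hypothetical driver for update(): reindex only selected objects and log
# progress through the after_index hook (the indexer instance and model
# names are illustrative, not from the original module):
import logging

def log_indexed(obj):
    logging.info("indexed %s pk=%s", type(obj).__name__, obj.pk)

indexer.update(documents=MyModel.objects.filter(published=True),
               after_index=log_indexed,
               commit_each=False)   # one transaction per page, not per object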