def __init__(self, index_storage, backend, wiki_name=None, acl_rights_contents=[], **kw):
    """
    Store params, create schemas.
    """
    self.index_storage = index_storage
    self.backend = backend
    self.wikiname = wiki_name
    self.ix = {}  # open indexes
    self.schemas = {}  # existing schemas

    common_fields = {
        # wikiname so we can have a shared index in a wiki farm, always check this!
        WIKINAME: ID(stored=True),
        # namespace, so we can have different namespaces within a wiki, always check this!
        NAMESPACE: ID(stored=True),
        # tokenized NAME from metadata - use this for manual searching from UI
        NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
        # unmodified NAME from metadata - use this for precise lookup by the code.
        # also needed for wildcard search, so the original string as well as the query
        # (with the wildcard) is not cut into pieces.
        NAME_EXACT: ID(field_boost=3.0),
        # revision id (aka meta id)
        REVID: ID(unique=True, stored=True),
        # parent revision id
        PARENTID: ID(stored=True),
        # backend name (which backend is this rev stored in?)
        BACKENDNAME: ID(stored=True),
        # MTIME from revision metadata (converted to UTC datetime)
        MTIME: DATETIME(stored=True),
        # publish time from metadata (converted to UTC datetime)
        PTIME: DATETIME(stored=True),
        # ITEMTYPE from metadata, always matched exactly hence ID
        ITEMTYPE: ID(stored=True),
        # tokenized CONTENTTYPE from metadata
        CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
        # unmodified list of TAGS from metadata
        TAGS: ID(stored=True),
        LANGUAGE: ID(stored=True),
        # USERID from metadata
        USERID: ID(stored=True),
        # ADDRESS from metadata
        ADDRESS: ID(stored=True),
        # HOSTNAME from metadata
        HOSTNAME: ID(stored=True),
        # SIZE from metadata
        SIZE: NUMERIC(stored=True),
        # ACTION from metadata
        ACTION: ID(stored=True),
        # tokenized COMMENT from metadata
        COMMENT: TEXT(stored=True),
        # SUMMARY from metadata
        SUMMARY: TEXT(stored=True),
        # DATAID from metadata
        DATAID: ID(stored=True),
        # TRASH from metadata
        TRASH: BOOLEAN(stored=True),
        # data (content), converted to text/plain and tokenized
        CONTENT: TEXT(stored=True, spelling=True),
    }

    latest_revs_fields = {
        # ITEMID from metadata - as there is only the latest rev of an item here, it is unique
        ITEMID: ID(unique=True, stored=True),
        # unmodified list of ITEMLINKS from metadata
        ITEMLINKS: ID(stored=True),
        # unmodified list of ITEMTRANSCLUSIONS from metadata
        ITEMTRANSCLUSIONS: ID(stored=True),
        # tokenized ACL from metadata
        ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True),
        # ngram words, index ngrams of words from main content
        CONTENTNGRAM: NGRAMWORDS(minsize=3, maxsize=6),
    }
    latest_revs_fields.update(**common_fields)

    userprofile_fields = {
        # Note: email / openid (if given) should be unique, but we might
        # have lots of empty values if it is not given and thus it is NOT
        # unique overall! Wrongly declaring it unique would lead to whoosh
        # killing other users from the index when update_document() is called!
        EMAIL: ID(stored=True),
        OPENID: ID(stored=True),
        DISABLED: BOOLEAN(stored=True),
        LOCALE: ID(stored=True),
        SUBSCRIPTION_IDS: ID(),
        SUBSCRIPTION_PATTERNS: ID(),
    }
    latest_revs_fields.update(**userprofile_fields)

    # XXX This is a highly ad hoc way to support indexing of ticket items.
    ticket_fields = {
        EFFORT: NUMERIC(stored=True),
        DIFFICULTY: NUMERIC(stored=True),
        SEVERITY: NUMERIC(stored=True),
        PRIORITY: NUMERIC(stored=True),
        ASSIGNED_TO: ID(stored=True),
        SUPERSEDED_BY: ID(stored=True),
        DEPENDS_ON: ID(stored=True),
        CLOSED: BOOLEAN(stored=True),
    }
    latest_revs_fields.update(**ticket_fields)

    blog_entry_fields = {
    }
    latest_revs_fields.update(**blog_entry_fields)

    all_revs_fields = {
        ITEMID: ID(stored=True),
    }
    all_revs_fields.update(**common_fields)

    latest_revisions_schema = Schema(**latest_revs_fields)
    all_revisions_schema = Schema(**all_revs_fields)

    # define dynamic fields: metadata keys matching these globs get typed whoosh fields
    dynamic_fields = [("*_id", ID(stored=True)),
                      ("*_text", TEXT(stored=True)),
                      ("*_keyword", KEYWORD(stored=True)),
                      ("*_numeric", NUMERIC(stored=True)),
                      ("*_datetime", DATETIME(stored=True)),
                      ("*_boolean", BOOLEAN(stored=True)),
                      ]

    # add the dynamic fields to both schemas
    for glob, field_type in dynamic_fields:
        latest_revisions_schema.add(glob, field_type, glob=True)
        all_revisions_schema.add(glob, field_type, glob=True)

    # schemas are needed by the query parser and for index creation
    self.schemas[ALL_REVS] = all_revisions_schema
    self.schemas[LATEST_REVS] = latest_revisions_schema

    # what fields whoosh result documents can have (no matter whether they come
    # from the all-revs index or the latest-revs index):
    self.common_fields = set(latest_revs_fields.keys()) & set(all_revs_fields.keys())
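# A minimal sketch of how these schemas drive index creation (illustrative only,
# not part of this module; `index_dir` is a hypothetical location and `mw` an
# already-constructed IndexingMiddleware). whoosh's FileStorage can hold both
# indexes in one directory, keyed by indexname:
#
#   import os
#   from whoosh.filedb.filestore import FileStorage
#
#   index_dir = "/path/to/index"
#   os.makedirs(index_dir, exist_ok=True)
#   storage = FileStorage(index_dir)
#   for name, schema in mw.schemas.items():  # ALL_REVS and LATEST_REVS
#       mw.ix[name] = storage.create_index(schema, indexname=name)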
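# A minimal sketch of searching against these schemas (illustrative only;
# `mw` is assumed to be an IndexingMiddleware with opened indexes). The dynamic
# *_id/*_text/*_keyword/... globs let documents carry arbitrary typed metadata
# fields, and the query parser resolves any such field name through the same
# schema it uses for the static fields:
#
#   from whoosh.qparser import QueryParser
#
#   with mw.ix[LATEST_REVS].searcher() as searcher:
#       query = QueryParser(CONTENT, mw.schemas[LATEST_REVS]).parse("some words")
#       for hit in searcher.search(query, limit=20):
#           print(hit[NAME], hit[REVID])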