class NoteSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer, spelling=True)
    contents = TEXT(spelling=True)
    public = BOOLEAN()
    draft = BOOLEAN()
    short_code = ID(stored=True, unique=True)
    disable = BOOLEAN()
    reported = BOOLEAN()
    tags = TEXT(analyzer=tag_analyzer, spelling=True)
    user = ID()
class RecipeSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer)
    description = TEXT(analyzer=custom_analyzer)
    public = BOOLEAN()
    deleted = BOOLEAN()
    reported = BOOLEAN()
    short_code = ID(stored=True, unique=True)
    tags = TEXT(analyzer=tag_analyzer)
    user = ID()
    steps = TEXT(analyzer=custom_analyzer)
    ingredients = TEXT(analyzer=custom_analyzer)
    country = TEXT()
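# Usage sketch for the declarative schemas above (a hypothetical example,
# not part of the original code): Whoosh accepts a SchemaClass subclass
# anywhere a Schema instance is expected. The directory name and document
# values below are assumptions for illustration, and custom_analyzer /
# tag_analyzer must already be defined for NoteSchema to be importable.
import os
from whoosh import index

if not os.path.exists("notes_index"):
    os.mkdir("notes_index")
ix = index.create_in("notes_index", NoteSchema)

with ix.writer() as writer:  # the context manager commits on exit
    writer.add_document(
        id=u"1",                      # ID fields take unicode strings
        title=u"Grocery run",
        contents=u"eggs, milk, bread",
        public=False,
        draft=True,
        short_code=u"abc123",
        tags=u"shopping food",
        user=u"42",
    )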
def fields_map(self, field_type):
    if field_type == "primary":
        return ID(stored=True, unique=True)
    type_map = {
        'date': types.Date,
        'datetime': types.DateTime,
        'boolean': types.Boolean,
        'integer': types.Integer,
        'float': types.Float
    }
    if isinstance(field_type, str):
        field_type = type_map.get(field_type, types.Text)
    if not isinstance(field_type, type):
        field_type = field_type.__class__

    if issubclass(field_type, (types.DateTime, types.Date)):
        return DATETIME(stored=True, sortable=True)
    elif issubclass(field_type, types.Integer):
        return NUMERIC(stored=True, numtype=int)
    elif issubclass(field_type, types.Float):
        return NUMERIC(stored=True, numtype=float)
    elif issubclass(field_type, types.Boolean):
        return BOOLEAN(stored=True)
    return TEXT(stored=True, analyzer=self.analyzer, sortable=False)
def mk_index(self, indexpath="indexdir", overwrite=False):
    """Create the indexes used for IR operations."""
    if os.path.exists(indexpath) and overwrite:
        shutil.rmtree(indexpath)
    if not os.path.exists(indexpath):
        os.mkdir(indexpath)

    self.context_schema = Schema(
        contextid=ID(stored=True),
        title=TEXT(stored=True),
        context=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    # Distinct indexnames let the three indexes share one directory
    # without overwriting each other.
    self.context_ix = index.create_in(indexpath, self.context_schema,
                                      indexname="contexts")

    self.question_schema = Schema(
        contextid=ID(stored=True),
        questionid=ID(stored=True),
        is_impossible=BOOLEAN(stored=True),
        question=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    self.question_ix = index.create_in(indexpath, self.question_schema,
                                       indexname="questions")

    self.answer_schema = Schema(
        contextid=ID(stored=True),
        questionid=ID(stored=True),
        answerid=ID(stored=True),
        answer_start=NUMERIC(stored=True),
        answer=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    self.answer_ix = index.create_in(indexpath, self.answer_schema,
                                     indexname="answers")
class LawSchema(SchemaClass):
    PK = ID(unique=True)
    ACTIVE_FLG = BOOLEAN()
    ARTICLE = ID()
    ARTICLE_HEADING = TEXT(stored=True)
    ARTICLE_HISTORY = TEXT(stored=True)
    CHAPTER = ID()
    CHAPTER_HEADING = TEXT(stored=True)
    CODE_HEADING = TEXT(stored=True)
    DIVISION = ID()
    DIVISION_HEADING = TEXT(stored=True)
    EFFECTIVE_DATE = DATETIME(stored=True)
    HISTORY = TEXT(stored=True)
    LAW_CODE = ID()
    LAW_SECTION_VERSION_ID = ID()
    LEGAL_TEXT = TEXT(stored=True)
    LOB_FILE = ID()
    OP_CHAPTER = ID()
    OP_SECTION = ID()
    OP_STATUES = ID()
    PART = ID()
    SECTION_HISTORY = TEXT(stored=True)
    SECTION_NUM = ID(stored=True)
    SECTION_TITLE = TEXT(stored=True)
    TITLE = ID()
    TRANS_UID = ID()
    TRANS_UPDATE = DATETIME()
class UserSchema(SchemaClass):
    pk = ID(stored=True, unique=True)
    fullname = TEXT(stored=True, spelling=True)
    username = TEXT(stored=True, spelling=True)
    organization = BOOLEAN(stored=True)
    content = NGRAM(minsize=1, phrase=True)
def spam_schema():
    analyzer = StemmingAnalyzer(cachesize=-1)
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    content=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    content_length=NUMERIC(stored=True, sortable=True),
                    uid=ID(stored=True),
                    is_spam=BOOLEAN(stored=True))
    return schema
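# Hypothetical usage sketch for spam_schema() (not from the original
# source): the sortable NUMERIC field lets results be ordered by
# content_length. The directory name and sample documents are assumptions.
import os
from whoosh import index
from whoosh.qparser import QueryParser

if not os.path.exists("spam_index"):
    os.mkdir("spam_index")
ix = index.create_in("spam_index", spam_schema())

with ix.writer() as writer:
    writer.add_document(title=u"Win a prize", content=u"Click here now",
                        content_length=15, uid=u"1", is_spam=True)
    writer.add_document(title=u"Meeting notes", content=u"Agenda for Monday",
                        content_length=17, uid=u"2", is_spam=False)

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(u"agenda OR click")
    for hit in searcher.search(query, sortedby="content_length"):
        print(hit["uid"], hit["title"], hit["is_spam"])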
class PhotSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer, spelling=True)
    short_code = ID(stored=True, unique=True)
    disable = BOOLEAN()
    tags = TEXT(analyzer=tag_analyzer, spelling=True)
    user = ID()
def get_schema():
    return Schema(
        id=NUMERIC(stored=True, unique=True),
        title=TEXT(sortable=True),
        content=TEXT(),
        asn=NUMERIC(sortable=True),
        correspondent=TEXT(sortable=True),
        correspondent_id=NUMERIC(),
        has_correspondent=BOOLEAN(),
        tag=KEYWORD(commas=True, scorable=True, lowercase=True),
        tag_id=KEYWORD(commas=True, scorable=True),
        has_tag=BOOLEAN(),
        type=TEXT(sortable=True),
        type_id=NUMERIC(),
        has_type=BOOLEAN(),
        created=DATETIME(sortable=True),
        modified=DATETIME(sortable=True),
        added=DATETIME(sortable=True),
    )
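# Hypothetical query sketch against the schema above (not part of the
# original code; the directory name is an assumption). BOOLEAN fields
# accept true/false in query strings, and the comma-separated KEYWORD
# field matches individual tags.
from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir("documents_index")  # assumes the index already exists
parser = QueryParser("content", ix.schema)
query = parser.parse(u"tag:invoice AND has_correspondent:true")
with ix.searcher() as searcher:
    for hit in searcher.search(query, limit=10):
        print(hit["id"], hit.score)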
def ingest_languages(writer: BufferedWriter):
    print('INFO: start languages ingestion')
    path = join(configuration.DATA, 'languages.csv')
    with open(path, 'r') as stream:
        languages = [line.strip() for line in stream.readlines()]
    for language in languages:
        locale, label = language.split(',')
        print(f'\tingest [{locale}] language')
        # add one BOOLEAN field per locale to the live schema
        writer.add_field(locale, BOOLEAN())
        storage.lpush('locales', locale)
        storage.set(f'locale:{locale}', label)
def get_whoosh_schema():
    return Schema(offerId=ID(stored=True),
                  university=TEXT(stored=True),
                  enterprise=TEXT(stored=True),
                  months=NUMERIC(stored=True),
                  salary=NUMERIC(stored=True),
                  country=TEXT(stored=True),
                  province=TEXT(stored=True),
                  city=TEXT(stored=True),
                  description=TEXT(stored=True),
                  immediate=BOOLEAN(stored=True))
def get_schema(schema):
    kwargs = {}
    for key, value in schema.items():
        if value == "indexed":
            kwargs[key] = NGRAMWORDS(minsize=2, sortable=True)
        elif value == "id_stored":
            kwargs[key] = ID(stored=True)
        elif value == "unique_id_stored":
            kwargs[key] = ID(unique=True, stored=True)
        elif value == "boolean":
            kwargs[key] = BOOLEAN(stored=True)
    return Schema(**kwargs)
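# Hypothetical call sketch for get_schema() above (not from the original
# source): the argument is a plain dict mapping field names to the string
# tags handled in the loop. Field names here are illustrative.
schema = get_schema({
    "pk": "unique_id_stored",
    "owner": "id_stored",
    "name": "indexed",      # becomes NGRAMWORDS for partial-word matching
    "active": "boolean",
})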
def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start', stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
            # Chinese word segmentation
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True, analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. "
            "Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
def __get_index_schema(self):
    """:return: organization index schema"""
    return Schema(id=NUMERIC(stored=True),
                  url=ID(stored=True),
                  external_id=ID(stored=True),
                  name=ID(stored=True),
                  domain_names=KEYWORD(stored=True, commas=True),
                  created_at=ID(stored=True),
                  details=ID(stored=True),
                  shared_tickets=BOOLEAN(stored=True),
                  tags=KEYWORD(stored=True, commas=True))
def __get_index_schema(self):
    """:return: user index schema"""
    return Schema(id=NUMERIC(stored=True),
                  url=ID(stored=True),
                  external_id=ID(stored=True),
                  name=ID(stored=True),
                  alias=ID(stored=True),
                  created_at=ID(stored=True),
                  active=BOOLEAN(stored=True),
                  verified=BOOLEAN(stored=True),
                  shared=BOOLEAN(stored=True),
                  locale=ID(stored=True),
                  timezone=ID(stored=True),
                  last_login_at=ID(stored=True),
                  email=ID(stored=True),
                  phone=ID(stored=True),
                  signature=ID(stored=True),
                  organization_id=NUMERIC(stored=True),
                  tags=KEYWORD(stored=True, commas=True),
                  suspended=BOOLEAN(stored=True),
                  role=ID(stored=True))
def create(self):
    analyzer = StandardAnalyzer(minsize=1, stoplist=None) | CleanUpFilter()
    schema = Schema(
        source=TEXT(stored=True, analyzer=analyzer),
        target=TEXT(stored=True, analyzer=analyzer),
        comment=TEXT(stored=True),
        context=TEXT(stored=True),
        softcatala=BOOLEAN(stored=True),
        project=TEXT(stored=True),
    )
    if not os.path.exists(self.dir_name):
        os.mkdir(self.dir_name)
    ix = create_in(self.dir_name, schema)
    self.writer = ix.writer()
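# CleanUpFilter is defined elsewhere in that project; purely as a
# hypothetical sketch of what such a Whoosh token filter can look like,
# a Filter subclass rewrites the token stream in __call__ and composes
# with analyzers via the | operator, as used above.
from whoosh.analysis import Filter

class CleanUpFilter(Filter):
    def __call__(self, tokens):
        for token in tokens:
            # illustrative assumption: strip stray punctuation
            token.text = token.text.strip(u".,;:")
            if token.text:
                yield token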
def fields(self):
    model = self.table
    schema_fields = {'id': ID(stored=True, unique=True)}
    searchable = set(model.__searchable__)
    analyzer = getattr(model, '__msearch_analyzer__') if hasattr(
        model, '__msearch_analyzer__') else self.analyzer
    primary_keys = [key.name for key in inspect(model).primary_key]

    for field in searchable:
        if '.' in field:
            fields = field.split('.')
            field_attr = getattr(
                getattr(model, fields[0]).property.mapper.class_, fields[1])
        else:
            field_attr = getattr(model, field)

        if hasattr(field_attr, 'descriptor') and isinstance(
                field_attr.descriptor, hybrid_property):
            field_type = Text
            type_hint = getattr(field_attr, 'type_hint', None)
            if type_hint is not None:
                type_hint_map = {
                    'date': Date,
                    'datetime': DateTime,
                    'boolean': Boolean,
                    'integer': Integer,
                    'float': Float
                }
                field_type = type_hint if isclass(
                    type_hint) else type_hint_map.get(type_hint.lower(), Text)
        else:
            field_type = field_attr.property.columns[0].type

        if field in primary_keys:
            schema_fields[field] = ID(stored=True, unique=True)
        elif field_type in (DateTime, Date):
            schema_fields[field] = DATETIME(stored=True, sortable=True)
        elif field_type == Integer:
            schema_fields[field] = NUMERIC(stored=True, numtype=int)
        elif field_type == Float:
            schema_fields[field] = NUMERIC(stored=True, numtype=float)
        elif field_type == Boolean:
            schema_fields[field] = BOOLEAN(stored=True)
        else:
            schema_fields[field] = TEXT(
                stored=True, analyzer=analyzer, sortable=False)

    return schema_fields
def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored)
        elif field_class.field_type == 'integer':
            # NUMERIC's keyword argument is numtype, not type
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float)
        elif field_class.field_type == 'boolean':
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        else:
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True, analyzer=StemmingAnalyzer())

        if field_class.document is True:
            content_field_name = field_class.index_fieldname

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. "
            "Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
class Mydocument(SchemaClass):
    project_name = ID(stored=True)
    project_id = ID(stored=True)
    snapshot_name = ID(stored=True)
    snapshot_id = ID(stored=True)
    branch_name = ID(stored=True)
    branch_id = ID(stored=True)
    item_name = ID(stored=True)
    item_id = ID(stored=True)
    item_type = ID(stored=True)
    item_content = TEXT(stored=True)
    is_toolkit = BOOLEAN(stored=True)
    toolkit_project_name = ID(stored=True)
    toolkit_project_id = ID(stored=True)
    toolkit_snapshot_name = ID(stored=True)
    toolkit_snapshot_id = ID(stored=True)
    toolkit_branch_name = ID(stored=True)
    toolkit_branch_id = ID(stored=True)
    apppath = ID(stored=True)
def __get_index_schema(self):
    """:return: ticket index schema"""
    return Schema(status=ID(stored=True),
                  assignee_id=NUMERIC(stored=True),
                  via=ID(stored=True),
                  description=ID(stored=True),
                  tags=KEYWORD(stored=True, commas=True),
                  url=ID(stored=True),
                  external_id=ID(stored=True),
                  created_at=ID(stored=True),
                  submitter_id=NUMERIC(stored=True),
                  priority=ID(stored=True),
                  due_at=ID(stored=True),
                  organization_id=NUMERIC(stored=True),
                  has_incidents=BOOLEAN(stored=True),
                  id=ID(stored=True),
                  type=ID(stored=True),
                  subject=ID(stored=True))
def get_schema():
    analyzer = StemmingAnalyzer(stoplist=STOP)
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    url=ID(stored=True),
                    content_length=NUMERIC(stored=True, sortable=True),
                    thread_votecount=NUMERIC(stored=True, sortable=True),
                    vote_count=NUMERIC(stored=True, sortable=True),
                    content=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    tags=KEYWORD(stored=True, commas=True),
                    is_toplevel=BOOLEAN(stored=True),
                    lastedit_date=NUMERIC(stored=True, sortable=True),
                    rank=NUMERIC(stored=True, sortable=True),
                    author=TEXT(stored=True),
                    author_score=NUMERIC(stored=True, sortable=True),
                    author_handle=TEXT(stored=True),
                    author_uid=ID(stored=True),
                    author_url=ID(stored=True),
                    uid=ID(stored=True),
                    type=NUMERIC(stored=True, sortable=True),
                    type_display=TEXT(stored=True))
    return schema
def get_schema():
    analyzer = StemmingAnalyzer(stoplist=STOP)
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    url=ID(stored=True),
                    content_length=NUMERIC(stored=True, sortable=True),
                    thread_votecount=NUMERIC(stored=True, sortable=True),
                    vote_count=NUMERIC(stored=True, sortable=True),
                    content=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    tags=KEYWORD(stored=True, commas=True),
                    is_toplevel=BOOLEAN(stored=True),
                    author_is_moderator=BOOLEAN(stored=True),
                    lastedit_user_is_moderator=BOOLEAN(stored=True),
                    lastedit_user_is_suspended=BOOLEAN(stored=True),
                    author_is_suspended=BOOLEAN(stored=True),
                    lastedit_date=DATETIME(stored=True, sortable=True),
                    creation_date=DATETIME(stored=True, sortable=True),
                    rank=NUMERIC(stored=True, sortable=True),
                    author=TEXT(stored=True),
                    lastedit_user=TEXT(stored=True),
                    lastedit_user_email=TEXT(stored=True),
                    lastedit_user_score=NUMERIC(stored=True, sortable=True),
                    lastedit_user_uid=ID(stored=True),
                    lastedit_user_url=ID(stored=True),
                    author_score=NUMERIC(stored=True, sortable=True),
                    author_handle=TEXT(stored=True),
                    author_email=TEXT(stored=True),
                    author_uid=ID(stored=True),
                    author_url=ID(stored=True),
                    root_has_accepted=BOOLEAN(stored=True),
                    reply_count=NUMERIC(stored=True, sortable=True),
                    view_count=NUMERIC(stored=True, sortable=True),
                    answer_count=NUMERIC(stored=True, sortable=True),
                    uid=ID(stored=True),
                    type=NUMERIC(stored=True, sortable=True),
                    type_display=TEXT(stored=True))
    return schema
class SearchIndexer:
    """Full-text search indexer."""

    # schema for searches of all (public + private) info
    SCHEMA = Schema(
        type=ID(stored=True, sortable=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        text_private=TEXT(),
        change=DATETIME(sortable=True, stored=True),
    )

    # schema for searches of public info only
    SCHEMA_PUBLIC = Schema(
        type=ID(stored=True, sortable=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        change=DATETIME(sortable=True, stored=True),
    )

    def __init__(self, index_dir: FilenameOrPath):
        """Initialize given an index dir path."""
        self.index_dir = Path(index_dir)
        self.index_dir.mkdir(exist_ok=True)
        # query parser for all (public + private) content
        self.query_parser_all = MultifieldParser(
            ["text", "text_private"], schema=self.SCHEMA)
        # query parser for public content only
        self.query_parser_public = QueryParser("text",
                                               schema=self.SCHEMA_PUBLIC)

    def index(self, overwrite=False):
        """Return the index; create it if it doesn't exist."""
        index_dir = str(self.index_dir)
        if overwrite or not index.exists_in(index_dir):
            return index.create_in(index_dir, self.SCHEMA)
        return index.open_dir(index_dir)

    def _add_obj_strings(self, writer, obj_dict):
        """Add or update an object in the index."""
        try:
            writer.update_document(
                type=obj_dict["class_name"].lower(),
                handle=obj_dict["handle"],
                private=obj_dict["private"],
                text=obj_dict["string"],
                text_private=obj_dict["string_private"],
                change=obj_dict["change"],
            )
        except Exception:  # don't let one failed object abort the reindex
            current_app.logger.error(
                "Failed adding object {}".format(obj_dict["handle"]))

    def reindex_full(self, db_handle: DbReadBase):
        """Reindex the whole database."""
        with self.index(overwrite=True).writer() as writer:
            for obj_dict in iter_obj_strings(db_handle):
                self._add_obj_strings(writer, obj_dict)

    def _get_object_timestamps(self):
        """Get a dictionary with the timestamps of all objects in the index."""
        d = {}
        with self.index().searcher() as searcher:
            for fields in searcher.all_stored_fields():
                class_name = fields["type"]
                if class_name not in d:
                    d[class_name] = set()
                d[class_name].add((fields["handle"], fields["change"]))
        return d

    def _get_update_info(self, db_handle: DbReadBase):
        """Get a dictionary with info about changed objects in the db."""
        db_timestamps = get_object_timestamps(db_handle)
        ix_timestamps = self._get_object_timestamps()
        deleted = {}
        updated = {}
        new = {}
        for class_name in db_timestamps:
            db_handles = set(handle for handle, _ in db_timestamps[class_name])
            ix_handles = set(
                handle
                for handle, _ in ix_timestamps.get(class_name.lower(), set()))
            # new: not present in index
            new[class_name] = db_handles - ix_handles
            # deleted: not present in db
            deleted[class_name] = ix_handles - db_handles
            # changed: different (new or modified) in db
            changed_timestamps = db_timestamps[class_name] - ix_timestamps.get(
                class_name.lower(), set())
            changed_handles = set(handle for handle, _ in changed_timestamps)
            # updated: changed and present in the index
            updated[class_name] = changed_handles & ix_handles
        return {"deleted": deleted, "updated": updated, "new": new}

    def delete_object(self, writer, handle: str):
        """Delete an object from the index."""
        writer.delete_by_term("handle", handle)

    def add_or_update_object(self, writer, handle: str, db_handle: DbReadBase,
                             class_name: str):
        """Add an object to the index or update it if it exists."""
        obj_dict = obj_strings_from_handle(db_handle, class_name, handle)
        self._add_obj_strings(writer, obj_dict)

    def get_writer(self, overwrite: bool = False, use_async: bool = False):
        """Get a writer instance.

        If `use_async` is true, use an `AsyncWriter`.
        """
        idx = self.index(overwrite=overwrite)
        if use_async:
            return AsyncWriter(idx, delay=0.1)
        return idx.writer()

    def reindex_incremental(self, db_handle: DbReadBase):
        """Update the index incrementally."""
        update_info = self._get_update_info(db_handle)
        with self.index(overwrite=False).writer() as writer:
            # delete objects
            for class_name, handles in update_info["deleted"].items():
                for handle in handles:
                    self.delete_object(writer, handle)
            # add objects
            for class_name, handles in update_info["new"].items():
                for handle in handles:
                    self.add_or_update_object(writer, handle, db_handle,
                                              class_name)
            # update objects
            for class_name, handles in update_info["updated"].items():
                for handle in handles:
                    self.add_or_update_object(writer, handle, db_handle,
                                              class_name)

    @staticmethod
    def format_hit(hit: Hit) -> Dict[str, Any]:
        """Format a search hit."""
        return {
            "handle": hit["handle"],
            "object_type": hit["type"],
            "rank": hit.rank,
            "score": hit.score,
        }

    def _get_sorting(
        self,
        sort: Optional[List[str]] = None,
    ) -> Optional[List[FieldFacet]]:
        """Get the appropriate field facets for sorting."""
        if not sort:
            return None
        facets = []
        allowed_sorters = {"type", "change"}
        for sorter in sort:
            _field = sorter.lstrip("+-")
            if _field not in allowed_sorters:
                continue
            reverse = sorter.startswith("-")
            facets.append(FieldFacet(_field, reverse=reverse))
        return facets

    def search(
        self,
        query: str,
        page: int,
        pagesize: int,
        include_private: bool = True,
        extend: bool = False,
        sort: Optional[List[str]] = None,
    ):
        """Search the index.

        If `include_private` is true, also include private objects and
        search in private fields.
        """
        query_parser = (self.query_parser_all
                        if include_private else self.query_parser_public)
        query_parser.add_plugin(DateParserPlugin())
        # if private objects should not be shown, add a mask
        mask = None if include_private else Term("private", True)
        parsed_query = query_parser.parse(query)
        with self.index().searcher() as searcher:
            sortedby = self._get_sorting(sort)
            results = searcher.search_page(parsed_query, page, pagesize,
                                           mask=mask, sortedby=sortedby)
            return results.total, [self.format_hit(hit) for hit in results]
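# Hypothetical usage sketch for SearchIndexer (not from the original
# source): "search_index" and the query string are assumptions, and db
# must be a DbReadBase instance provided by the surrounding application.
indexer = SearchIndexer("search_index")
indexer.reindex_full(db)
total, hits = indexer.search("Smith", page=1, pagesize=20,
                             include_private=False, sort=["-change"])
for hit in hits:
    print(hit["object_type"], hit["handle"], hit["score"])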
class SearchIndexer:
    """Full-text search indexer."""

    # schema for searches of all (public + private) info
    SCHEMA = Schema(
        type=ID(stored=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        text_private=TEXT(),
        changed=DATETIME(),
    )

    # schema for searches of public info only
    SCHEMA_PUBLIC = Schema(
        type=ID(stored=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        changed=DATETIME(),
    )

    def __init__(self, index_dir: FilenameOrPath):
        """Initialize given an index dir path."""
        self.index_dir = Path(index_dir)
        self.index_dir.mkdir(exist_ok=True)
        # query parser for all (public + private) content
        self.query_parser_all = MultifieldParser(
            ["text", "text_private"], schema=self.SCHEMA)
        # query parser for public content only
        self.query_parser_public = QueryParser("text",
                                               schema=self.SCHEMA_PUBLIC)

    def index(self, overwrite=False):
        """Return the index; create it if it doesn't exist."""
        index_dir = str(self.index_dir)
        if overwrite or not index.exists_in(index_dir):
            return index.create_in(index_dir, self.SCHEMA)
        return index.open_dir(index_dir)

    def reindex_full(self, db_handle: DbReadBase):
        """Reindex the whole database."""
        with self.index(overwrite=True).writer() as writer:
            for obj_dict in iter_obj_strings(db_handle):
                writer.add_document(
                    type=obj_dict["class_name"].lower(),
                    handle=obj_dict["handle"],
                    text=obj_dict["string"],
                    text_private=obj_dict["string_private"],
                    changed=obj_dict["changed"],
                )

    @staticmethod
    def format_hit(hit: Hit) -> Dict[str, Any]:
        """Format a search hit."""
        return {
            "handle": hit["handle"],
            "object_type": hit["type"],
            "rank": hit.rank,
            "score": hit.score,
        }

    def search(
        self,
        query: str,
        page: int,
        pagesize: int,
        include_private: bool = True,
        extend: bool = False,
    ):
        """Search the index.

        If `include_private` is true, also include private objects and
        search in private fields.
        """
        query_parser = (self.query_parser_all
                        if include_private else self.query_parser_public)
        query_parser.add_plugin(DateParserPlugin())
        # if private objects should not be shown, add a mask
        mask = None if include_private else Term("private", True)
        parsed_query = query_parser.parse(query)
        with self.index().searcher() as searcher:
            results = searcher.search_page(parsed_query, page, pagesize,
                                           mask=mask)
            return results.total, [self.format_hit(hit) for hit in results]
    path=TEXT(stored=True),
    content=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    modtime=STORED(),
    extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),
    author=TEXT(stored=True),
    message=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    parents=TEXT(),
    added=TEXT(),
    removed=TEXT(),
    changed=TEXT(),
)

CHGSET_IDX_NAME = 'CHGSET_INDEX'
def __init__(self):
    app_dir = pathlib.Path(env.str('FLASK_APP')).parent.absolute()

    # Get configuration values from the environment.
    self.SECRET_KEY = env.str('SECRET_KEY')
    self.KERKO_ZOTERO_API_KEY = env.str('KERKO_ZOTERO_API_KEY')
    self.KERKO_ZOTERO_LIBRARY_ID = env.str('KERKO_ZOTERO_LIBRARY_ID')
    self.KERKO_ZOTERO_LIBRARY_TYPE = env.str('KERKO_ZOTERO_LIBRARY_TYPE')
    self.KERKO_DATA_DIR = env.str('KERKO_DATA_DIR',
                                  str(app_dir / 'data' / 'kerko'))

    # Set other configuration variables.
    self.LOGGING_HANDLER = 'default'
    self.EXPLAIN_TEMPLATE_LOADING = False
    self.LIBSASS_INCLUDES = [
        str(pathlib.Path(__file__).parent.parent / 'static' / 'src' /
            'vendor' / 'bootstrap' / 'scss'),
        str(pathlib.Path(__file__).parent.parent / 'static' / 'src' /
            'vendor' / '@fortawesome' / 'fontawesome-free' / 'scss'),
    ]
    self.BABEL_DEFAULT_LOCALE = 'en_GB'
    self.KERKO_WHOOSH_LANGUAGE = 'en'
    self.KERKO_ZOTERO_LOCALE = 'en-GB'
    self.HOME_URL = 'https://opendeved.net'
    self.HOME_TITLE = _("Open Development & Education")
    # self.HOME_SUBTITLE = _("...")
    self.ABOUT_URL = 'https://opendeved.net/about/'
    self.BLOG_URL = 'https://opendeved.net/'
    self.JOBS_URL = 'https://opendeved.net/jobs/'
    self.PROGRAMMES_URL = 'https://opendeved.net/programmes/'
    self.CONTACTUS_URL = 'https://opendeved.net/contact-us/'
    self.NAV_TITLE = _("Evidence Library")
    self.KERKO_TITLE = _("Evidence Library – Open Development & Education")
    self.KERKO_PRINT_ITEM_LINK = True
    self.KERKO_PRINT_CITATIONS_LINK = True
    self.KERKO_RESULTS_FIELDS = [
        'id', 'attachments', 'bib', 'coins', 'data', 'preview', 'url'
    ]
    self.KERKO_RESULTS_ABSTRACTS = True
    self.KERKO_RESULTS_ABSTRACTS_MAX_LENGTH = 500
    self.KERKO_RESULTS_ABSTRACTS_MAX_LENGTH_LEEWAY = 40
    self.KERKO_TEMPLATE_LAYOUT = 'app/layout.html.jinja2'
    self.KERKO_TEMPLATE_SEARCH = 'app/search.html.jinja2'
    self.KERKO_TEMPLATE_SEARCH_ITEM = 'app/search-item.html.jinja2'
    self.KERKO_TEMPLATE_ITEM = 'app/item.html.jinja2'
    self.KERKO_DOWNLOAD_ATTACHMENT_NEW_WINDOW = True
    self.KERKO_RELATIONS_INITIAL_LIMIT = 50

    # CAUTION: The URL's query string must be changed after any edit to the
    # CSL style, otherwise zotero.org might still use a previously cached
    # version of the file.
    self.KERKO_CSL_STYLE = 'https://docs.opendeved.net/static/dist/csl/eth_apa.xml?202012301815'

    self.KERKO_COMPOSER = Composer(
        whoosh_language=self.KERKO_WHOOSH_LANGUAGE,
        exclude_default_facets=['facet_tag', 'facet_link', 'facet_item_type'],
        exclude_default_fields=['data'],
        default_item_exclude_re=r'^_exclude$',
        default_child_include_re=r'^(_publish|publishPDF)$',
        default_child_exclude_re=r'',
    )

    # Replace the default 'data' extractor to strip unwanted data from the
    # Extra field.
    self.KERKO_COMPOSER.add_field(
        FieldSpec(
            key='data',
            field_type=STORED,
            extractor=extractors.TransformerExtractor(
                extractor=extractors.RawDataExtractor(),
                transformers=[extra_field_cleaner]),
        ))

    # Add field for storing the formatted item preview used on search result
    # pages. This relies on the CSL style's in-text citation formatting and
    # only makes sense using our custom CSL style!
    self.KERKO_COMPOSER.add_field(
        FieldSpec(
            key='preview',
            field_type=STORED,
            extractor=extractors.TransformerExtractor(
                extractor=extractors.ItemExtractor(key='citation',
                                                   format_='citation'),
                # Zotero wraps the citation in a <span> element (most
                # probably because it expects the 'citation' format to be
                # used in-text), but that <span> has to be removed because
                # our custom CSL style causes <div>s to be nested within.
                # Let's replace that <span> with the same markup that the
                # 'bib' format usually provides.
                transformers=[
                    lambda value: re.sub(r'^<span>',
                                         '<div class="csl-entry">',
                                         value, count=1),
                    lambda value: re.sub(r'</span>$', '</div>',
                                         value, count=1),
                ])))

    # Add extractors for the 'alternate_id' field.
    self.KERKO_COMPOSER.fields['alternate_id'].extractor.extractors.append(
        extractors.TransformerExtractor(
            extractor=extractors.ItemDataExtractor(key='extra'),
            transformers=[
                transformers.find(
                    regex=r'^\s*EdTechHub.ItemAlsoKnownAs\s*:\s*(.*)$',
                    flags=re.IGNORECASE | re.MULTILINE,
                    max_matches=1,
                ),
                transformers.split(sep=';'),
            ]))
    self.KERKO_COMPOSER.fields['alternate_id'].extractor.extractors.append(
        extractors.TransformerExtractor(
            extractor=extractors.ItemDataExtractor(key='extra'),
            transformers=[
                transformers.find(
                    regex=r'^\s*KerkoCite.ItemAlsoKnownAs\s*:\s*(.*)$',
                    flags=re.IGNORECASE | re.MULTILINE,
                    max_matches=1,
                ),
                transformers.split(sep=' '),
            ]))
    self.KERKO_COMPOSER.fields['alternate_id'].extractor.extractors.append(
        extractors.TransformerExtractor(
            extractor=extractors.ItemDataExtractor(key='extra'),
            transformers=[
                transformers.find(
                    regex=r'^\s*shortDOI\s*:\s*(\S+)\s*$',
                    flags=re.IGNORECASE | re.MULTILINE,
                    max_matches=0,
                ),
            ]))

    self.KERKO_COMPOSER.add_facet(
        CollectionFacetSpec(
            key='facet_featured',
            filter_key='featured',
            title=_('Featured publisher'),
            weight=10,
            collection_key='JUDM2WBF',
        ))
    self.KERKO_COMPOSER.add_facet(
        CollectionFacetSpec(
            key='facet_pubtype',
            filter_key='pubtype',
            title=_('Publication type'),
            weight=20,
            collection_key='NEH6ARC4',
        ))
    self.KERKO_COMPOSER.add_facet(
        CollectionFacetSpec(
            key='facet_theme',
            filter_key='theme',
            title=_('Theme'),
            weight=30,
            collection_key='QK8NXPKZ',
        ))
    self.KERKO_COMPOSER.add_facet(
        CollectionFacetSpec(
            key='facet_location',
            filter_key='location',
            title=_('Location'),
            weight=50,
            collection_key='94GNF2EB',
        ))

    # OpenDevEd flag and badge.
    self.KERKO_COMPOSER.add_field(
        FieldSpec(
            key='opendeved',
            field_type=BOOLEAN(stored=True),
            extractor=extractors.InCollectionExtractor(
                collection_key='JG6T4YVA'),
        ))
    self.KERKO_COMPOSER.add_badge(
        BadgeSpec(
            key='opendeved',
            field=self.KERKO_COMPOSER.fields['opendeved'],
            activator=lambda field, item: bool(item.get(field.key)),
            renderer=TemplateRenderer(
                'app/_ode-badge.html.jinja2',
                badge_title=_('Published by Open Development & Education')),
            weight=100,
        ))

    # "Internal document" flag and badge.
    self.KERKO_COMPOSER.add_field(
        FieldSpec(
            key='internal',
            field_type=BOOLEAN(stored=True),
            extractor=MatchesTagExtractor(pattern=r'^_internal$'),
        ))
    self.KERKO_COMPOSER.add_badge(
        BadgeSpec(
            key='internal',
            field=self.KERKO_COMPOSER.fields['internal'],
            activator=lambda field, item: item.get(field.key, False),
            renderer=TemplateRenderer('app/_text-badge.html.jinja2',
                                      text=_('Internal<br />document')),
            weight=10,
        ))

    # "Coming soon" flag and badge.
    self.KERKO_COMPOSER.add_field(
        FieldSpec(
            key='comingsoon',
            field_type=BOOLEAN(stored=True),
            extractor=MatchesTagExtractor(pattern=r'^_comingsoon$'),
        ))
    self.KERKO_COMPOSER.add_badge(
        BadgeSpec(
            key='comingsoon',
            field=self.KERKO_COMPOSER.fields['comingsoon'],
            activator=lambda field, item: item.get(field.key, False),
            renderer=TemplateRenderer('app/_text-badge.html.jinja2',
                                      text=_('Coming<br />soon')),
            weight=20,
        ))

    # Sort option based on the OpenDevEd flag.
    self.KERKO_COMPOSER.add_sort(
        SortSpec(
            key='ode_desc',
            label=_('Open Development & Education first'),
            weight=100,
            fields=[
                self.KERKO_COMPOSER.fields['opendeved'],
                self.KERKO_COMPOSER.fields['sort_date'],
                self.KERKO_COMPOSER.fields['sort_creator'],
                self.KERKO_COMPOSER.fields['sort_title'],
            ],
            reverse=[False, True, False, False],
        ))
def build_schema(self, fields):
    # Copied from https://github.com/django-haystack/django-haystack/blob/v2.8.1/haystack/backends/whoosh_backend.py
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ""

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = WHOOSH_ID(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ["date", "datetime"]:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == "integer":
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == "float":
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == "boolean":
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == "ngram":
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == "edge_ngram":
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at="start", stored=field_class.stored,
                field_boost=field_class.boost,
            )
        else:
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True,
                analyzer=getattr(field_class, "analyzer", StemmingAnalyzer()),
                field_boost=field_class.boost,
                sortable=True,
            )
        schema_fields[field_class.index_fieldname].field_name = field_name

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. "
            "Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
def __init__(self, index_dir, backend, user_name=None, acl_support=False, **kw):
    """Store params, create schemas."""
    self.index_dir = index_dir
    self.index_dir_tmp = index_dir + '.temp'
    self.backend = backend
    self.user_name = user_name  # TODO use currently logged-in username
    self.acl_support = acl_support
    self.wikiname = u''  # TODO take from app.cfg.interwikiname
    self.ix = {}  # open indexes
    self.schemas = {}  # existing schemas

    common_fields = {
        # wikiname so we can have a shared index in a wiki farm, always check this!
        WIKINAME: ID(stored=True),
        # tokenized NAME from metadata - use this for manual searching from UI
        # TODO was: NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
        NAME: ID(stored=True, field_boost=2.0),
        # unmodified NAME from metadata - use this for precise lookup by the
        # code. Also needed for wildcard search, so the original string as
        # well as the query (with the wildcard) is not cut into pieces.
        NAME_EXACT: ID(field_boost=3.0),
        # revision id (aka meta id)
        REVID: ID(unique=True, stored=True),
        # MTIME from revision metadata (converted to UTC datetime)
        MTIME: DATETIME(stored=True),
        # tokenized CONTENTTYPE from metadata
        # TODO was: CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
        CONTENTTYPE: ID(stored=True),
        # unmodified list of TAGS from metadata
        TAGS: ID(stored=True),
        LANGUAGE: ID(stored=True),
        # USERID from metadata TODO: -> user ITEMID
        USERID: ID(stored=True),
        # ADDRESS from metadata
        ADDRESS: ID(stored=True),
        # HOSTNAME from metadata
        HOSTNAME: ID(stored=True),
        # SIZE from metadata
        SIZE: NUMERIC(stored=True),
        # ACTION from metadata
        ACTION: ID(stored=True),
        # tokenized COMMENT from metadata
        COMMENT: TEXT(stored=True),
        # data (content), converted to text/plain and tokenized
        CONTENT: TEXT(stored=True),
    }

    latest_revs_fields = {
        # ITEMID from metadata - as only the latest rev of an item is kept
        # here, it is unique
        ITEMID: ID(unique=True, stored=True),
        # unmodified list of ITEMLINKS from metadata
        ITEMLINKS: ID(stored=True),
        # unmodified list of ITEMTRANSCLUSIONS from metadata
        ITEMTRANSCLUSIONS: ID(stored=True),
        # tokenized ACL from metadata
        # TODO was: ACL: TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
        ACL: ID(stored=True),
    }
    latest_revs_fields.update(**common_fields)

    userprofile_fields = {
        EMAIL: ID(unique=True, stored=True),
        OPENID: ID(unique=True, stored=True),
    }
    latest_revs_fields.update(**userprofile_fields)

    all_revs_fields = {
        ITEMID: ID(stored=True),
    }
    all_revs_fields.update(**common_fields)

    latest_revisions_schema = Schema(**latest_revs_fields)
    all_revisions_schema = Schema(**all_revs_fields)

    # Define dynamic fields
    dynamic_fields = [
        ("*_id", ID(stored=True)),
        ("*_text", TEXT(stored=True)),
        ("*_keyword", KEYWORD(stored=True)),
        ("*_numeric", NUMERIC(stored=True)),
        ("*_datetime", DATETIME(stored=True)),
        ("*_boolean", BOOLEAN(stored=True)),
    ]

    # Add the dynamic fields to both schemas
    for glob, field_type in dynamic_fields:
        latest_revisions_schema.add(glob, field_type, glob=True)
        all_revisions_schema.add(glob, field_type, glob=True)

    # schemas are needed by the query parser and for index creation
    self.schemas[ALL_REVS] = all_revisions_schema
    self.schemas[LATEST_REVS] = latest_revisions_schema
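# Hypothetical sketch of how the dynamic (glob) fields declared above
# behave (not from the original source): any document field whose name
# matches a glob such as "*_text" is indexed with the associated field
# type. The directory, field names, and values are assumptions, and real
# callers would go through self.schemas[LATEST_REVS].
import os
from whoosh import index

if not os.path.exists("moin_index"):
    os.mkdir("moin_index")
ix = index.create_in("moin_index", latest_revisions_schema)
with ix.writer() as writer:
    writer.add_document(
        summary_text=u"matches the *_text glob, indexed as TEXT",
        views_numeric=42,        # matches *_numeric, indexed as NUMERIC
        approved_boolean=True,   # matches *_boolean, indexed as BOOLEAN
    )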
def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start', stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True, analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. "
            "Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))