Example #1
class NoteSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer, spelling=True)
    contents = TEXT(spelling=True)
    public = BOOLEAN()
    draft = BOOLEAN()
    short_code = ID(stored=True, unique=True)
    disable = BOOLEAN()
    reported = BOOLEAN()
    tags = TEXT(analyzer=tag_analyzer, spelling=True)
    user = ID()
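A minimal usage sketch for this schema (assuming Whoosh is installed and custom_analyzer/tag_analyzer are defined as above; the directory name is illustrative). BOOLEAN fields are self-parsing, so a term like public:true works directly in a query string:

import os
from whoosh import index
from whoosh.qparser import QueryParser

if not os.path.exists("notes_index"):
    os.mkdir("notes_index")
ix = index.create_in("notes_index", NoteSchema)

# BOOLEAN accepts plain Python bools when adding documents
with ix.writer() as writer:
    writer.add_document(id=u"1", title=u"First note",
                        contents=u"hello world", public=True, draft=False)

with ix.searcher() as searcher:
    # "public:true" is parsed by the BOOLEAN field itself
    query = QueryParser("contents", ix.schema).parse(u"hello public:true")
    print([hit["id"] for hit in searcher.search(query)])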
Example #2
class RecipeSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer)
    description = TEXT(analyzer=custom_analyzer)
    public = BOOLEAN()
    deleted = BOOLEAN()
    reported = BOOLEAN()
    short_code = ID(stored=True, unique=True)
    tags = TEXT(analyzer=tag_analyzer)
    user = ID()
    steps = TEXT(analyzer=custom_analyzer)
    ingredients = TEXT(analyzer=custom_analyzer)
    country = TEXT()
Example #3
    def fields_map(self, field_type):
        if field_type == "primary":
            return ID(stored=True, unique=True)
        type_map = {
            'date': types.Date,
            'datetime': types.DateTime,
            'boolean': types.Boolean,
            'integer': types.Integer,
            'float': types.Float
        }
        if isinstance(field_type, str):
            field_type = type_map.get(field_type, types.Text)

        if not isinstance(field_type, type):
            field_type = field_type.__class__

        if issubclass(field_type, (types.DateTime, types.Date)):
            return DATETIME(stored=True, sortable=True)
        elif issubclass(field_type, types.Integer):
            return NUMERIC(stored=True, numtype=int)
        elif issubclass(field_type, types.Float):
            return NUMERIC(stored=True, numtype=float)
        elif issubclass(field_type, types.Boolean):
            return BOOLEAN(stored=True)
        return TEXT(stored=True, analyzer=self.analyzer, sortable=False)
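For illustration, here is how the mapping above resolves a few inputs (a sketch; backend stands for whatever object provides fields_map and an analyzer attribute, and the types module appears to refer to SQLAlchemy column types):

backend.fields_map("primary")   # -> ID(stored=True, unique=True)
backend.fields_map("boolean")   # -> BOOLEAN(stored=True)
backend.fields_map("integer")   # -> NUMERIC(stored=True, numtype=int)
backend.fields_map("anything")  # unknown strings fall back to TEXT(...)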
Example #4
 def mk_index(self, indexpath="indexdir", overwrite=False):
     """
     creates an index for IR operations
     """
     if os.path.exists(indexpath):
         if overwrite:
             shutil.rmtree(indexpath)
     if not os.path.exists(indexpath):
         os.mkdir(indexpath)
     self.context_schema = Schema(contextid=ID(stored=True),
                                  title=TEXT(stored=True),
                                  context=TEXT(stored=True,
                                               analyzer=StemmingAnalyzer()))
     self.context_ix = index.create_in("indexdir", self.context_schema)
     self.question_schema = Schema(contextid=ID(stored=True),
                                   questionid=ID(stored=True),
                                   is_impossible=BOOLEAN(stored=True),
                                   question=TEXT(
                                       stored=True,
                                       analyzer=StemmingAnalyzer()))
     self.question_ix = index.create_in("indexdir", self.question_schema)
     self.answer_schema = Schema(contextid=ID(stored=True),
                                 questionid=ID(stored=True),
                                 answerid=ID(stored=True),
                                 answer_start=NUMERIC(stored=True),
                                 answer=TEXT(stored=True,
                                             analyzer=StemmingAnalyzer()))
     self.answer_ix = index.create_in("indexdir", self.answer_schema)
Example #5
class LawSchema(SchemaClass):
    PK = ID(unique=True)
    ACTIVE_FLG = BOOLEAN()
    ARTICLE = ID()
    ARTICLE_HEADING = TEXT(stored=True)
    ARTICLE_HISTORY = TEXT(stored=True)
    CHAPTER = ID()
    CHAPTER_HEADING = TEXT(stored=True)
    CODE_HEADING = TEXT(stored=True)
    DIVISION = ID()
    DIVISION_HEADING = TEXT(stored=True)
    EFFECTIVE_DATE = DATETIME(stored=True)
    HISTORY = TEXT(stored=True)
    LAW_CODE = ID()
    LAW_SECTION_VERSION_ID = ID()
    LEGAL_TEXT = TEXT(stored=True)
    LOB_FILE = ID()
    OP_CHAPTER = ID()
    OP_SECTION = ID()
    OP_STATUES = ID()
    PART = ID()
    SECTION_HISTORY = TEXT(stored=True)
    SECTION_NUM = ID(stored=True)
    SECTION_TITLE = TEXT(stored=True)
    TITLE = ID()
    TRANS_UID = ID()
    TRANS_UPDATE = DATETIME()
Example #6
class UserSchema(SchemaClass):

    pk = ID(stored=True, unique=True)
    fullname = TEXT(stored=True, spelling=True)
    username = TEXT(stored=True, spelling=True)
    organization = BOOLEAN(stored=True)
    content = NGRAM(minsize=1, phrase=True)
Example #7
def spam_schema():
    analyzer = StemmingAnalyzer(cachesize=-1)
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    content=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    content_length=NUMERIC(stored=True, sortable=True),
                    uid=ID(stored=True),
                    is_spam=BOOLEAN(stored=True))
    return schema
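A hedged sketch of filtering on the is_spam flag programmatically; whoosh.query.Term accepts a Python bool for BOOLEAN fields (the SearchIndexer examples further down rely on the same behaviour), and content_length is declared sortable:

import os
from whoosh import index
from whoosh.query import Term

os.makedirs("spam_index", exist_ok=True)
ix = index.create_in("spam_index", spam_schema())
with ix.writer() as writer:
    writer.add_document(uid=u"a1", title=u"Cheap pills", content=u"buy now",
                        content_length=7, is_spam=True)
    writer.add_document(uid=u"a2", title=u"Meeting notes", content=u"agenda",
                        content_length=6, is_spam=False)

with ix.searcher() as searcher:
    results = searcher.search(Term("is_spam", True), sortedby="content_length")
    print([hit["uid"] for hit in results])  # only the spam document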
Example #8
class PhotSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer, spelling=True)
    short_code = ID(stored=True, unique=True)
    disable = BOOLEAN()
    tags = TEXT(analyzer=tag_analyzer, spelling=True)
    user = ID()
Example #9
def get_schema():
    return Schema(
        id=NUMERIC(stored=True, unique=True),
        title=TEXT(sortable=True),
        content=TEXT(),
        asn=NUMERIC(sortable=True),
        correspondent=TEXT(sortable=True),
        correspondent_id=NUMERIC(),
        has_correspondent=BOOLEAN(),
        tag=KEYWORD(commas=True, scorable=True, lowercase=True),
        tag_id=KEYWORD(commas=True, scorable=True),
        has_tag=BOOLEAN(),
        type=TEXT(sortable=True),
        type_id=NUMERIC(),
        has_type=BOOLEAN(),
        created=DATETIME(sortable=True),
        modified=DATETIME(sortable=True),
        added=DATETIME(sortable=True),
    )
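The has_correspondent/has_tag/has_type booleans act as presence flags alongside the actual values, which keeps "documents without X" queries cheap. A sketch of such a query against the schema returned above:

from whoosh.qparser import QueryParser

parser = QueryParser("content", get_schema())
# e.g. documents matching "invoice" that have no correspondent assigned
query = parser.parse(u"invoice AND has_correspondent:false")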
Example #10
def ingest_languages(writer: BufferedWriter):
    print('INFO: start languages ingestion')
    path = join(configuration.DATA, 'languages.csv')
    with open(path, 'r') as stream:
        for line in stream:
            locale, label = line.strip().split(',')
            print(f'\tingest [{locale}] language')
            writer.add_field(locale, BOOLEAN())
            storage.lpush('locales', locale)
            storage.set(f'locale:{locale}', label)
Example #11
def get_whoosh_schema():
    return Schema(offerId=ID(stored=True),
                  university=TEXT(stored=True),
                  enterprise=TEXT(stored=True),
                  months=NUMERIC(stored=True),
                  salary=NUMERIC(stored=True),
                  country=TEXT(stored=True),
                  province=TEXT(stored=True),
                  city=TEXT(stored=True),
                  description=TEXT(stored=True),
                  immediate=BOOLEAN(stored=True))
Example #12
def get_schema(schema):
    kwargs = {}
    for key, value in schema.items():
        if value == "indexed":
            kwargs[key] = NGRAMWORDS(minsize=2, sortable=True)
        elif value == "id_stored":
            kwargs[key] = ID(stored=True)
        elif value == "unique_id_stored":
            kwargs[key] = ID(unique=True, stored=True)
        elif value == "boolean":
            kwargs[key] = BOOLEAN(stored=True)
    return Schema(**kwargs)
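A usage sketch for this factory; keys whose marker string matches none of the branches are silently dropped from the resulting schema (the field names here are hypothetical):

schema = get_schema({
    "pk": "unique_id_stored",
    "title": "indexed",
    "category": "id_stored",
    "published": "boolean",
})
print(sorted(schema.names()))  # ['category', 'pk', 'published', 'title']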
Example #13
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True,
                                                                         field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int,
                                                                     field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float,
                                                                     field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored,
                                                                   field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start',
                                                                        stored=field_class.stored,
                                                                        field_boost=field_class.boost)
            else:
                # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
                # Chinese word segmentation: use ChineseAnalyzer instead of the stock StemmingAnalyzer
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=ChineseAnalyzer(),
                                                                  field_boost=field_class.boost, sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))
Example #14
 def __get_index_schema(self):
     """
     :return: organization index schema
     """
     return Schema(id=NUMERIC(stored=True),
                   url=ID(stored=True),
                   external_id=ID(stored=True),
                   name=ID(stored=True),
                   domain_names=KEYWORD(stored=True, commas=True),
                   created_at=ID(stored=True),
                   details=ID(stored=True),
                   shared_tickets=BOOLEAN(stored=True),
                   tags=KEYWORD(stored=True, commas=True))
Example #15
 def __get_index_schema(self):
     """
     :return: user index schema
     """
     return Schema(id=NUMERIC(stored=True),
                   url=ID(stored=True),
                   external_id=ID(stored=True),
                   name=ID(stored=True),
                   alias=ID(stored=True),
                   created_at=ID(stored=True),
                   active=BOOLEAN(stored=True),
                   verified=BOOLEAN(stored=True),
                   shared=BOOLEAN(stored=True),
                   locale=ID(stored=True),
                   timezone=ID(stored=True),
                   last_login_at=ID(stored=True),
                   email=ID(stored=True),
                   phone=ID(stored=True),
                   signature=ID(stored=True),
                   organization_id=NUMERIC(stored=True),
                   tags=KEYWORD(stored=True, commas=True),
                   suspended=BOOLEAN(stored=True),
                   role=ID(stored=True))
Example #16
    def create(self):
        analyzer = StandardAnalyzer(minsize=1, stoplist=None) | CleanUpFilter()
        schema = Schema(
            source=TEXT(stored=True, analyzer=analyzer),
            target=TEXT(stored=True, analyzer=analyzer),
            comment=TEXT(stored=True),
            context=TEXT(stored=True),
            softcatala=BOOLEAN(stored=True),
            project=TEXT(stored=True),
        )
        if not os.path.exists(self.dir_name):
            os.mkdir(self.dir_name)

        ix = create_in(self.dir_name, schema)
        self.writer = ix.writer()
Example #17
    def fields(self):
        model = self.table
        schema_fields = {'id': ID(stored=True, unique=True)}
        searchable = set(model.__searchable__)
        analyzer = getattr(model, '__msearch_analyzer__') if hasattr(
            model, '__msearch_analyzer__') else self.analyzer
        primary_keys = [key.name for key in inspect(model).primary_key]

        for field in searchable:
            if '.' in field:
                fields = field.split('.')
                field_attr = getattr(
                    getattr(model, fields[0]).property.mapper.class_,
                    fields[1])
            else:
                field_attr = getattr(model, field)
            if hasattr(field_attr, 'descriptor') and isinstance(
                    field_attr.descriptor, hybrid_property):
                field_type = Text
                type_hint = getattr(field_attr, 'type_hint', None)
                if type_hint is not None:
                    type_hint_map = {
                        'date': Date,
                        'datetime': DateTime,
                        'boolean': Boolean,
                        'integer': Integer,
                        'float': Float
                    }
                    field_type = type_hint if isclass(
                        type_hint) else type_hint_map.get(
                            type_hint.lower(), Text)
            else:
                field_type = field_attr.property.columns[0].type
            if field in primary_keys:
                schema_fields[field] = ID(stored=True, unique=True)
            elif field_type in (DateTime, Date):
                schema_fields[field] = DATETIME(stored=True, sortable=True)
            elif field_type == Integer:
                schema_fields[field] = NUMERIC(stored=True, numtype=int)
            elif field_type == Float:
                schema_fields[field] = NUMERIC(stored=True, numtype=float)
            elif field_type == Boolean:
                schema_fields[field] = BOOLEAN(stored=True)
            else:
                schema_fields[field] = TEXT(
                    stored=True, analyzer=analyzer, sortable=False)
        return schema_fields
Example #18
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True, commas=True, scorable=True)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, numtype=int)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, numtype=float)
            elif field_class.field_type == 'boolean':
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True, analyzer=StemmingAnalyzer())

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #19
class Mydocument(SchemaClass):
    project_name = ID(stored=True)
    project_id = ID(stored=True)
    snapshot_name = ID(stored=True)
    snapshot_id = ID(stored=True)
    branch_name = ID(stored=True)
    branch_id = ID(stored=True)
    item_name = ID(stored=True)
    item_id = ID(stored=True)
    item_type = ID(stored=True)
    item_content = TEXT(stored=True)
    is_toolkit = BOOLEAN(stored=True)
    toolkit_project_name = ID(stored=True)
    toolkit_project_id = ID(stored=True)
    toolkit_snapshot_name = ID(stored=True)
    toolkit_snapshot_id = ID(stored=True)
    toolkit_branch_name = ID(stored=True)
    toolkit_branch_id = ID(stored=True)
    apppath = ID(stored=True)
Example #20
 def __get_index_schema(self):
     """
     :return: ticket index schema
     """
     return Schema(status=ID(stored=True),
                   assignee_id=NUMERIC(stored=True),
                   via=ID(stored=True),
                   description=ID(stored=True),
                   tags=KEYWORD(stored=True, commas=True),
                   url=ID(stored=True),
                   external_id=ID(stored=True),
                   created_at=ID(stored=True),
                   submitter_id=NUMERIC(stored=True),
                   priority=ID(stored=True),
                   due_at=ID(stored=True),
                   organization_id=NUMERIC(stored=True),
                   has_incidents=BOOLEAN(stored=True),
                   id=ID(stored=True),
                   type=ID(stored=True),
                   subject=ID(stored=True))
Example #21
def get_schema():
    analyzer = StemmingAnalyzer(stoplist=STOP)
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    url=ID(stored=True),
                    content_length=NUMERIC(stored=True, sortable=True),
                    thread_votecount=NUMERIC(stored=True, sortable=True),
                    vote_count=NUMERIC(stored=True, sortable=True),
                    content=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    tags=KEYWORD(stored=True, commas=True),
                    is_toplevel=BOOLEAN(stored=True),
                    lastedit_date=NUMERIC(stored=True, sortable=True),
                    rank=NUMERIC(stored=True, sortable=True),
                    author=TEXT(stored=True),
                    author_score=NUMERIC(stored=True, sortable=True),
                    author_handle=TEXT(stored=True),
                    author_uid=ID(stored=True),
                    author_url=ID(stored=True),
                    uid=ID(stored=True),
                    type=NUMERIC(stored=True, sortable=True),
                    type_display=TEXT(stored=True))
    return schema
Example #22
def get_schema():
    analyzer = StemmingAnalyzer(stoplist=STOP)
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    url=ID(stored=True),
                    content_length=NUMERIC(stored=True, sortable=True),
                    thread_votecount=NUMERIC(stored=True, sortable=True),
                    vote_count=NUMERIC(stored=True, sortable=True),
                    content=TEXT(stored=True, analyzer=analyzer,
                                 sortable=True),
                    tags=KEYWORD(stored=True, commas=True),
                    is_toplevel=BOOLEAN(stored=True),
                    author_is_moderator=BOOLEAN(stored=True),
                    lastedit_user_is_moderator=BOOLEAN(stored=True),
                    lastedit_user_is_suspended=BOOLEAN(stored=True),
                    author_is_suspended=BOOLEAN(stored=True),
                    lastedit_date=DATETIME(stored=True, sortable=True),
                    creation_date=DATETIME(stored=True, sortable=True),
                    rank=NUMERIC(stored=True, sortable=True),
                    author=TEXT(stored=True),
                    lastedit_user=TEXT(stored=True),
                    lastedit_user_email=TEXT(stored=True),
                    lastedit_user_score=NUMERIC(stored=True, sortable=True),
                    lastedit_user_uid=ID(stored=True),
                    lastedit_user_url=ID(stored=True),
                    author_score=NUMERIC(stored=True, sortable=True),
                    author_handle=TEXT(stored=True),
                    author_email=TEXT(stored=True),
                    author_uid=ID(stored=True),
                    author_url=ID(stored=True),
                    root_has_accepted=BOOLEAN(stored=True),
                    reply_count=NUMERIC(stored=True, sortable=True),
                    view_count=NUMERIC(stored=True, sortable=True),
                    answer_count=NUMERIC(stored=True, sortable=True),
                    uid=ID(stored=True),
                    type=NUMERIC(stored=True, sortable=True),
                    type_display=TEXT(stored=True))
    return schema
Example #23
class SearchIndexer:
    """Full-text search indexer."""

    # schema for searches of all (public + private) info
    SCHEMA = Schema(
        type=ID(stored=True, sortable=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        text_private=TEXT(),
        change=DATETIME(sortable=True, stored=True),
    )

    # schema for searches of public info only
    SCHEMA_PUBLIC = Schema(
        type=ID(stored=True, sortable=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        change=DATETIME(sortable=True, stored=True),
    )

    def __init__(self, index_dir: FilenameOrPath):
        """Initialize given an index dir path."""
        self.index_dir = Path(index_dir)
        self.index_dir.mkdir(exist_ok=True)
        # query parser for all (public + private) content
        self.query_parser_all = MultifieldParser(["text", "text_private"],
                                                 schema=self.SCHEMA)
        # query parser for public content only
        self.query_parser_public = QueryParser("text",
                                               schema=self.SCHEMA_PUBLIC)

    def index(self, overwrite=False):
        """Return the index; create if doesn't exist."""
        index_dir = str(self.index_dir)
        if overwrite or not index.exists_in(index_dir):
            return index.create_in(index_dir, self.SCHEMA)
        return index.open_dir(index_dir)

    def _add_obj_strings(self, writer, obj_dict):
        """Add or update an object to the index."""
        try:
            writer.update_document(
                type=obj_dict["class_name"].lower(),
                handle=obj_dict["handle"],
                private=obj_dict["private"],
                text=obj_dict["string"],
                text_private=obj_dict["string_private"],
                change=obj_dict["change"],
            )
        except Exception:
            current_app.logger.error("Failed adding object {}".format(
                obj_dict["handle"]))

    def reindex_full(self, db_handle: DbReadBase):
        """Reindex the whole database."""
        with self.index(overwrite=True).writer() as writer:
            for obj_dict in iter_obj_strings(db_handle):
                self._add_obj_strings(writer, obj_dict)

    def _get_object_timestamps(self):
        """Get a dictionary with the timestamps of all objects in the index."""
        d = {}
        with self.index().searcher() as searcher:
            for fields in searcher.all_stored_fields():
                class_name = fields["type"]
                if class_name not in d:
                    d[class_name] = set()
                d[class_name].add((fields["handle"], fields["change"]))
        return d

    def _get_update_info(self, db_handle: DbReadBase):
        """Get a dictionary with info about changed objects in the db."""
        db_timestamps = get_object_timestamps(db_handle)
        ix_timestamps = self._get_object_timestamps()
        deleted = {}
        updated = {}
        new = {}
        for class_name in db_timestamps:
            db_handles = set(handle for handle, _ in db_timestamps[class_name])
            ix_handles = set(
                handle
                for handle, _ in ix_timestamps.get(class_name.lower(), set()))
            # new: not present in index
            new[class_name] = db_handles - ix_handles
            # deleted: not present in db
            deleted[class_name] = ix_handles - db_handles
            # changed: different (new or modified) in db
            changed_timestamps = db_timestamps[class_name] - ix_timestamps.get(
                class_name.lower(), set())
            changed_handles = set(handle for handle, _ in changed_timestamps)
            # updated: changed and present in the index
            updated[class_name] = changed_handles & ix_handles
        return {"deleted": deleted, "updated": updated, "new": new}

    def delete_object(self, writer, handle: str):
        """Delete an object from the index."""
        writer.delete_by_term("handle", handle)

    def add_or_update_object(self, writer, handle: str, db_handle: DbReadBase,
                             class_name: str):
        """Add an object to the index or update it if it exists."""
        obj_dict = obj_strings_from_handle(db_handle, class_name, handle)
        self._add_obj_strings(writer, obj_dict)

    def get_writer(self, overwrite: bool = False, use_async: bool = False):
        """Get a writer instance.

        If `use_async` is true, use an `AsyncWriter`.
        """
        idx = self.index(overwrite=overwrite)
        if use_async:
            return AsyncWriter(idx, delay=0.1)
        return idx.writer()

    def reindex_incremental(self, db_handle: DbReadBase):
        """Update the index incrementally."""
        update_info = self._get_update_info(db_handle)
        with self.index(overwrite=False).writer() as writer:
            # delete objects
            for class_name, handles in update_info["deleted"].items():
                for handle in handles:
                    self.delete_object(writer, handle)
            # add objects
            for class_name, handles in update_info["new"].items():
                for handle in handles:
                    self.add_or_update_object(writer, handle, db_handle,
                                              class_name)
            # update objects
            for class_name, handles in update_info["updated"].items():
                for handle in handles:
                    self.add_or_update_object(writer, handle, db_handle,
                                              class_name)

    @staticmethod
    def format_hit(hit: Hit) -> Dict[str, Any]:
        """Format a search hit."""
        return {
            "handle": hit["handle"],
            "object_type": hit["type"],
            "rank": hit.rank,
            "score": hit.score,
        }

    def _get_sorting(
        self,
        sort: Optional[List[str]] = None,
    ) -> Optional[List[FieldFacet]]:
        """Get the appropriate field facets for sorting."""
        if not sort:
            return None
        facets = []
        allowed_sorters = {"type", "change"}
        for sorter in sort:
            _field = sorter.lstrip("+-")
            if _field not in allowed_sorters:
                continue
            reverse = sorter.startswith("-")
            facets.append(FieldFacet(_field, reverse=reverse))
        return facets

    def search(
        self,
        query: str,
        page: int,
        pagesize: int,
        include_private: bool = True,
        extend: bool = False,
        sort: Optional[List[str]] = None,
    ):
        """Search the index.

        If `include_private` is true, include also private objects and
        search in private fields.
        """
        query_parser = (self.query_parser_all
                        if include_private else self.query_parser_public)
        query_parser.add_plugin(DateParserPlugin())
        # if private objects should not be shown, add a mask
        mask = None if include_private else Term("private", True)
        parsed_query = query_parser.parse(query)
        with self.index().searcher() as searcher:
            sortedby = self._get_sorting(sort)
            results = searcher.search_page(parsed_query,
                                           page,
                                           pagesize,
                                           mask=mask,
                                           sortedby=sortedby)
            return results.total, [self.format_hit(hit) for hit in results]
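A usage sketch for the indexer above (the path is a placeholder, and reindex_full needs a database handle first). With include_private=False, the mask built from Term("private", True) drops private objects from the results:

indexer = SearchIndexer("/tmp/search_index")
total, hits = indexer.search("smith", page=1, pagesize=20,
                             include_private=False, sort=["-change"])
for hit in hits:
    print(hit["object_type"], hit["handle"], hit["score"])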
Example #24
class SearchIndexer:
    """Full-text search indexer."""

    # schema for searches of all (public + private) info
    SCHEMA = Schema(
        type=ID(stored=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        text_private=TEXT(),
        changed=DATETIME(),
    )

    # schema for searches of public info only
    SCHEMA_PUBLIC = Schema(
        type=ID(stored=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        changed=DATETIME(),
    )

    def __init__(self, index_dir: FilenameOrPath):
        """Initialize given an index dir path."""
        self.index_dir = Path(index_dir)
        self.index_dir.mkdir(exist_ok=True)
        # query parser for all (public + private) content
        self.query_parser_all = MultifieldParser(["text", "text_private"],
                                                 schema=self.SCHEMA)
        # query parser for public content only
        self.query_parser_public = QueryParser("text",
                                               schema=self.SCHEMA_PUBLIC)

    def index(self, overwrite=False):
        """Return the index; create if doesn't exist."""
        index_dir = str(self.index_dir)
        if overwrite or not index.exists_in(index_dir):
            return index.create_in(index_dir, self.SCHEMA)
        return index.open_dir(index_dir)

    def reindex_full(self, db_handle: DbReadBase):
        """Reindex the whole database."""
        with self.index(overwrite=True).writer() as writer:
            for obj_dict in iter_obj_strings(db_handle):
                writer.add_document(
                    type=obj_dict["class_name"].lower(),
                    handle=obj_dict["handle"],
                    text=obj_dict["string"],
                    text_private=obj_dict["string_private"],
                    changed=obj_dict["changed"],
                )

    @staticmethod
    def format_hit(hit: Hit) -> Dict[str, Any]:
        """Format a search hit."""
        return {
            "handle": hit["handle"],
            "object_type": hit["type"],
            "rank": hit.rank,
            "score": hit.score,
        }

    def search(
        self,
        query: str,
        page: int,
        pagesize: int,
        include_private: bool = True,
        extend: bool = False,
    ):
        """Search the index.

        If `include_private` is true, include also private objects and
        search in private fields.
        """
        query_parser = (self.query_parser_all
                        if include_private else self.query_parser_public)
        query_parser.add_plugin(DateParserPlugin())
        # if private objects should not be shown, add a mask
        mask = None if include_private else Term("private", True)
        parsed_query = query_parser.parse(query)
        with self.index().searcher() as searcher:
            results = searcher.search_page(parsed_query,
                                           page,
                                           pagesize,
                                           mask=mask)
            return results.total, [self.format_hit(hit) for hit in results]
Example #25
                path=TEXT(stored=True),
                content=FieldType(format=Characters(),
                                  analyzer=ANALYZER,
                                  scorable=True,
                                  stored=True),
                modtime=STORED(),
                extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),
    author=TEXT(stored=True),
    message=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    parents=TEXT(),
    added=TEXT(),
    removed=TEXT(),
    changed=TEXT(),
)

CHGSET_IDX_NAME = 'CHGSET_INDEX'
Example #26
    def __init__(self):
        app_dir = pathlib.Path(env.str('FLASK_APP')).parent.absolute()

        # Get configuration values from the environment.
        self.SECRET_KEY = env.str('SECRET_KEY')
        self.KERKO_ZOTERO_API_KEY = env.str('KERKO_ZOTERO_API_KEY')
        self.KERKO_ZOTERO_LIBRARY_ID = env.str('KERKO_ZOTERO_LIBRARY_ID')
        self.KERKO_ZOTERO_LIBRARY_TYPE = env.str('KERKO_ZOTERO_LIBRARY_TYPE')
        self.KERKO_DATA_DIR = env.str('KERKO_DATA_DIR',
                                      str(app_dir / 'data' / 'kerko'))

        # Set other configuration variables.
        self.LOGGING_HANDLER = 'default'
        self.EXPLAIN_TEMPLATE_LOADING = False

        self.LIBSASS_INCLUDES = [
            str(
                pathlib.Path(__file__).parent.parent / 'static' / 'src' /
                'vendor' / 'bootstrap' / 'scss'),
            str(
                pathlib.Path(__file__).parent.parent / 'static' / 'src' /
                'vendor' / '@fortawesome' / 'fontawesome-free' / 'scss'),
        ]

        self.BABEL_DEFAULT_LOCALE = 'en_GB'
        self.KERKO_WHOOSH_LANGUAGE = 'en'
        self.KERKO_ZOTERO_LOCALE = 'en-GB'

        self.HOME_URL = 'https://opendeved.net'
        self.HOME_TITLE = _("Open Development & Education")
        # self.HOME_SUBTITLE = _("...")

        self.ABOUT_URL = 'https://opendeved.net/about/'
        self.BLOG_URL = 'https://opendeved.net/'
        self.JOBS_URL = 'https://opendeved.net/jobs/'
        self.PROGRAMMES_URL = 'https://opendeved.net/programmes/'
        self.CONTACTUS_URL = 'https://opendeved.net/contact-us/'

        self.NAV_TITLE = _("Evidence Library")
        self.KERKO_TITLE = _("Evidence Library – Open Development & Education")
        self.KERKO_PRINT_ITEM_LINK = True
        self.KERKO_PRINT_CITATIONS_LINK = True
        self.KERKO_RESULTS_FIELDS = [
            'id', 'attachments', 'bib', 'coins', 'data', 'preview', 'url'
        ]
        self.KERKO_RESULTS_ABSTRACTS = True
        self.KERKO_RESULTS_ABSTRACTS_MAX_LENGTH = 500
        self.KERKO_RESULTS_ABSTRACTS_MAX_LENGTH_LEEWAY = 40
        self.KERKO_TEMPLATE_LAYOUT = 'app/layout.html.jinja2'
        self.KERKO_TEMPLATE_SEARCH = 'app/search.html.jinja2'
        self.KERKO_TEMPLATE_SEARCH_ITEM = 'app/search-item.html.jinja2'
        self.KERKO_TEMPLATE_ITEM = 'app/item.html.jinja2'
        self.KERKO_DOWNLOAD_ATTACHMENT_NEW_WINDOW = True
        self.KERKO_RELATIONS_INITIAL_LIMIT = 50

        # CAUTION: The URL's query string must be changed after any edit to the CSL
        # style, otherwise zotero.org might still use a previously cached version of
        # the file.
        self.KERKO_CSL_STYLE = 'https://docs.opendeved.net/static/dist/csl/eth_apa.xml?202012301815'

        self.KERKO_COMPOSER = Composer(
            whoosh_language=self.KERKO_WHOOSH_LANGUAGE,
            exclude_default_facets=[
                'facet_tag', 'facet_link', 'facet_item_type'
            ],
            exclude_default_fields=['data'],
            default_item_exclude_re=r'^_exclude$',
            default_child_include_re=r'^(_publish|publishPDF)$',
            default_child_exclude_re=r'',
        )

        # Replace the default 'data' extractor to strip unwanted data from the Extra field.
        self.KERKO_COMPOSER.add_field(
            FieldSpec(
                key='data',
                field_type=STORED,
                extractor=extractors.TransformerExtractor(
                    extractor=extractors.RawDataExtractor(),
                    transformers=[extra_field_cleaner]),
            ))

        # Add field for storing the formatted item preview used on search result
        # pages. This relies on the CSL style's in-text citation formatting and only
        # makes sense using our custom CSL style!
        self.KERKO_COMPOSER.add_field(
            FieldSpec(
                key='preview',
                field_type=STORED,
                extractor=extractors.TransformerExtractor(
                    extractor=extractors.ItemExtractor(key='citation',
                                                       format_='citation'),
                    # Zotero wraps the citation in a <span> element (most probably
                    # because it expects the 'citation' format to be used in-text),
                    # but that <span> has to be removed because our custom CSL style
                    # causes <div>s to be nested within. Let's replace that <span>
                    # with the same markup that the 'bib' format usually provides.
                    transformers=[
                        lambda value: re.sub(r'^<span>',
                                             '<div class="csl-entry">',
                                             value,
                                             count=1),
                        lambda value: re.sub(
                            r'</span>$', '</div>', value, count=1),
                    ])))

        # Add extractors for the 'alternate_id' field.
        self.KERKO_COMPOSER.fields['alternate_id'].extractor.extractors.append(
            extractors.TransformerExtractor(
                extractor=extractors.ItemDataExtractor(key='extra'),
                transformers=[
                    transformers.find(
                        regex=r'^\s*EdTechHub.ItemAlsoKnownAs\s*:\s*(.*)$',
                        flags=re.IGNORECASE | re.MULTILINE,
                        max_matches=1,
                    ),
                    transformers.split(sep=';'),
                ]))
        self.KERKO_COMPOSER.fields['alternate_id'].extractor.extractors.append(
            extractors.TransformerExtractor(
                extractor=extractors.ItemDataExtractor(key='extra'),
                transformers=[
                    transformers.find(
                        regex=r'^\s*KerkoCite.ItemAlsoKnownAs\s*:\s*(.*)$',
                        flags=re.IGNORECASE | re.MULTILINE,
                        max_matches=1,
                    ),
                    transformers.split(sep=' '),
                ]))
        self.KERKO_COMPOSER.fields['alternate_id'].extractor.extractors.append(
            extractors.TransformerExtractor(
                extractor=extractors.ItemDataExtractor(key='extra'),
                transformers=[
                    transformers.find(
                        regex=r'^\s*shortDOI\s*:\s*(\S+)\s*$',
                        flags=re.IGNORECASE | re.MULTILINE,
                        max_matches=0,
                    ),
                ]))

        self.KERKO_COMPOSER.add_facet(
            CollectionFacetSpec(
                key='facet_featured',
                filter_key='featured',
                title=_('Featured publisher'),
                weight=10,
                collection_key='JUDM2WBF',
            ))

        self.KERKO_COMPOSER.add_facet(
            CollectionFacetSpec(
                key='facet_pubtype',
                filter_key='pubtype',
                title=_('Publication type'),
                weight=20,
                collection_key='NEH6ARC4',
            ))

        self.KERKO_COMPOSER.add_facet(
            CollectionFacetSpec(
                key='facet_theme',
                filter_key='theme',
                title=_('Theme'),
                weight=30,
                collection_key='QK8NXPKZ',
            ))

        self.KERKO_COMPOSER.add_facet(
            CollectionFacetSpec(
                key='facet_location',
                filter_key='location',
                title=_('Location'),
                weight=50,
                collection_key='94GNF2EB',
            ))

        # OpenDevEd flag and badge.
        self.KERKO_COMPOSER.add_field(
            FieldSpec(
                key='opendeved',
                field_type=BOOLEAN(stored=True),
                extractor=extractors.InCollectionExtractor(
                    collection_key='JG6T4YVA'),
            ))
        self.KERKO_COMPOSER.add_badge(
            BadgeSpec(
                key='opendeved',
                field=self.KERKO_COMPOSER.fields['opendeved'],
                activator=lambda field, item: bool(item.get(field.key)),
                renderer=TemplateRenderer(
                    'app/_ode-badge.html.jinja2',
                    badge_title=_(
                        'Published by Open Development & Education')),
                weight=100,
            ))
        # "Internal document" flag and badge.
        self.KERKO_COMPOSER.add_field(
            FieldSpec(
                key='internal',
                field_type=BOOLEAN(stored=True),
                extractor=MatchesTagExtractor(pattern=r'^_internal$'),
            ))
        self.KERKO_COMPOSER.add_badge(
            BadgeSpec(
                key='internal',
                field=self.KERKO_COMPOSER.fields['internal'],
                activator=lambda field, item: item.get(field.key, False),
                renderer=TemplateRenderer('app/_text-badge.html.jinja2',
                                          text=_('Internal<br />document')),
                weight=10,
            ))
        # "Coming soon" flag and badge.
        self.KERKO_COMPOSER.add_field(
            FieldSpec(
                key='comingsoon',
                field_type=BOOLEAN(stored=True),
                extractor=MatchesTagExtractor(pattern=r'^_comingsoon$'),
            ))
        self.KERKO_COMPOSER.add_badge(
            BadgeSpec(
                key='comingsoon',
                field=self.KERKO_COMPOSER.fields['comingsoon'],
                activator=lambda field, item: item.get(field.key, False),
                renderer=TemplateRenderer('app/_text-badge.html.jinja2',
                                          text=_('Coming<br />soon')),
                weight=20,
            ))

        # Sort option based on the OpenDevEd flag.
        self.KERKO_COMPOSER.add_sort(
            SortSpec(
                key='ode_desc',
                label=_('Open Development & Education first'),
                weight=100,
                fields=[
                    self.KERKO_COMPOSER.fields['opendeved'],
                    self.KERKO_COMPOSER.fields['sort_date'],
                    self.KERKO_COMPOSER.fields['sort_creator'],
                    self.KERKO_COMPOSER.fields['sort_title']
                ],
                reverse=[
                    False,
                    True,
                    False,
                    False,
                ],
            ))
Example #27
    def build_schema(self, fields):
        # Copied from https://github.com/django-haystack/django-haystack/blob/v2.8.1/haystack/backends/whoosh_backend.py
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ""

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = WHOOSH_ID(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ["date", "datetime"]:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == "integer":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == "float":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == "boolean":
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == "ngram":
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == "edge_ngram":
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at="start",
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=getattr(field_class, "analyzer",
                                     StemmingAnalyzer()),
                    field_boost=field_class.boost,
                    sortable=True,
                )
                schema_fields[
                    field_class.index_fieldname].field_name = field_name

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #28
    def __init__(self, index_dir, backend, user_name=None, acl_support=False, **kw):
        """
        Store params, create schemas.
        """
        self.index_dir = index_dir
        self.index_dir_tmp = index_dir + '.temp'
        self.backend = backend
        self.user_name = user_name # TODO use currently logged-in username
        self.acl_support = acl_support
        self.wikiname = u'' # TODO take from app.cfg.interwikiname
        self.ix = {}  # open indexes
        self.schemas = {}  # existing schemas

        common_fields = {
            # wikiname so we can have a shared index in a wiki farm, always check this!
            WIKINAME: ID(stored=True),
            # tokenized NAME from metadata - use this for manual searching from UI
            # TODO was: NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
            NAME: ID(stored=True, field_boost=2.0),
            # unmodified NAME from metadata - use this for precise lookup by the code.
            # also needed for wildcard search, so the original string as well as the query
            # (with the wildcard) is not cut into pieces.
            NAME_EXACT: ID(field_boost=3.0),
            # revision id (aka meta id)
            REVID: ID(unique=True, stored=True),
            # MTIME from revision metadata (converted to UTC datetime)
            MTIME: DATETIME(stored=True),
            # tokenized CONTENTTYPE from metadata
            # TODO was: CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
            CONTENTTYPE: ID(stored=True),
            # unmodified list of TAGS from metadata
            TAGS: ID(stored=True),
            LANGUAGE: ID(stored=True),
            # USERID from metadata TODO: -> user ITEMID
            USERID: ID(stored=True),
            # ADDRESS from metadata
            ADDRESS: ID(stored=True),
            # HOSTNAME from metadata
            HOSTNAME: ID(stored=True),
            # SIZE from metadata
            SIZE: NUMERIC(stored=True),
            # ACTION from metadata
            ACTION: ID(stored=True),
            # tokenized COMMENT from metadata
            COMMENT: TEXT(stored=True),
            # data (content), converted to text/plain and tokenized
            CONTENT: TEXT(stored=True),
        }

        latest_revs_fields = {
            # ITEMID from metadata - as there is only latest rev of same item here, it is unique
            ITEMID: ID(unique=True, stored=True),
            # unmodified list of ITEMLINKS from metadata
            ITEMLINKS: ID(stored=True),
            # unmodified list of ITEMTRANSCLUSIONS from metadata
            ITEMTRANSCLUSIONS: ID(stored=True),
            # tokenized ACL from metadata
            # TODO was: ACL: TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
            ACL: ID(stored=True),
        }
        latest_revs_fields.update(**common_fields)

        userprofile_fields = {
            EMAIL: ID(unique=True, stored=True),
            OPENID: ID(unique=True, stored=True),
        }
        latest_revs_fields.update(**userprofile_fields)

        all_revs_fields = {
            ITEMID: ID(stored=True),
        }
        all_revs_fields.update(**common_fields)

        latest_revisions_schema = Schema(**latest_revs_fields)
        all_revisions_schema = Schema(**all_revs_fields)

        # Define dynamic fields
        dynamic_fields = [("*_id", ID(stored=True)),
                          ("*_text", TEXT(stored=True)),
                          ("*_keyword", KEYWORD(stored=True)),
                          ("*_numeric", NUMERIC(stored=True)),
                          ("*_datetime", DATETIME(stored=True)),
                          ("*_boolean", BOOLEAN(stored=True)),
                         ]

        # Adding dynamic fields to schemas
        for glob, field_type in dynamic_fields:
            latest_revisions_schema.add(glob, field_type, glob=True)
            all_revisions_schema.add(glob, field_type, glob=True)

        # schemas are needed by query parser and for index creation
        self.schemas[ALL_REVS] = all_revisions_schema
        self.schemas[LATEST_REVS] = latest_revisions_schema
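Thanks to the glob fields registered above, documents may carry fields that were never declared explicitly, as long as the name matches a suffix pattern; a minimal sketch (assuming ix is an index created from latest_revisions_schema, and the field names are illustrative):

with ix.writer() as writer:
    # matched by the "*_boolean", "*_text" and "*_numeric" globs
    writer.add_document(approved_boolean=True, reviewer_text=u"jane",
                        hits_numeric=7)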
Example #29
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        initial_key_count = len(schema_fields)
        content_field_name = ''
        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)

            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )
        return (content_field_name, Schema(**schema_fields))