def get_schema(self):
    return Schema(
        title=TEXT(stored=True, field_boost=1.5),
        name=ID(stored=True),
        path=ID(stored=True),
        content=TEXT(stored=True, analyzer=StemmingAnalyzer()),
        keywords=KEYWORD(stored=True, scorable=True, commas=True),
    )
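# A minimal usage sketch (an addition, not original code) for a schema like the
# one above: create an index in a hypothetical "indexdir" directory, add one
# document, and query the stemmed "content" field. Assumes get_schema() is
# callable in this scope.
import os
from whoosh.index import create_in
from whoosh.qparser import QueryParser

os.makedirs("indexdir", exist_ok=True)
ix = create_in("indexdir", get_schema())
with ix.writer() as writer:  # the writer commits on exiting the block
    writer.add_document(title="Getting started", name="start", path="/docs/start",
                        content="Indexing and searching with Whoosh",
                        keywords="whoosh,search")
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("index")
    for hit in searcher.search(query, limit=5):
        print(hit["title"], hit["path"])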
class TMSchema(SchemaClass):
    """Fulltext index schema for source and context strings."""
    source_language = ID(stored=True)
    target_language = ID(stored=True)
    source = TEXT(stored=True)
    target = STORED()
    origin = ID(stored=True)
    category = NUMERIC(stored=True)
def init_extensions(app):
    global use_cache
    whoosh_searcher.init_app(app)
    configure_uploads(app, upload_photos)
    mail.init_app(app)
    admin.init_app(app)
    mongo.init_app(app, "MONGO")
    oauth.init_app(app)
    login_manager.init_app(app)
    # use_cache = app.config.get('USE_CACHE', False)
    # if use_cache:
    #     cache.init_app(app, {})
    with app.app_context():
        # Register the flask-admin views
        admin.add_view(admin_view.RolesModelView(mongo.db['roles'], '角色管理'))
        admin.add_view(admin_view.UsersModelView(mongo.db['users'], '用户管理'))
        admin.add_view(
            admin_view.CatalogsModelView(mongo.db['catalogs'], '栏目管理', category='内容管理'))
        admin.add_view(
            admin_view.PostsModelView(mongo.db['posts'], '帖子管理', category='内容管理'))
        admin.add_view(
            admin_view.PassagewaysModelView(mongo.db['passageways'], '温馨通道', category='推广管理'))
        admin.add_view(
            admin_view.FriendLinksModelView(mongo.db['friend_links'], '友链管理', category='推广管理'))
        admin.add_view(
            admin_view.PagesModelView(mongo.db['pages'], '页面管理', category='推广管理'))
        admin.add_view(
            admin_view.FooterLinksModelView(mongo.db['footer_links'], '底部链接', category='推广管理'))
        admin.add_view(
            admin_view.AdsModelView(mongo.db['ads'], '广告管理', category='推广管理'))
        admin.add_view(admin_view.OptionsModelView(mongo.db['options'], '系统设置'))

        # Initialize the Whoosh index
        chinese_analyzer = ChineseAnalyzer()
        post_schema = Schema(obj_id=ID(unique=True, stored=True),
                             title=TEXT(stored=True, analyzer=chinese_analyzer),
                             content=TEXT(stored=True, analyzer=chinese_analyzer),
                             create_at=DATETIME(stored=True),
                             catalog_id=ID(stored=True),
                             user_id=ID(stored=True))
        whoosh_searcher.add_index('posts', post_schema)
def schema(self):
    my_analyzer = RegexTokenizer("[a-zA-Z_]+") | LowercaseFilter() | StopFilter()
    schema = Schema(
        h=TEXT(stored=True, analyzer=my_analyzer),
        gnx=ID(stored=True),
        b=TEXT(analyzer=my_analyzer),
        parent=ID(stored=True),
        doc=ID(stored=True),
    )
    return schema
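# Sketch (added for illustration) of what the analyzer pipeline above produces:
# only [a-zA-Z_]+ runs become tokens, everything is lowercased, and Whoosh's
# default English stop words are dropped. Exact output depends on that stop list.
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

demo = RegexTokenizer("[a-zA-Z_]+") | LowercaseFilter() | StopFilter()
print([t.text for t in demo("The parent_node of THIS headline")])
# roughly: ['parent_node', 'headline'] -- "the", "of", "this" are stop words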
def _get_generic_schema(self):
    """ Returns whoosh's generic schema of the partition document. """
    schema = Schema(
        vid=ID(stored=True, unique=True),
        dataset_vid=ID(stored=True),  # dataset_vid? Convert if so.
        title=NGRAMWORDS(),
        keywords=KEYWORD,
        doc=TEXT)  # Generated document for the core of the topic search
    return schema
def __init__(self):
    self.file_index = None
    self.schema = Schema(floor_id=ID(stored=True),
                         user_id=ID(stored=True),
                         user_name=ID(stored=True),
                         floor_content=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                         tie_id=ID(stored=True))
    # Initialize file_index right away
    self.open_index()
class FileSchema(SchemaClass):
    "remote files"
    path = ID(stored=True)  # without tree_path
    checksum = ID(stored=True)
    size = NUMERIC(bits=64, signed=False, stored=True)
    tree = ID(stored=True)
    tree_path = ID(stored=True)
    mtime = NUMERIC(stored=True)
    pubkey = ID(stored=True)  # only needed for plants
def prepare_indices(self, build_index, path):
    if build_index:
        print("Indexing corpus...")
        if self.lang == "ja":
            schema = Schema(path=ID(stored=True), content=NGRAM(stored=True))
        else:
            ana = analysis.StandardAnalyzer(stoplist=None, minsize=0)
            schema = Schema(path=ID(stored=True), content=TEXT(analyzer=ana))
        index_directory = os.path.dirname(path) + "/tmp/indices/indexdir"
        if not os.path.exists(index_directory):
            os.makedirs(index_directory)
        self.ix = create_in(index_directory, schema)
        with self.ix.writer(limitmb=2048, multisegment=True) as writer:
            for i, utterance in enumerate(log_progress(self.utterances)):
                writer.add_document(path=str(i), content=utterance.text)

        print("Indexing corpus by lemma...")
        if self.lang == "ja":
            schema = Schema(path=ID(stored=True), content=NGRAM(stored=True))
        else:
            ana = analysis.StandardAnalyzer(stoplist=None, minsize=0)
            schema = Schema(path=ID(stored=True), content=TEXT(analyzer=ana))
        lemma_index_directory = os.path.dirname(path) + "/tmp/indices/lemmaindexdir"
        if not os.path.exists(lemma_index_directory):
            os.makedirs(lemma_index_directory)
        self.ix_lemma = create_in(lemma_index_directory, schema)
        with self.ix_lemma.writer(limitmb=2048, multisegment=True) as writer:
            for i, utterance in enumerate(log_progress(self.utterances)):
                lemmas = [token.lemma_ for token in utterance.spacy]
                writer.add_document(path=str(i), content=" ".join(lemmas))
    else:
        print("Loading indices...")
        index_directory = os.path.dirname(path) + "/tmp/indices/indexdir"
        if not os.path.exists(index_directory):
            raise IOError('No existing indices! You should build '
                          'indices before trying to load them.')
        self.ix = open_dir(index_directory)

        print("Loading lemma indices...")
        index_directory = os.path.dirname(path) + "/tmp/indices/lemmaindexdir"
        if not os.path.exists(index_directory):
            raise IOError('No existing indices! You should build '
                          'indices before trying to load them.')
        self.ix_lemma = open_dir(index_directory)
def _get_schema():
    # WARN: stemming is English-specific; character folding is for Western languages
    analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
    schema = Schema(
        code=ID(unique=True, stored=True),
        slug=ID(unique=False, stored=True),
        title=TEXT(analyzer=analyzer, stored=True),
        content=TEXT(analyzer=analyzer),
    )
    return schema
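# Sketch (an addition, not original code) of why CharsetFilter(accent_map) is
# chained after the stemmer: accented terms are folded to ASCII at index time,
# so "café" and "cafe" resolve to the same term. The exact tokens depend on the
# stemmer's output.
from whoosh.analysis import StemmingAnalyzer, CharsetFilter
from whoosh.support.charset import accent_map

demo = StemmingAnalyzer() | CharsetFilter(accent_map)
print([t.text for t in demo("Café crème")])
# roughly: ['cafe', 'creme'] -- accents folded after stemming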
def test_2_1(self):
    source = Schema(
        checksum=ID(stored=True, unique=True),
        source=TEXT(),
        context=TEXT(),
    )
    target = Schema(
        checksum=ID(stored=True, unique=True),
        target=TEXT(),
    )
    self.do_test(source, target)
class NoteSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer, spelling=True)
    contents = TEXT(spelling=True)
    public = BOOLEAN()
    draft = BOOLEAN()
    short_code = ID(stored=True, unique=True)
    disable = BOOLEAN()
    reported = BOOLEAN()
    tags = TEXT(analyzer=tag_analyzer, spelling=True)
    user = ID()
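# Sketch of the "did you mean" lookup that spelling=True above enables; the
# index directory and the misspelled query string are assumptions, not
# original code.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("notes_index")  # hypothetical index built with NoteSchema
with ix.searcher() as searcher:
    qp = QueryParser("contents", ix.schema)
    user_query = "recieve"
    parsed = qp.parse(user_query)
    corrected = searcher.correct_query(parsed, user_query)
    if corrected.query != parsed:
        print("Did you mean:", corrected.string)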
def get_schema(schema):
    kwargs = {}
    for key, value in schema.items():
        if value == "indexed":
            kwargs[key] = NGRAMWORDS(minsize=2, sortable=True)
        elif value == "id_stored":
            kwargs[key] = ID(stored=True)
        elif value == "unique_id_stored":
            kwargs[key] = ID(unique=True, stored=True)
        elif value == "boolean":
            kwargs[key] = BOOLEAN(stored=True)
    return Schema(**kwargs)
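# Hypothetical call to get_schema() above; the field mapping is illustrative,
# not taken from any real configuration.
schema = get_schema({
    "id": "unique_id_stored",
    "owner": "id_stored",
    "name": "indexed",
    "active": "boolean",
})
print(schema.names())  # ['active', 'id', 'name', 'owner']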
def __init__(self, articles_path):
    """Attempt to initialize a folder with Markdown articles.

    If a git repo, create a search index and populate.

    Markdown Extension References
    * http://facelessuser.github.io/pymdown-extensions
    * https://pythonhosted.org/Markdown/extensions
    """
    self.article_repo = Repo(articles_path)
    self.articles_path = articles_path
    self.markdown_extensions = [
        'markdown.extensions.abbr',
        'markdown.extensions.attr_list',
        'markdown.extensions.def_list',
        'markdown.extensions.fenced_code',
        'markdown.extensions.footnotes',
        'markdown.extensions.tables',
        'markdown.extensions.smart_strong',
        'markdown.extensions.admonition',
        'markdown.extensions.codehilite',
        'markdown.extensions.headerid',
        'markdown.extensions.sane_lists',
        'markdown.extensions.smarty',
        'markdown.extensions.toc',
        'markdown.extensions.wikilinks',
        'pymdownx.betterem',
        'pymdownx.caret',
        'pymdownx.githubemoji',
        'pymdownx.headeranchor',
        'pymdownx.magiclink',
        'pymdownx.mark',
        'pymdownx.smartsymbols',
        'pymdownx.tasklist',
        'pymdownx.tilde',
        'pymdownx.critic',
    ]
    self.markdown_extensions_config = {
        'markdown.extensions.codehilite': {
            'css_class': 'code-highlight'
        }
    }
    self.__search_schema = Schema(
        title=ID(stored=True, unique=True),
        path=ID(stored=True),
        content=TEXT,
    )
    self.__search_parser = MultifieldParser(
        ['title', 'content'],
        schema=self.__search_schema,
    )
    self.__search_parser.add_plugin(FuzzyTermPlugin())
    self.__search_index = self.create_search_index()
    self.populate_search_index()
def build_schema_and_corpus():
    schema = Schema(
        id=ID(stored=True),
        filename=ID(stored=True),
        story=TEXT(analyzer=StemmingAnalyzer(), stored=True, lang="en"),
    )
    file = os.environ["HOME"] + "/data/QA/coqa/" + "coqa-train-v1.0.json"
    data = ({
        "id": d["id"],
        "filename": d["filename"],
        "story": d["story"]
    } for d in data_io.read_json(file)["data"])
    return schema, data
def __get_index_schema(self):
    """
    :return: organization index schema
    """
    return Schema(id=NUMERIC(stored=True),
                  url=ID(stored=True),
                  external_id=ID(stored=True),
                  name=ID(stored=True),
                  domain_names=KEYWORD(stored=True, commas=True),
                  created_at=ID(stored=True),
                  details=ID(stored=True),
                  shared_tickets=BOOLEAN(stored=True),
                  tags=KEYWORD(stored=True, commas=True))
def _get_schema(self):
    stem_ana = StemmingAnalyzer()
    return Schema(
        list_name=ID(stored=True),
        message_id=ID(stored=True),
        sender=TEXT(field_boost=1.5),
        user_id=TEXT,
        subject=TEXT(field_boost=2.0, analyzer=stem_ana),
        content=TEXT(analyzer=stem_ana),
        date=DATETIME(),
        attachments=TEXT,
        tags=KEYWORD(commas=True, scorable=True),
    )
def _get_schema(self, language):
    lang_analyzer = LanguageAnalyzer(language)
    return Schema(
        key=ID(stored=True, unique=True),
        assignee=ID(stored=True),
        reporter=ID(stored=True),
        status=ID(stored=True),
        summary=TEXT(analyzer=lang_analyzer, field_boost=2.0),
        description=TEXT(analyzer=lang_analyzer),
        comments_str=TEXT(analyzer=lang_analyzer),
        labels=KEYWORD(stored=True, lowercase=True),
        components=KEYWORD(stored=True, lowercase=True),
    )
def _mail_schema(self):
    return Schema(
        ident=ID(stored=True, unique=True),
        sender=ID(stored=False),
        to=KEYWORD(stored=False, commas=True),
        cc=KEYWORD(stored=False, commas=True),
        bcc=KEYWORD(stored=False, commas=True),
        bounced=KEYWORD(stored=False, commas=True),
        subject=TEXT(stored=False),
        date=NUMERIC(stored=False, sortable=True, bits=64, signed=False),
        body=TEXT(stored=False),
        tag=KEYWORD(stored=True, commas=True),
        flags=KEYWORD(stored=True, commas=True),
        raw=TEXT(stored=False))
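# Sketch of querying a mail index built with the schema above, newest first;
# the sortable NUMERIC "date" field makes reverse date ordering cheap. The
# directory name and query string are assumptions, not original code.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("mail_index")  # hypothetical location
with ix.searcher() as searcher:
    query = QueryParser("body", ix.schema).parse("invoice")
    for hit in searcher.search(query, sortedby="date", reverse=True, limit=20):
        print(hit["ident"], hit["tag"], hit["flags"])  # only stored fields are readable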
def main():
    indexdir = 'indexdir'
    if os.path.exists(indexdir):
        index = open_dir(indexdir)
    else:
        schema = Schema(doc_id=ID(unique=True, stored=True),
                        url=ID(unique=True, stored=True),
                        title=TEXT(stored=True),
                        body=TEXT())
        os.mkdir(indexdir)
        index = create_in(indexdir, schema)
    incremental_index_msmacro(index)
class RecipeSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer)
    description = TEXT(analyzer=custom_analyzer)
    public = BOOLEAN()
    deleted = BOOLEAN()
    reported = BOOLEAN()
    short_code = ID(stored=True, unique=True)
    tags = TEXT(analyzer=tag_analyzer)
    user = ID()
    steps = TEXT(analyzer=custom_analyzer)
    ingredients = TEXT(analyzer=custom_analyzer)
    country = TEXT()
def CreateSchemaInitIndex():
    print("Creating schema")
    my_schema = Schema(id=ID(unique=True, stored=True),
                       path=ID(stored=True),
                       source=ID(stored=True),
                       author=TEXT(stored=True),
                       title=TEXT(stored=True),
                       year=TEXT(stored=True),
                       text=TEXT)
    print(my_schema)
    if not os.path.exists("gutenbergindex"):
        os.mkdir("gutenbergindex")
    index = create_in("gutenbergindex", my_schema)
def mkSchema():
    schema = Schema(
        date=ID(stored=True),
        city=TEXT(stored=True),
        state=TEXT(stored=True),
        country=TEXT(stored=True),
        shape=TEXT(stored=True),
        durationSecs=NUMERIC(stored=True),  # REMOVED
        durationHoursMins=TEXT(stored=True),  # possibly drop, since seconds are more precise
        comments=TEXT(analyzer=StemmingAnalyzer(), stored=True),  # or a standard analyzer
        datePosted=ID(stored=True),
        latitude=NUMERIC(float, stored=True),
        longitude=NUMERIC(float, stored=True))
    return schema
def get_cache_schema():
    schema = Schema(
        key=ID(unique=True, stored=True),  # Copied from Zotero.
        version=NUMERIC(stored=True),  # Copied from Zotero.
        parentItem=ID(stored=True),  # Kerko addition.
        itemType=ID(stored=True),  # Kerko addition.
        library=STORED,  # Copied from Zotero & JSON-encoded.
        links=STORED,  # Copied from Zotero & JSON-encoded.
        meta=STORED,  # Copied from Zotero & JSON-encoded.
        data=STORED,  # Copied from Zotero & JSON-encoded.
        fulltext=STORED,  # Kerko addition.
    )
    for format_ in get_formats():
        schema.add(format_, STORED)
    return schema
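# Hypothetical write path for the cache schema above: because every export
# format returned by get_formats() was added as a STORED-only field, each
# document can carry one pre-rendered blob per format without affecting search.
# The directory name and field values are assumptions.
import os
from whoosh.index import create_in

os.makedirs("cache_index", exist_ok=True)
ix = create_in("cache_index", get_cache_schema())
with ix.writer() as writer:
    writer.add_document(key="ABC123", version=42, itemType="journalArticle",
                        data='{"title": "An example item"}')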
def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create analyzer used for tokenizing and normalizing tokens
    my_analyzer = (analysis.RegexTokenizer()
                   | analysis.LowercaseFilter()
                   | analysis.StopFilter())

    # Create schema
    schema = Schema(url=ID(stored=True),
                    body=TEXT(stored=True, analyzer=my_analyzer))

    # Setup index
    os.makedirs(whoosh_dir, exist_ok=True)
    ix = index.create_in(whoosh_dir, schema)

    # Clear index
    writer = ix.writer()
    writer.commit(mergetype=writing.CLEAR)

    # Index documents
    writer = ix.writer()
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            with open(text_file, encoding="utf8") as tf:
                body = tf.read()
            url = text_file.replace(text_dir, "")
            writer.add_document(url=url, body=body)
            print("Added", url)
            loaded += 1

    writer.commit()
    print("\n\nLoaded", loaded, "documents")
def build_indexes(index, hord):
    if not os.path.exists(index):
        os.mkdir(index)
    index = create_in(
        index,
        Schema(
            quote=TEXT(stored=True),
            id=ID(stored=True),
            submitter=STORED,
            submitted=STORED,
        ),
    )
    corpus = []
    with index.writer() as writer:
        LOGGER.info("Building Whoosh index and markov model from hord.")
        for row in hord.get_rows():
            corpus.append(row.quote)
            if row.submitted:
                submitted = row.submitted.strftime("%b %d %Y %H:%M:%S")
            else:
                submitted = None
            writer.update_document(
                quote=row.quote,
                id=str(row.id),
                submitter=row.submitter,
                submitted=submitted,
            )
    LOGGER.info(f"Index built. {index.doc_count()} documents indexed.")
    if len(corpus) > 0:
        model = markovify.NewlineText("\n".join(corpus))
    else:
        model = None
    LOGGER.info("Markov model built.")
    return index, model
def get_schema():
    """Return a schema used for indexing documents."""
    analyzer = MyVietnameseTokenizer() | LowercaseFilter() | StopFilter(get_stopword_list())
    return Schema(title=TEXT(analyzer=analyzer, stored=True, field_boost=1.5),
                  path=ID(unique=True, stored=True),
                  time=STORED,
                  content=TEXT(analyzer=analyzer, stored=True))
class TargetSchema(SchemaClass):
    '''Fulltext index schema for target strings.'''
    checksum = ID(stored=True, unique=True)
    target = TEXT()
    comment = TEXT()
def __init__(self, IndexDir="../index"):
    self.IndexDir = IndexDir
    from whoosh.fields import Schema, TEXT, ID
    from whoosh import index  # needed for index.open_dir below
    self.schema = Schema(title=TEXT(stored=True),
                         path=ID(stored=True),
                         content=TEXT)
    self.ix = index.open_dir(IndexDir)
def fields_map(self, field_type):
    if field_type == "primary":
        return ID(stored=True, unique=True)

    type_map = {
        'date': types.Date,
        'datetime': types.DateTime,
        'boolean': types.Boolean,
        'integer': types.Integer,
        'float': types.Float,
    }
    if isinstance(field_type, str):
        field_type = type_map.get(field_type, types.Text)

    if not isinstance(field_type, type):
        field_type = field_type.__class__

    if issubclass(field_type, (types.DateTime, types.Date)):
        return DATETIME(stored=True, sortable=True)
    elif issubclass(field_type, types.Integer):
        return NUMERIC(stored=True, numtype=int)
    elif issubclass(field_type, types.Float):
        return NUMERIC(stored=True, numtype=float)
    elif issubclass(field_type, types.Boolean):
        return BOOLEAN(stored=True)
    return TEXT(stored=True, analyzer=self.analyzer, sortable=False)
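# Illustrative calls (not original code) showing how fields_map() above
# resolves both string type names and SQLAlchemy-style type objects;
# "indexer" stands in for whatever object owns fields_map().
pk = indexer.fields_map("primary")         # -> ID(stored=True, unique=True)
created = indexer.fields_map("datetime")   # -> DATETIME(stored=True, sortable=True)
count = indexer.fields_map("integer")      # -> NUMERIC(stored=True, numtype=int)
title = indexer.fields_map("anything")     # unknown names fall back to TEXT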
class SourceSchema(SchemaClass):
    '''Fulltext index schema for source and context strings.'''
    checksum = ID(stored=True, unique=True)
    source = TEXT()
    context = TEXT()