class NewsSchema(SchemaClass):
    newsAgency = TEXT
    title = TEXT(stored=True)
    summary = TEXT(stored=True)
    url = ID(stored=True)
    content = TEXT
class SourceSchema(SchemaClass):
    """Fulltext index schema for source and context strings."""
    pk = NUMERIC(stored=True, unique=True)
    source = TEXT()
    context = TEXT()
    location = TEXT()
def get_schema():
    return Schema(remitente=TEXT(stored=True),
                  destinatarios=KEYWORD(stored=True),
                  asunto=TEXT(stored=True),
                  contenido=TEXT(stored=True))
import os.path

from whoosh.fields import Schema, TEXT
from whoosh.index import create_in, open_dir

from .modeles.data_db import data as db

# It was decided that indexing through a full-text search engine would only be useful
# for searching by city name.
# Definition of the search-engine schema. The country code is indexed, and the (stored)
# content is returned for the indexed word. "city" is used to index city names,
# "name" holds the type of representation, and the content of "content" is used to
# produce the map marker. Whoosh cannot index UTF-8 byte strings; values must be Unicode.
schema = Schema(city=TEXT, name=TEXT(stored=True), content=TEXT(stored=True))

# Indexing only runs when the "index" folder does not exist (the Whoosh documentation
# recommends storing the index in a dedicated folder like this). Indexing therefore runs
# only once in the lifetime of the locally installed application; re-indexing on every
# launch would waste resources.
villes = []
if not os.path.exists("index"):
    # This print should only appear when the index is written, i.e. on the first launch of
    # the application or when the user decides to change the data. It will normally show up
    # when the tests are run.
    print("Création du dossier 'index'.")
    os.mkdir("index")
    index = create_in("index", schema)

    # Open the empty index (which now has a schema) to add what we want to index.
    index = open_dir("index")
    writer = index.writer()

    # Add one document per city; the content is the name of the diplomatic mission concerned.
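    # Hypothetical continuation (the original excerpt stops here): the exact structure of
    # the records in `db` is not shown, so the keys below are placeholders, not the real
    # fields of data_db. The point is only the add_document()/commit() pattern.
    for entree in db:
        villes.append(entree["ville"])
        writer.add_document(city=entree["ville"],
                            name=entree["type"],
                            content=entree["contenu"])
    # Nothing is written to disk until commit() is called.
    writer.commit()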
"""File name is deals_ + datetime with YearMonthDay""" daystr = datetime.date.today().strftime('%Y%m%d') filename = 'deals_' + daystr + '.jl' rootFolder = 'C:\crawlData\\' oldFiles = Path(rootFolder).files('*.jl') oldFiles[0].remove() print('Removed {} file').format(oldFiles[0]) fullFilePath = rootFolder + filename """Download deals from S3""" session = boto3.Session(profile_name='indexingProf') s3_client = session.client('s3') s3_client.download_file('home-deals', 'deals/' + filename, fullFilePath) # ixDirectory = 'indexed_'+daystr ixDirectory = rootFolder + 'indexed' dealSchema = Schema(title=TEXT(stored=True), img=ID(stored=True), link=TEXT(stored=True), price=ID(stored=True)) if not os.path.exists(ixDirectory): os.mkdir(ixDirectory) ix = create_in(ixDirectory, dealSchema) writer = ix.writer() """Configuration for indexing full-text search by whoosh""" with open(fullFilePath) as file: for line in file: try: lineData = json.loads(line) title = lineData['title'] img = lineData['img'] link = lineData['link']
import io
import os

from whoosh.fields import Schema
from whoosh.fields import ID, TEXT
from whoosh.index import open_dir, create_in
from whoosh.analysis import StopFilter
from whoosh.analysis import RegexTokenizer
from whoosh.qparser import QueryParser
from collections import Counter

#=============Input===========

#=============UAT Indexing===========
my_schema = Schema(id=ID(unique=True, stored=True),
                   path=ID(stored=True),
                   source=ID(stored=True),
                   author=TEXT(stored=True),
                   title=TEXT(stored=True),
                   text=TEXT)

# create_in() expects the target directory to exist already
if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", my_schema)

index = open_dir("index")
writer = index.writer()
writer.add_document(id=u'uat_voc',
                    path=u'sample/uat_voc.txt',
                    source=u'uat_voc.txt',
                    title=u'uat_voc',
                    text=io.open('uat_voc.txt', encoding='utf-8').read())
writer.commit()
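# A minimal follow-up sketch (not in the original snippet): query the UAT index that was
# just built. The search term "galaxy" is only an illustrative placeholder.
with index.searcher() as searcher:
    query = QueryParser("text", index.schema).parse(u"galaxy")
    results = searcher.search(query, limit=10)
    for hit in results:
        print(hit['id'], hit['path'])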
def schema_type(self):
    return TEXT(stored=True, analyzer=SimpleAnalyzer())
class BmarkSchema(SchemaClass):
    bid = ID(unique=True, stored=True)
    description = TEXT
    extended = TEXT
    tags = KEYWORD
    readable = TEXT(analyzer=StemmingAnalyzer())
import logging

from whoosh.fields import Schema, TEXT, ID, KEYWORD
from whoosh.query import Variations
from whoosh.support.charset import accent_map
from whoosh.analysis import RegexTokenizer
from whoosh.analysis import CharsetFilter, LowercaseFilter, StopFilter

from newebe.lib.stopwords import stoplists
from newebe.config import CONFIG

logger = logging.getLogger("newebe.lib")

chfilter = CharsetFilter(accent_map)
stoplist = stoplists["en"].union(stoplists["fr"])
analyzer = (RegexTokenizer() | LowercaseFilter() |
            StopFilter(stoplist=stoplist) | chfilter)

schema = Schema(content=TEXT(analyzer=analyzer),
                docType=TEXT,
                docId=ID(stored=True),
                tags=KEYWORD)


class Indexer():
    """
    Indexer simplifies object indexing and search with the whoosh API.
    """

    def __init__(self):
        """
        Set index, create it if it does not exist.
        """
        if CONFIG.main.debug:
import os

from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
from whoosh.filedb.filestore import FileStorage

schema = Schema(teorArtigo=TEXT(analyzer=StemmingAnalyzer()),
                numArtigo=TEXT(analyzer=StemmingAnalyzer()),
                pergunta=TEXT(analyzer=StemmingAnalyzer()),
                idResposta=ID(stored=True))

# FileStorage does not create the directory itself
if not os.path.exists("index"):
    os.mkdir("index")
storage = FileStorage("index")
ix = storage.create_index(schema)
ix = storage.open_index()
writer = ix.writer()
    with open(file) as f:
        for line_no, line in enumerate(f):
            line = line.strip().split("\t")
            if line_no == 0:
                header = line
            else:
                data.append(dict(zip(header, line)))
    return header, data


FIELDS, DATA = parse()
PAGELEN = 100

if automatic:
    schema = Schema(**{header: TEXT(stored=True) for header in FIELDS})
else:
    schema = Schema()


def create_index(data=DATA, _schema=schema):
    if not os.path.exists(_indexdir):
        os.mkdir(_indexdir)
    ix = index.create_in(_indexdir, _schema)
    writer = ix.writer()
    for elem in data:
        writer.add_document(**elem)
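    # Hypothetical completion (the original excerpt stops here): commit so the index is
    # actually written, and return it so callers can search it.
    writer.commit()
    return ix

# Usage sketch under the same assumptions (QueryParser import and the search field are
# illustrative, not from the original):
# from whoosh.qparser import QueryParser
# ix = create_index()
# with ix.searcher() as s:
#     q = QueryParser(FIELDS[0], ix.schema).parse(u"some term")
#     print(s.search_page(q, 1, pagelen=PAGELEN))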
import sys
import json
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh import scoring
import os, os.path

#####################################
# Create the schema
#####################################
schema = Schema(filename=ID(stored=True),
                cell_no=TEXT(stored=True),
                content=TEXT(analyzer=StemmingAnalyzer()))

#####################################
# Create the index and initialize a `writer`
#####################################
# Note: this clears any existing index in the directory.
ix = index.create_in("notebooks", schema)

# Get a writer from the created index
writer = ix.writer()


def visibleTextFromNB(filename):
    '''
    This function pulls all the non-output visible cells from a Jupyter
    notebook and concatenates it all into a block of
class MySchema(SchemaClass):
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD
# -*- coding: utf-8 -*-
# http://blog.csdn.net/twsxtd/article/details/8308893

Recently I wanted to build a search engine, so of course I had a look at the famous Lucene. It really is excellent, but its Python port, PyLucene, is disappointing: it is not a pure-Python implementation but a wrapper, so in the end you are still running Java; it depends on the JDK and the installation steps are extremely tedious. There are plenty of Chinese word-segmentation dictionaries usable with Lucene, but because of this glue layer many of them cannot be used, so I eventually gave it up (to be fair, it would be excellent if everything were done in Java). There are also Sphinx and coreseek (a Chinese-oriented engine built on top of it), but they appear to be SQL-based, which has little to do with what I want. There is also the C++ framework Xapian, which gets very good reviews for both speed and accuracy, but in the end I settled on Whoosh, a pure-Python implementation. It is very easy to use from Python: it is a single module that you can easy_install. However, searching turned up very little material on it in Chinese, so I will translate its documentation myself, starting today.

Quick Start

Whoosh is a library for indexing and searching text. It provides text-search services for your programs; for example, if you are building blog software, you can use Whoosh to add a search feature so users can search for blog entries. Here is a short example:

from whoosh.index import create_in
from whoosh.fields import *

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
# "indexer" here is actually a directory, so following these steps as-is will fail;
# you have to create the directory first -- translator's note
ix = create_in("/home/gswewf/百科/indexer", schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"The second one is even more interesting!")
writer.commit()

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    results[0]
    # {"title": u"First document", "path": u"/a"}

Index and Schema objects
import os
import sys
import json
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer

# three fields: youtube id, video title, video description
stem_analyzer = StemmingAnalyzer()
schema = Schema(id=ID(stored=True),
                title=TEXT(stored=True),
                description=TEXT(analyzer=stem_analyzer, stored=True),
                topic=ID(stored=True))

# create a folder to store the index; create_in() is needed the first time,
# open_dir() only works once an index already exists in the folder
if not os.path.exists("indexdirectory"):
    os.mkdir("indexdirectory")
    create_in("indexdirectory", schema)

# create index writer
ix = open_dir("indexdirectory")
writer = ix.writer()

with open('data_for_indexing3.json') as f:
    youtube_array = json.load(f)
    for item in youtube_array:
        writer.add_document(id=item['id'],
                            title=item['title'],
                            description=item['description'],
                            topic=item['topic'])
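# Hypothetical continuation (not shown in the original excerpt): commit the writer and
# run a stemmed search over the video descriptions; the query term is a placeholder.
writer.commit()

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("description", ix.schema).parse(u"tutorial")
    for hit in searcher.search(query, limit=5):
        print(hit['id'], hit['title'])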
def get_more_search_result():
    query = request.form['query']
    q = []
    q.append(query)
    page_offset = int(request.form['page_offset'])
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    path_array = []
    preview_array = []
    date_array = []
    size_array = []
    list_tags = []

    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(" ".join(q))
        results = searcher.search_page(query, page_offset, num_elem_to_get)

        for x in results:
            path = x.items()[0][1]
            path = path.replace(PASTES_FOLDER, '', 1)
            path_array.append(path)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(content) > max_preview_char else len(content) - 1
            preview_array.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
            date_array.append(curr_date)
            size_array.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:' + path)
            l_tags = []
            for tag in p_tags:
                complete_tag = tag
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]
                l_tags.append((tag, complete_tag))
            list_tags.append(l_tags)

    to_return = {}
    to_return["path_array"] = path_array
    to_return["preview_array"] = preview_array
    to_return["date_array"] = date_array
    to_return["size_array"] = size_array
    to_return["list_tags"] = list_tags
    to_return["bootstrap_label"] = bootstrap_label
    if len(path_array) < num_elem_to_get:  # pagelength
        to_return["moreData"] = False
    else:
        to_return["moreData"] = True

    return jsonify(to_return)
def get_schema():
    return Schema(nome=TEXT(stored=True),
                  id=ID(stored=True),
                  lat=NUMERIC(stored=True),
                  lon=NUMERIC(stored=True))
def search():
    query = request.form['query']
    q = []
    q.append(query)
    r = []  # complete path
    c = []  # preview of the paste content
    paste_date = []
    paste_size = []
    paste_tags = []
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    ''' temporary disabled
    # Search filename
    for path in r_serv_pasteName.smembers(q[0]):
        r.append(path)
        paste = Paste.Paste(path)
        content = paste.get_p_content()
        content_range = max_preview_char if len(content) > max_preview_char else len(content) - 1
        c.append(content[0:content_range])
        curr_date = str(paste._get_p_date())
        curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
        paste_date.append(curr_date)
        paste_size.append(paste._get_p_size())
    '''

    # Search full line
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("".join(q))
        results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
        for x in results:
            r.append(x.items()[0][1].replace(PASTES_FOLDER, '', 1))
            path = x.items()[0][1].replace(PASTES_FOLDER, '', 1)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(content) > max_preview_char else len(content) - 1
            c.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
            paste_date.append(curr_date)
            paste_size.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:' + path)
            l_tags = []
            for tag in p_tags:
                complete_tag = tag
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]
                l_tags.append((tag, complete_tag))
            paste_tags.append(l_tags)

        results = searcher.search(query)
        num_res = len(results)

    index_list = get_index_list()
    index_min = 1
    index_max = len(index_list)

    return render_template("search.html", r=r, c=c,
                           query=request.form['query'],
                           paste_date=paste_date,
                           paste_size=paste_size,
                           char_to_display=max_preview_modal,
                           num_res=num_res,
                           index_min=index_min, index_max=index_max,
                           bootstrap_label=bootstrap_label,
                           paste_tags=paste_tags,
                           index_list=index_list)
def _build_doc_attrs(self, Model, schema):
    mapper = sa.inspect(Model)
    args = self.doc_attrs
    # Any field not in the schema will be stored here. After all fields have been
    # discovered we add the missing ones.
    field_definitions = dict()

    def setup_field(attr_name, field_name):
        field_def = False
        if not isinstance(field_name, string_types):
            field_name, field_def = field_name

        if field_name not in schema:
            if (field_name not in field_definitions
                    or field_definitions[field_name] is False):
                field_definitions[field_name] = field_def

        # attrgetter offers dotted name support. Useful for attributes on
        # related objects.
        args.setdefault(field_name, {})[name] = attrgetter(name)

    # model level definitions
    for name, field_names in self.index_args.get("index_to", ()):
        if isinstance(field_names, string_types):
            field_names = (field_names,)
        for field_name in field_names:
            setup_field(name, field_name)

    # per column definitions
    for col in mapper.columns:
        name = col.name
        info = col.info
        if not info.get("searchable"):
            continue

        index_to = info.get("index_to", (name,))
        if isinstance(index_to, string_types):
            index_to = (index_to,)

        for field_name in index_to:
            setup_field(name, field_name)

    # add missing fields to schema
    for field_name, field_def in field_definitions.items():
        if field_name in schema:
            continue

        if field_def is False:
            field_def = TEXT(stored=True, analyzer=accent_folder)

        logger.debug(
            "Adding field to schema:\n"
            "  Model: %s\n"
            '  Field: "%s" %s',
            Model._object_type(),
            field_name,
            field_def,
        )
        schema.add(field_name, field_def)
    .map(lambda row: (row.title, get_from(row.title, sec_redirs), row.revision.text._VALUE)) \
    .filter(lambda row: row[1] != [])
texts = texts_rdd.collect()

############
############ SEARCH
############
from whoosh import index
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
from whoosh.qparser import QueryParser

schema = Schema(title=ID(stored=True),
                r_from=TEXT(stored=True),
                text=TEXT(analyzer=StemmingAnalyzer()),
                tags=KEYWORD)

print("creating reversed index...")
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)
writer = ix.writer()
for text in texts:
    writer.add_document(title=text[0],
                        # text[1] is a list of redirect titles; join them into one string
                        # (the original `text[1] + ", "` would raise TypeError on a list)
                        r_from=', '.join(map(str, text[1])),
                        text=text[2],
'''
import pandas as pd
import time
import xml.dom.minidom
from xml.dom.minidom import parse
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
import os, os.path
from whoosh import index

if not os.path.exists("indexFolder"):
    os.mkdir("indexFolder")

schema = Schema(docid=ID(stored=True), title=TEXT(stored=True), body=TEXT(stored=True))
ix = index.create_in("indexFolder", schema)
ix = index.open_dir("indexFolder")
writer = ix.writer()

start_time = time.time()
last_time = start_time
indexLog = []
fw = open('indexLog.txt', 'w')
for i in range(462):
    fiName = "WIR\\WebIR-%03d.xml"
    if not os.path.exists(fiName % i):
        continue
    dom_data = xml.dom.minidom.parse(fiName % i)
def build_schema(self, fields):
    # Copied from https://github.com/django-haystack/django-haystack/blob/v2.8.1/haystack/backends/whoosh_backend.py
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ""

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = WHOOSH_ID(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ["date", "datetime"]:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == "integer":
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == "float":
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == "boolean":
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == "ngram":
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == "edge_ngram":
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at="start",
                stored=field_class.stored,
                field_boost=field_class.boost,
            )
        else:
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True,
                analyzer=getattr(field_class, "analyzer", StemmingAnalyzer()),
                field_boost=field_class.boost,
                sortable=True,
            )
        schema_fields[field_class.index_fieldname].field_name = field_name

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
from tempfile import TemporaryDirectory

from whoosh import qparser
from whoosh.analysis import RegexTokenizer
from whoosh.util.text import rcompile

tokenizer = RegexTokenizer(expression=rcompile(r"[\w/.]+"))
for token in tokenizer(u"Hello there templates/app1/test.html!"):
    print(repr(token.text))

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

tmp_dir = TemporaryDirectory()
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
ix = create_in(tmp_dir.name, schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"this/is/a/test.html")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this/is/a/hello.html hello a yup")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this is a hello.html hello a yup")
writer.commit()

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema)
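    # Hypothetical continuation (the original snippet stops mid-statement): parse a
    # path-style term and search; with the custom tokenizer "hello.html" stays one token,
    # so it matches the space-separated document but not the slash-joined path.
    parsed = QueryParser("content", ix.schema).parse(u"hello.html")
    for hit in searcher.search(parsed):
        print(hit['title'], hit['path'])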
    try:
        if ':' not in duration:
            seconds = int(duration)
            m, s = divmod(seconds, 60)
            h, m = divmod(m, 60)
            if h > 0:
                return "%02d:%02d:%02d" % (h, m, s)
            else:
                return "%02d:%02d" % (m, s)
    except ValueError:
        logger.warning(f"转换时间失败：`{duration}")  # failed to convert the duration
        return dest


# Whoosh search schemas
whoosh_site_schema = Schema(
    id=ID(stored=True, unique=True),
    cname=TEXT(field_boost=5.0),
    author=TEXT(field_boost=3.0),
    brief=TEXT(),
)

whoosh_article_schema = Schema(
    uindex=ID(stored=True, unique=True),
    title=TEXT(field_boost=5.0),
    author=TEXT(field_boost=3.0),
    content=TEXT(),
)
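# A minimal usage sketch (not part of the original module), assuming an index `ix` built
# from whoosh_article_schema: MultifieldParser searches title/author/content together, and
# the field_boost values above make title matches rank above content-only matches.
from whoosh.qparser import MultifieldParser

def search_articles(ix, text):
    parser = MultifieldParser(["title", "author", "content"], schema=whoosh_article_schema)
    with ix.searcher() as searcher:
        return [hit['uindex'] for hit in searcher.search(parser.parse(text), limit=20)]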
class TargetSchema(SchemaClass):
    """Fulltext index schema for target strings."""
    pk = NUMERIC(stored=True, unique=True)
    target = TEXT()
    comment = TEXT()
import os
import os.path

from whoosh import index, analysis, searching
from whoosh.fields import TEXT, Schema
from whoosh.qparser import QueryParser
from whoosh.reading import TermNotFound

import aai.query as query

SCHEMA = Schema(content=TEXT(stored=True, spelling=True))
INDEX_DIR = 'aai/indices'


def get_indices():
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)
    if index.exists_in(INDEX_DIR):
        return index.open_dir(INDEX_DIR)
    else:
        return full_index()


def full_index():
    idx = index.create_in(INDEX_DIR, SCHEMA)
    writer = idx.writer()
    data = query.RDFQueries().artist_names()
    for item in data:
        writer.add_document(content=item)
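    # Hypothetical completion (the original excerpt ends inside the loop): the writer must
    # be committed, and the index returned so get_indices() can hand it to callers.
    writer.commit()
    return idx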
def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start',
                stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True, analyzer=StemmingAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
def get_schema(self):
    return Schema(title=TEXT(stored=True),
                  path=ID(stored=True),
                  content=TEXT(stored=True))
"""Module for searching the toolshed repositories""" from galaxy import exceptions from galaxy.exceptions import ObjectNotFound import logging log = logging.getLogger( __name__ ) import whoosh.index from whoosh import scoring from whoosh.fields import Schema, STORED, TEXT from whoosh.qparser import MultifieldParser schema = Schema( id=STORED, name=TEXT( field_boost=1.7, stored=True ), description=TEXT( field_boost=1.5, stored=True ), long_description=TEXT( stored=True ), homepage_url=TEXT( stored=True ), remote_repository_url=TEXT( stored=True ), repo_owner_username=TEXT( stored=True ), times_downloaded=STORED, approved=STORED, last_updated=STORED, full_last_updated=STORED ) class RepoWeighting( scoring.BM25F ): """ Affect the BM25G scoring model through the final method. source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ """ use_final = True
# initialize a new index, using a HNSW index on Cosine Similarity
index_nms = nmslib.init(method='hnsw', space='cosinesimil')
index_nms.addDataPointBatch(data)
index_nms.createIndex({'post': 2}, print_progress=True)

print("Indexing whoosh...")
# http://jaympatel.com/2020/08/how-to-do-full-text-searching-in-python-using-whoosh-library/
import os, os.path
from whoosh import index, qparser
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

schema = Schema(path=ID(stored=True), content=TEXT(stored=True))

# Now, we will use the schema to initialize a Whoosh index in the above directory.
ix = index.create_in("models", schema)
writer = ix.writer()

# Lastly, let us fill this index with the data from the dataframe.
for i in range(len(df_docs)):
    writer.add_document(content=str(df_docs.text.iloc[i]), path=str(df_docs.id.iloc[i]))
writer.commit()

# https://stackoverflow.com/questions/19477319/whoosh-accessing-search-page-result-items-throws-readerclosed-exception
# http://annamarbut.blogspot.com/2018/08/whoosh-pandas-and-redshift-implementing.html
# https://ai.intelligentonlinetools.com/ml/search-text-documents-whoosh/
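# A minimal follow-up sketch (assumptions: df_docs as above, and an arbitrary example query):
# open a searcher on the Whoosh index just built and retrieve the stored document ids.
with ix.searcher() as searcher:
    q = QueryParser("content", ix.schema).parse(u"machine learning")  # placeholder query
    for hit in searcher.search(q, limit=10):
        print(hit['path'])  # 'path' holds the dataframe id column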