def search(request):
    raw_query = request.GET['q']

    from whoosh.index import create_in, open_dir
    from whoosh.fields import ID, DATETIME, TEXT, Schema
    from whoosh.qparser import QueryParser

    schema = Schema(
        id=ID(stored=True),
        type=ID(stored=True),
        creator_id=ID(stored=True),
        timestamp=DATETIME(),
        # TODO what about stuff with multiple contents
        # TODO what about pastebin which should really use a source-code analyzer
        content=TEXT(),
    )
    #ix = create_in('data/whoosh-index/', schema)
    #writer = ix.writer()
    # TODO cannot guarantee this dir exists
    ix = open_dir(request.registry.settings['spline.search.whoosh.path'])

    from sqlalchemy import create_engine
    from spline.models import session
    #session.bind = create_engine('postgresql:///spline?host=/nail/home/amunroe/var/run')
    from spline_pastebin.models import Paste

    query_parser = QueryParser('content', schema=schema)
    whoosh_query = query_parser.parse(raw_query)

    with ix.searcher() as searcher:
        results = searcher.search(whoosh_query, limit=10)
        num_results = len(results)
        results = [repr(res) for res in results]

    return dict(
        whoosh_results=results,
        whoosh_results_count=num_results,
    )
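# A hedged sketch of the indexing counterpart to the view above: the view only
# reads the index, so something else must populate it. The Paste attribute
# names (id, creator_id, timestamp, content) are assumptions, not taken from
# the snippet.
def rebuild_pastebin_index(index_path, schema, session, Paste):
    from whoosh.index import create_in
    ix = create_in(index_path, schema)  # wipes any existing index first
    writer = ix.writer()
    for paste in session.query(Paste):
        writer.add_document(
            id=str(paste.id),                  # ID fields expect text values
            type='pastebin',
            creator_id=str(paste.creator_id),
            timestamp=paste.timestamp,         # DATETIME expects a datetime object
            content=paste.content,
        )
    writer.commit()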
def __init__(self, mode="normal"): self.db_handler = DbHandler() self.index_path = "index" self.ix = None self.writer = None self.spell = Spell.get_instance() """ By default, the StandardAnalyzer() is used. This analyzer is composed of a RegexTokenizer with a LowercaseFilter and an optional StopFilter (for removing stopwords) """ self.analyzer = self.__determine_analyzer(mode) """ The whoosh.fields.TEXT indexes the text and stores the term positions to allow phrase searching TEXT fields use StandardAnalyzer by default. To specify a different analyzer, use the analyzer keyword argument to the constructor, e.g. TEXT(analyzer=analysis.StemmingAnalyzer()) """ # Read the Vectors section in http://whoosh.readthedocs.io/en/latest/schema.html self.schema = Schema( doc_id=ID(stored=True), title=TEXT(stored=True), authors=TEXT(stored=True), pub_date=DATETIME(stored=True), abstract=TEXT(stored=True), content=TEXT(vector=True, analyzer=self.analyzer), pdf_name=STORED, ) """ To test whether a directory currently contains a valid index, use index.exists_in: """ exists = exists_in(self.index_path) if exists: print("Index already exists") # A valid index exists, reload the index self.__reload_index() else: print("Index does not yet exist") # No valid index found, remove and recreate index rmtree(self.index_path, ignore_errors=True) self.__create_index()
def video_rebuild():
    print datetime.datetime.now()
    print 'video_rebuild'
    video_db = mysql_new.BaseDB(config.MYSQL_DEFINE_VIDEO)
    schema = Schema(movieid=ID(stored=True, unique=True),
                    gcid=ID(stored=True),
                    title=TEXT(stored=True, analyzer=analyzer_zhongwen),
                    pinyin_title=TEXT(stored=True, analyzer=analyzer_pinyin),
                    pic=ID(stored=True),
                    cover_width=STORED,
                    cover_height=STORED,
                    uid=ID(stored=True),
                    upline_time=DATETIME(stored=True, sortable=True),
                    duration=STORED)
    SQL = '''SELECT video_id as `movieid`, `duration`, `upline_time`, `title`,
                    `uid`, `pic`, `gcid`, `poster_width` as cover_width,
                    `poster_height` as cover_height
             FROM `short_media_info_v2` WHERE `status` in (1,2) '''
    res = video_db.query(SQL, ())
    if not res:
        return
    index_path = os.path.join(config.index_root_dir, 'video')
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    ix = create_in(index_path, schema=schema)
    storage = FileStorage(index_path)
    ix = storage.open_index()
    writer = ix.writer()
    for info in res:
        pinyin_title = ' '.join(lazy_pinyin(info.get('title').decode('utf8')))
        writer.add_document(movieid=str(info.get('movieid')).decode('utf8'),
                            gcid=str(info.get('gcid')).decode('utf8'),
                            title=info.get('title').decode('utf8'),
                            pinyin_title=pinyin_title,
                            uid=str(info.get('uid')).decode('utf8'),
                            pic=info.get('pic').decode('utf8'),
                            cover_width=info.get('cover_width'),
                            cover_height=info.get('cover_height'),
                            duration=info.get('duration'),
                            upline_time=info.get('upline_time'))
    writer.commit(mergetype=writing.CLEAR)
def almacenar_datos():
    # define the schema for the scraped data
    schem = Schema(nombre=TEXT(stored=True),
                   edad=NUMERIC(stored=True),
                   altura=NUMERIC(stored=True),
                   nacionalidad=KEYWORD(stored=True, commas=True),
                   pie=TEXT(stored=True),
                   posicion_principal=KEYWORD(stored=True, commas=True),
                   posicion_secundaria=KEYWORD(stored=True, commas=True),
                   valor=NUMERIC(stored=True),
                   equipo=TEXT(stored=True),
                   contrato=DATETIME(stored=True))

    # remove the index directory if it already exists
    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")

    # create the index
    ix = create_in("Index", schema=schem)
    # create a writer so we can add documents to the index
    writer = ix.writer()
    i = 0
    lista = almacenar_datos_bs()
    for jugador in lista:
        # add each player in the list to the index
        writer.add_document(nombre=str(jugador[0]),
                            edad=jugador[1],
                            altura=float(jugador[2]),
                            nacionalidad=str(jugador[3]),
                            pie=str(jugador[4]),
                            posicion_principal=str(jugador[5]),
                            posicion_secundaria=str(jugador[6]),
                            valor=float(jugador[7]),
                            equipo=str(jugador[8]),
                            contrato=jugador[9])
        i += 1
    writer.commit()
    print("Indexed " + str(i) + " players")
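# A hedged usage sketch for the index built above: combine a free-text query
# on `nombre` with a NumericRange on `valor`. The player name and the price
# ceiling are illustrative values, not from the original script.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.query import NumericRange

ix = open_dir("Index")
with ix.searcher() as searcher:
    name_query = QueryParser("nombre", ix.schema).parse("Pedri")
    affordable = NumericRange("valor", 0, 50000000)
    # & combines the two queries into a whoosh.query.And
    for hit in searcher.search(name_query & affordable, limit=5):
        print(hit["nombre"], hit["valor"], hit["equipo"])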
def init_extensions(app):
    whoosh_searcher.init_app(app)
    configure_uploads(app, upload_photos)
    mail.init_app(app)
    admin.init_app(app)
    mongo.init_app(app, "MONGO")
    oauth.init_app(app)
    login_manager.init_app(app)
    if app.config.get('USE_CACHE', False):
        cache.init_app(app, {})
    with app.app_context():
        # register the flask-admin views
        admin.add_view(admin_view.UsersModelView(mongo.db['users'], '用户管理'))
        admin.add_view(admin_view.CatalogsModelView(mongo.db['catalogs'], '栏目管理'))
        admin.add_view(admin_view.PostsModelView(mongo.db['posts'], '帖子管理'))
        admin.add_view(admin_view.PassagewaysModelView(mongo.db['passageways'], '温馨通道'))
        admin.add_view(admin_view.FriendLinksModelView(mongo.db['friend_links'], '友链管理'))
        admin.add_view(admin_view.PagesModelView(mongo.db['pages'], '页面管理'))
        admin.add_view(admin_view.FooterLinksModelView(mongo.db['footer_links'], '底部链接'))
        admin.add_view(admin_view.AdsModelView(mongo.db['ads'], '广告管理'))
        admin.add_view(admin_view.OptionsModelView(mongo.db['options'], '系统设置'))

        # initialize the Whoosh index
        chinese_analyzer = ChineseAnalyzer()
        post_schema = Schema(obj_id=ID(unique=True, stored=True),
                             title=TEXT(stored=True, analyzer=chinese_analyzer),
                             content=TEXT(stored=True, analyzer=chinese_analyzer),
                             create_at=DATETIME(stored=True),
                             catalog_id=ID(stored=True),
                             user_id=ID(stored=True))
        whoosh_searcher.add_index('posts', post_schema)
def fields_map(self, field_type):
    if field_type == "primary":
        return ID(stored=True, unique=True)
    type_map = {
        'date': types.Date,
        'datetime': types.DateTime,
        'boolean': types.Boolean,
        'integer': types.Integer,
        'float': types.Float,
    }
    if isinstance(field_type, str):
        field_type = type_map.get(field_type, types.Text)

    if field_type in (types.DateTime, types.Date):
        return DATETIME(stored=True, sortable=True)
    elif field_type == types.Integer:
        return NUMERIC(stored=True, numtype=int)
    elif field_type == types.Float:
        return NUMERIC(stored=True, numtype=float)
    elif field_type == types.Boolean:
        return BOOLEAN(stored=True)
    return TEXT(stored=True, analyzer=self.analyzer, sortable=False)
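# A hedged example of how a fields_map like this is typically consumed:
# turning a {column_name: type_name} description into a Whoosh Schema.
# The `columns` dict and the `mapper` instance are hypothetical.
from whoosh.fields import Schema

columns = {"id": "primary", "title": "text",
           "published": "datetime", "views": "integer"}
schema = Schema(**{name: mapper.fields_map(ftype)
                   for name, ftype in columns.items()})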
class Schema(SchemaClass):
    #: The id of the job.
    id = ID(stored=True, unique=True)
    #: The title of the job.
    title = TEXT(analyzer=stemming_analyzer)
    #: The name of the company.
    company = TEXT(analyzer=stemming_analyzer)
    #: Location as a comma-separated string of city and country.
    location = KEYWORD(lowercase=True, scorable=True, commas=True)
    #: The type of job.
    job_type = TEXT(analyzer=stemming_analyzer)
    #: The job tags as a comma-separated string of tag slugs.
    tags = KEYWORD(lowercase=True, scorable=True, commas=True)
    #: When was this job created?
    created = DATETIME(sortable=True)
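# A hedged usage sketch for this job schema: search two fields at once and
# order by the sortable `created` field, newest first. The index directory
# name and search terms are assumptions. Note that only `id` is stored, so
# hits expose only that field.
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser

ix = open_dir("jobs_index")
parser = MultifieldParser(["title", "company"], schema=ix.schema)
with ix.searcher() as searcher:
    results = searcher.search(parser.parse("python developer"),
                              sortedby="created", reverse=True, limit=20)
    job_ids = [hit["id"] for hit in results]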
def handle(self, *args, **kwargs):
    """Create the index, iterating over all the pages of the site."""
    schema = Schema(pk=NUMERIC(unique=True, stored=True),
                    title=TEXT,
                    summary=TEXT,
                    tags=KEYWORD(commas=True, scorable=True),
                    pub_date=DATETIME(sortable=True))
    if not os.path.exists(settings.INDEX):
        os.mkdir(settings.INDEX)
    ix = create_in(settings.INDEX, schema)
    writer = ix.writer()
    for page in Page.objects.all():
        tags = map(lambda x: x.title, page.tags.all())
        writer.add_document(title=page.title,
                            summary=page.summary,
                            tags=",".join(tags),
                            pk=page.pk,
                            pub_date=page.pub_date)
    writer.commit()
def get_schema():
    analyzer = StemmingAnalyzer(stoplist=STOP)
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    url=ID(stored=True),
                    content_length=NUMERIC(stored=True, sortable=True),
                    thread_votecount=NUMERIC(stored=True, sortable=True),
                    vote_count=NUMERIC(stored=True, sortable=True),
                    content=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    tags=KEYWORD(stored=True, commas=True),
                    is_toplevel=BOOLEAN(stored=True),
                    lastedit_date=DATETIME(stored=True, sortable=True),
                    rank=NUMERIC(stored=True, sortable=True),
                    author=TEXT(stored=True),
                    author_score=NUMERIC(stored=True, sortable=True),
                    author_handle=TEXT(stored=True),
                    author_email=TEXT(stored=True),
                    author_uid=ID(stored=True),
                    author_url=ID(stored=True),
                    uid=ID(stored=True),
                    type=NUMERIC(stored=True, sortable=True),
                    type_display=TEXT(stored=True))
    return schema
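# A hedged sketch of a date-filtered search over this schema, using an
# explicit DateRange on the sortable `lastedit_date` field; the index
# location and search terms are illustrative.
from datetime import datetime
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.query import DateRange

ix = open_dir("search_index")
text_query = QueryParser("content", ix.schema).parse("alignment")
recent = DateRange("lastedit_date", datetime(2020, 1, 1), None)  # open-ended
with ix.searcher() as searcher:
    for hit in searcher.search(text_query & recent,
                               sortedby="lastedit_date", reverse=True):
        print(hit["title"], hit["lastedit_date"])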
def build_schema(self, fields):
    schema_fields = {
        'id': ID(stored=True, unique=True),
        'django_ct': ID(stored=True),
        'django_id': ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(stored=True)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float)
        elif field_class.field_type == 'boolean':
            schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
        else:
            schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer())

        if field_class.document is True:
            content_field_name = field_class.index_fieldname

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
# collection index only has one element
news = db['news']

# create directory for index if not exists
if not os.path.exists("news"):
    os.makedirs("news")

# do query for new news
news_index = None
if not exists_in("news"):
    schema = Schema(id=ID(unique=True, stored=True),
                    d=TEXT(spelling=True, stored=True),
                    t=TEXT(spelling=True, stored=True),
                    tags=TEXT(stored=True),
                    time=DATETIME(stored=True),
                    link=TEXT(stored=True))
    news_index = create_in("news", schema)
    result = news.find()
else:
    news_index = open_dir("news")
    result = news.find({"_id": {"$gt": ObjectId(last_id["last_id"])}})
news_writer = news_index.writer()

# do query for new entities
entities_index = open_dir("entities")

# index each entry here
last_post = None
i = 0
import os.path
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup as bs4
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, DATETIME

schema = Schema(title=TEXT(stored=True),
                url=TEXT(stored=True),
                date=DATETIME(stored=True),
                content=TEXT,
                hash=ID(stored=True, unique=True))

if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", schema)

with open('Safari Bookmarks.html') as f:
    r = f.read()
h = bs4(r, "lxml")
links = h.find_all('a', href=re.compile('^http'))
print(len(links))

writer = ix.writer()
for i, link in enumerate(links):
    try:
        url = link.get('href')
        print(i, url)
    except Exception:
        # assumed error handling: skip links that fail to process
        continue
def get_schema():
    return Schema(titulo=TEXT(stored=True),
                  fecha_inicio=DATETIME(stored=True),
                  fecha_fin=DATETIME(stored=True),
                  descripcion=TEXT(stored=True),
                  categorias=KEYWORD(stored=True, commas=True, scorable=True))
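# The get_schema helpers in this collection are normally paired with
# boilerplate like the following (a hedged sketch; the directory name and
# document values are illustrative):
import os
from datetime import datetime
from whoosh.index import create_in

if not os.path.exists("Index"):
    os.mkdir("Index")
ix = create_in("Index", schema=get_schema())
writer = ix.writer()
writer.add_document(titulo="Feria del libro",
                    fecha_inicio=datetime(2024, 5, 1),
                    fecha_fin=datetime(2024, 5, 7),
                    descripcion="Feria anual del libro",
                    categorias="cultura,libros")
writer.commit()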
def get_schema():
    return Schema(titulo=TEXT(stored=True),
                  tituloOriginal=TEXT(stored=True),
                  fechaEstreno=DATETIME(stored=True),
                  director=TEXT(stored=True),
                  reparto=TEXT,
                  sinopsis=TEXT)
def init_extensions(app): """ 初始化插件 :param app: :return: """ global use_cache whoosh_searcher.init_app(app) configure_uploads(app, upload_img) mail.init_app(app) admin.init_app(app) mongo.init_app(app, 'MONGO') oauth.init_app(app) login_manager.init_app(app) # use_cache = app.config.get('USE_CHCHE', False) # if use_cache: # cache.init_app(app) with app.app_context(): # 添加flask-admin视图 admin.add_view(admin_view.UsersModelView(mongo.db['users'], '用户管理')) admin.add_view(admin_view.RolesModelView(mongo.db['roles'], '角色管理')) admin.add_view( admin_view.CatalogsModelView(mongo.db['catalogs'], '栏目管理', category='内容管理')) admin.add_view( admin_view.PostsModelView(mongo.db['posts'], '帖子管理', category='内容管理')) admin.add_view( admin_view.PassagewaysModelView(mongo.db['passageways'], '温馨通道', category='推广管理')) admin.add_view( admin_view.FriendLinksModelView(mongo.db['friend_links'], '友链管理', category='推广管理')) admin.add_view( admin_view.PagesModelView(mongo.db['pages'], '页面管理', category='推广管理')) admin.add_view( admin_view.FooterLinksModelView(mongo.db['footer_links'], '底部链接', category='推广管理')) admin.add_view( admin_view.AdsModelView(mongo.db['ads'], '广告管理', category='推广管理')) admin.add_view(admin_view.OptionsModelView(mongo.db['options'], '系统管理')) # 初始化whoosh索引 chinese_analyzer = ChineseAnalyzer() post_schema = Schema(obj_id=ID(unique=True, sortable=True), title=TEXT(sortable=True, analyzer=chinese_analyzer), content=TEXT(sortable=True, analyzer=chinese_analyzer), create_at=DATETIME(sortable=True), catalog_id=ID(sortable=True), user_id=ID(sortable=True)) whoosh_searcher.add_index('posts', post_schema)
    message=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    parents=TEXT(),
    added=TEXT(),
    removed=TEXT(),
    changed=TEXT(),
)

CHGSET_IDX_NAME = 'CHGSET_INDEX'

# used only to generate queries in journal
JOURNAL_SCHEMA = Schema(
    username=TEXT(),
    date=DATETIME(),
    action=TEXT(),
    repository=TEXT(),
    ip=TEXT(),
)


class WhooshResultWrapper(object):
    def __init__(self, search_type, searcher, matcher, highlight_items,
                 repo_location):
        self.search_type = search_type
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        self.fragment_size = 200
        self.repo_location = repo_location
def get_schema():
    return Schema(titulo=TEXT(stored=True),
                  imagen=TEXT(stored=True),
                  rango_web=TEXT(stored=True),
                  popularidad=TEXT(stored=True),
                  fecha_inicio=DATETIME(stored=True),
                  fecha_final=DATETIME(stored=True),
                  episodios=TEXT(stored=True),
                  sinopsis=TEXT(stored=True),
                  generos=KEYWORD(stored=True))
def get_schema():
    return Schema(titulo=TEXT(stored=True),
                  fecha=DATETIME(stored=True),
                  enlace=TEXT(stored=True),
                  resumen=TEXT(stored=True),
                  nombrefichero=ID(stored=True))
def schemaChampions():
    schema = Schema(idChampion=NUMERIC(stored=True),
                    name=TEXT(stored=True),
                    image=TEXT(stored=True),
                    releaseDate=DATETIME(stored=True))
    return schema
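# A hedged sketch of feeding this schema: DATETIME fields want datetime
# objects, so release dates scraped as strings must be parsed first. The
# directory name, champion data, and date format are assumptions.
import os
from datetime import datetime
from whoosh.index import create_in

if not os.path.exists("IndexChampions"):
    os.mkdir("IndexChampions")
ix = create_in("IndexChampions", schema=schemaChampions())
writer = ix.writer()
writer.add_document(idChampion=266,
                    name="Aatrox",
                    image="http://example.org/aatrox.png",
                    releaseDate=datetime.strptime("2013-06-13", "%Y-%m-%d"))
writer.commit()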
def get_response_schema():
    return Schema(link_tema=ID(stored=True),
                  fecha=DATETIME(stored=True),
                  texto=TEXT(stored=True),
                  autor=TEXT(stored=True))
def get_schema():
    return Schema(remitente=TEXT(stored=True),
                  destinatarios=TEXT(stored=True),
                  fecha=DATETIME(stored=True),
                  asunto=TEXT(stored=True),
                  contenido=TEXT(stored=True),
                  nombrefichero=ID(stored=True))
def get_schema():
    return Schema(titulo=TEXT(stored=True),
                  description=TEXT(stored=True),
                  categoria=TEXT(stored=True),
                  fecha=DATETIME(stored=True))
def get_schema_correo():
    return Schema(remitente=TEXT(stored=True),
                  destinatarios=KEYWORD(stored=True),
                  fecha=DATETIME(stored=True),
                  asunto=TEXT(stored=True),
                  contenido=TEXT(stored=True))
class SearchIndexer:
    """Full-text search indexer."""

    # schema for searches of all (public + private) info
    SCHEMA = Schema(
        type=ID(stored=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        text_private=TEXT(),
        changed=DATETIME(),
    )

    # schema for searches of public info only
    SCHEMA_PUBLIC = Schema(
        type=ID(stored=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        changed=DATETIME(),
    )

    def __init__(self, index_dir: FilenameOrPath):
        """Initialize given an index dir path."""
        self.index_dir = Path(index_dir)
        self.index_dir.mkdir(exist_ok=True)
        # query parser for all (public + private) content
        self.query_parser_all = MultifieldParser(
            ["text", "text_private"], schema=self.SCHEMA)
        # query parser for public content only
        self.query_parser_public = QueryParser("text", schema=self.SCHEMA_PUBLIC)

    def index(self, overwrite=False):
        """Return the index; create it if it doesn't exist."""
        index_dir = str(self.index_dir)
        if overwrite or not index.exists_in(index_dir):
            return index.create_in(index_dir, self.SCHEMA)
        return index.open_dir(index_dir)

    def reindex_full(self, db_handle: DbReadBase):
        """Reindex the whole database."""
        with self.index(overwrite=True).writer() as writer:
            for obj_dict in iter_obj_strings(db_handle):
                writer.add_document(
                    type=obj_dict["class_name"].lower(),
                    handle=obj_dict["handle"],
                    text=obj_dict["string"],
                    text_private=obj_dict["string_private"],
                    changed=obj_dict["changed"],
                )

    @staticmethod
    def format_hit(hit: Hit) -> Dict[str, Any]:
        """Format a search hit."""
        return {
            "handle": hit["handle"],
            "object_type": hit["type"],
            "rank": hit.rank,
            "score": hit.score,
        }

    def search(
        self,
        query: str,
        page: int,
        pagesize: int,
        include_private: bool = True,
        extend: bool = False,
    ):
        """Search the index.

        If `include_private` is true, also include private objects and
        search in private fields.
        """
        query_parser = (self.query_parser_all
                        if include_private else self.query_parser_public)
        query_parser.add_plugin(DateParserPlugin())
        # if private objects should not be shown, mask them out
        mask = None if include_private else Term("private", True)
        parsed_query = query_parser.parse(query)
        with self.index().searcher() as searcher:
            results = searcher.search_page(parsed_query, page, pagesize, mask=mask)
            return results.total, [self.format_hit(hit) for hit in results]
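# Example usage of the class above (hedged: `db` stands in for a DbReadBase
# handle obtained elsewhere, and the query string is illustrative):
indexer = SearchIndexer("indexdir")
indexer.reindex_full(db)
total, hits = indexer.search("john", page=1, pagesize=20,
                             include_private=False)
for hit in hits:
    print(hit["handle"], hit["object_type"], hit["score"])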
def get_schema():
    return Schema(titulo=TEXT(stored=True),
                  fechaInicio=DATETIME(stored=True),
                  fechaFin=DATETIME(stored=True),
                  descripcion=TEXT(stored=True),
                  categorias=TEXT(stored=True))
    # hierarchical index of ids path ('/' is the separator)
    parent_ids = FieldType(format=Existence(), analyzer=PathTokenizer(),
                           stored=True, unique=False)
    name = TEXT(stored=True, analyzer=accent_folder)
    slug = ID(stored=True)
    description = TEXT(stored=True, analyzer=accent_folder)
    text = TEXT(stored=False, analyzer=accent_folder)


_default_dyn_fields = {
    '*_prefix': EdgeNgramField(),
    '*_at': DATETIME(stored=True, sortable=True),
}


def DefaultSearchSchema(*args, **kwargs):
    schema = _DefaultSearchSchema()
    for name, field in _default_dyn_fields.items():
        schema.add(name, field, glob=True)
    return schema


def indexable_role(principal):
    """
    Returns a string suitable for query against `allowed_roles_and_users`
    field.
            t.startchar = start_char + match.start()
            t.endchar = start_char + match.end()
            yield t


def StanfordAnalyzer(lowercase=False):
    tokenizer = StanfordTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer


schema = Schema(id=ID(stored=True),
                path=ID(stored=True),
                body=TEXT(analyzer=StanfordAnalyzer()),
                year=DATETIME(stored=True),
                tags=KEYWORD(stored=True),
                names=KEYWORD(stored=True))

if __name__ == '__main__':
    if not os.path.exists('../TIMEindex'):
        os.mkdir('../TIMEindex')
    ix = index.create_in('../TIMEindex', schema=schema, indexname="TIME")
    ix = index.open_dir('../TIMEindex', indexname="TIME")
    writer = ix.writer()
    for decade in os.listdir('../rich_texts_txt'):
        if decade.startswith('.'):
            continue
        path = os.path.join('../rich_texts_txt', decade)
def get_schema():
    return Schema(title=TEXT(stored=True),
                  date_start=DATETIME(stored=True),
                  date_end=DATETIME(stored=True),
                  description=TEXT(stored=True),
                  categoria=KEYWORD(stored=True))
def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start', stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True, analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
import os

from whoosh import index
from whoosh.fields import Schema, TEXT, ID, DATETIME, NUMERIC
from whoosh.qparser import QueryParser
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from tqdm import tqdm

from ..scraper.model import Message
from .cleaning import clean_message

# Set up the index schema
schema = Schema(
    list_id=ID(stored=True),
    message_id=ID(stored=True),
    content=TEXT(stored=True),
    author=TEXT(stored=True),
    subject=TEXT(stored=True),
    sent_at=DATETIME(stored=True),
    thread_parent=NUMERIC(stored=True),
    thread_idx=NUMERIC(stored=True),
    thread_indent=NUMERIC(stored=True),
    page=TEXT(stored=True),
)


def open_index(index_dir):
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
        return index.create_in(index_dir, schema)
    else:
        return index.open_dir(index_dir)
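# A hedged sketch of populating this index from the scraper's database; the
# database URL handling and the Message attribute names are assumptions
# beyond what the imports above guarantee.
def build_index(index_dir, db_url):
    ix = open_index(index_dir)
    session = sessionmaker(bind=create_engine(db_url))()
    with ix.writer() as writer:  # the context manager commits on exit
        for msg in tqdm(session.query(Message)):
            writer.add_document(
                list_id=str(msg.list_id),
                message_id=str(msg.message_id),
                content=clean_message(msg.content),
                author=msg.author,
                subject=msg.subject,
                sent_at=msg.sent_at,  # DATETIME expects a datetime object
            )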