(i.e. law in most cases) may contain.

law_name -- name of the document. Searchable and stored.
law_body -- the intro and articles of a law. Searchable only.
law_num_date -- the number of the law and the exact date. Searchable and stored.
pub_year -- the year of the Official Gazette publication. Sortable and stored.
article_one_title, article_one_str -- title and first few sentences of
    article one. Stored only, for display in search results.
"""

schema = Schema(
    law_name=TEXT(analyzer=lang_ana, stored=True),
    law_body=TEXT(analyzer=lang_ana),
    law_num_date=ID(stored=True),
    # A doc can have multiple agency and content-type tags
    agency_tag=KEYWORD(stored=True),
    content_type_tag=KEYWORD(stored=True),
    pub_year=NUMERIC(sortable=True, stored=True),
    article_one_title=STORED,
    article_one_str=STORED)

# CREATE AN INDEX
"""
The documents will be stored according to the defined schema.
Fields that are indexed can be searched; some fields can be stored
without being indexed, just to show up in search results.
"""

# Create (or open an existing) index directory
if os.path.exists("scripts/indexdir"):
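# --- Illustrative sketch (not from the original snippet): how the schema
# above might be used to create the index and add one document. The field
# values and the create/open branch are assumptions.
from whoosh.index import create_in, open_dir

if not os.path.exists("scripts/indexdir"):
    os.makedirs("scripts/indexdir")
    ix = create_in("scripts/indexdir", schema)
else:
    ix = open_dir("scripts/indexdir")

writer = ix.writer()
writer.add_document(law_name=u"Law on Official Publications",
                    law_body=u"Article 1. This law regulates ...",
                    law_num_date=u"15/2020 - 2020-05-01",
                    agency_tag=u"parliament",
                    pub_year=2020,
                    article_one_title=u"Scope",
                    article_one_str=u"This law regulates the publication ...")
writer.commit()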
def build_schema(self, fields): schema_fields = { ID: WHOOSH_ID(stored=True, unique=True), DJANGO_CT: WHOOSH_ID(stored=True), DJANGO_ID: WHOOSH_ID(stored=True), } # Grab the number of keys that are hard-coded into Haystack. # We'll use this to (possibly) fail slightly more gracefully later. initial_key_count = len(schema_fields) content_field_name = '' for field_name, field_class in fields.items(): if field_class.is_multivalued: if field_class.indexed is False: schema_fields[field_class.index_fieldname] = IDLIST( stored=True, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = KEYWORD( stored=True, commas=True, scorable=True, field_boost=field_class.boost) elif field_class.field_type in ['date', 'datetime']: schema_fields[field_class.index_fieldname] = DATETIME( stored=field_class.stored, sortable=True) elif field_class.field_type == 'integer': schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=int, field_boost=field_class.boost) elif field_class.field_type == 'float': schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=float, field_boost=field_class.boost) elif field_class.field_type == 'boolean': # Field boost isn't supported on BOOLEAN as of 1.8.2. schema_fields[field_class.index_fieldname] = BOOLEAN( stored=field_class.stored) elif field_class.field_type == 'ngram': schema_fields[field_class.index_fieldname] = NGRAM( minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) elif field_class.field_type == 'edge_ngram': schema_fields[field_class.index_fieldname] = NGRAMWORDS( minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost) else: # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True) schema_fields[field_class.index_fieldname] = TEXT( stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True) if field_class.document is True: content_field_name = field_class.index_fieldname schema_fields[field_class.index_fieldname].spelling = True # Fail more gracefully than relying on the backend to die if no fields # are found. if len(schema_fields) <= initial_key_count: raise SearchBackendError( "No fields were found in any search_indexes. Please correct this before attempting to search." ) return (content_field_name, Schema(**schema_fields))
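# --- Illustrative sketch (not from the original snippet): what the mapping
# above produces. The field names in this comment are hypothetical.
# A Haystack index declaring
#     text = indexes.CharField(document=True)
#     pub_date = indexes.DateTimeField()
#     tags = indexes.MultiValueField()
# comes out of build_schema() roughly as
#     ('text', Schema(django_ct=..., django_id=..., id=...,
#                     text=TEXT(stored=True, analyzer=ChineseAnalyzer(),
#                               sortable=True),
#                     pub_date=DATETIME(stored=True, sortable=True),
#                     tags=KEYWORD(stored=True, commas=True, scorable=True)))
# i.e. the method returns (content_field_name, schema), where
# content_field_name is the field flagged with document=True.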
    if i[1] >= 10:
        bookdict_all_sort_upper10.append(i)

remove_words = set({
    "(", ")", "(", ")", "[", "]", "「", "」", "+", "-", "*", "$", "'", '"',
    "、", ".", "”", "’", ":", ";", "_", "/", "?", "!", "。", ",", "=", "=",
    " ", '『', '』'
})

# Sudachi setup
tokenizer_obj = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C

# Disable the stoplist on content so single-character tokens are kept
schema = Schema(title=TEXT(stored=True),
                content=TEXT(stored=True,
                             analyzer=StandardAnalyzer(stoplist=None)),
                count=NUMERIC(stored=True, sortable=True))

if not os.path.exists("heroku_index"):
    os.mkdir("heroku_index")
ix = create_in("heroku_index", schema)

# Build the index
writer = ix.writer()
for num in range(len(bookdict_all_sort_upper10)):
    titlewords = set([
        m.surface() for m in tokenizer_obj.tokenize(
            bookdict_all_sort_upper10[num][0], mode)
    ])
    titlewords = titlewords.union(
        set([
            m.normalized_form() for m in tokenizer_obj.tokenize(
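# --- Illustrative sketch (not from the original snippet): why stoplist=None
# is needed above. StandardAnalyzer normally chains in a StopFilter that
# drops stopwords and tokens shorter than two characters, which would erase
# single-character Japanese tokens; passing stoplist=None omits that filter.
from whoosh.analysis import StandardAnalyzer

default_ana = StandardAnalyzer()
no_stop_ana = StandardAnalyzer(stoplist=None)
print([t.text for t in default_ana(u"a an x")])  # [] -- everything filtered
print([t.text for t in no_stop_ana(u"a an x")])  # ['a', 'an', 'x']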
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")
        if option == "1":
            sent_tokenize_list1 = sent_tokenize(file_content_doc1,
                                                language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2,
                                                language='english')
            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")
            my_analyzer = (RegexTokenizer() | StopFilter() |
                           LowercaseFilter() | Lemmatizer())
            pos_tagger = (RegexTokenizer() | StopFilter() |
                          LowercaseFilter() | PosTagger())
            wordnetsyn1 = (RegexTokenizer() | StopFilter() |
                           LowercaseFilter() | WordNetSynsets())
            wordnetsyn2 = (RegexTokenizer() | StopFilter() |
                           LowercaseFilter() | WordNetSynsets1())
            wordnetsyn3 = (RegexTokenizer() | StopFilter() |
                           LowercaseFilter() | WordNetSynsets2())
            wordnetsyn4 = (RegexTokenizer() | StopFilter() |
                           LowercaseFilter() | WordNetSynsets3())

            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True,
                                          analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True,
                                           analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))
            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()
            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            writer.commit()
            print_index_details(ix)
            print("\n\n Index created with various features as its fields")
        elif option == "2":
            # open the same directory created in option 1
            ix = index.open_dir("index_task3_min")
            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser([
                    "standard", "stem_text", "lemma", "pos_text", "hyponym",
                    "meronyms", "hypernym", "holonym"
                ], schema=ix.schema, group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                    print("\n")
        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
        else:
            print("\n Not a valid choice, try again...!")
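# --- Illustrative sketch (not from the original snippet): the effect of
# qparser.OrGroup.factory(0.5) used in the query branch above. A plain
# OrGroup ranks a document matching one query term about as well as one
# matching all of them; the factory variant scales scores by how many of
# the OR'ed terms matched, so fuller matches rank higher.
from whoosh import qparser
from whoosh.qparser import QueryParser

plain = QueryParser("standard", schema=None, group=qparser.OrGroup)
scaled = QueryParser("standard", schema=None,
                     group=qparser.OrGroup.factory(0.5))
print(plain.parse(u"rural science"))   # Or([Term(...), Term(...)])
print(scaled.parse(u"rural science"))  # same Or query, with scaled scoring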
import os
import shutil

import pymongo
from flask import Flask, render_template, request, jsonify
from whoosh.fields import Schema, TEXT, ID, KEYWORD
from whoosh.index import create_in, open_dir
from whoosh.qparser import MultifieldParser
# jieba ships with built-in whoosh integration
from jieba.analyse import ChineseAnalyzer

# Build the Chinese analyzer
analyzer = ChineseAnalyzer()

# Search within url, title, tags, note and article, then use the matched
# _id values to fetch the full records from the database.
schema = Schema(
    nid=ID(unique=True, stored=True),
    title=TEXT(phrase=False),
    tags=KEYWORD(lowercase=True, commas=True, scorable=True),
    people=KEYWORD(lowercase=True, commas=True, scorable=True),
)


# Create the on-disk index
def init_search():
    if os.path.exists("indexdir"):
        shutil.rmtree('indexdir')
    os.mkdir("indexdir")
    create_in("indexdir", schema)
    return open_dir("indexdir")


# Initialize search
ix = init_search()
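# --- Illustrative sketch (not from the original snippet): the search flow the
# comments above describe -- match in whoosh, then fetch the full rows from
# MongoDB by the stored nid. The collection name `notes` is hypothetical.
def search(keyword, db):
    with ix.searcher() as searcher:
        parser = MultifieldParser(["title", "tags", "people"], schema=schema)
        hits = searcher.search(parser.parse(keyword), limit=20)
        nids = [hit["nid"] for hit in hits]
    return list(db.notes.find({"_id": {"$in": nids}}))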
    def get_absolute_url(self):
        return reverse('recommendations') + '?seed=%s' % self.paper_id

    def get_title(self):
        return self.title

    def set_rank(self, rank):
        self.rank = rank
        return self


paper_schema = Schema(
    paper_id=ID(stored=True),
    title=TEXT(stored=True),
    abstract=TEXT(analyzer=StemmingAnalyzer()),
    paper_url=TEXT(),
    aspect_tasks=KEYWORD,
    aspect_methods=KEYWORD,
    aspect_datasets=KEYWORD,
)

if settings.ASPECT_KNN_WHOOSH_INDEX_PATH and os.path.exists(settings.ASPECT_KNN_WHOOSH_INDEX_PATH):
    ix = index.open_dir(settings.ASPECT_KNN_WHOOSH_INDEX_PATH)
else:
    ix = None

# Load vector models (each path setting is optional; missing ones stay None)
generic_vecs = (KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_GENERIC_W2V_PATH,
                                                  limit=settings.ASPECT_KNN_LIMIT)
                if settings.ASPECT_KNN_GENERIC_W2V_PATH
                and os.path.exists(settings.ASPECT_KNN_GENERIC_W2V_PATH)
                else None)
task_vecs = (KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_TASK_W2V_PATH,
                                               limit=settings.ASPECT_KNN_LIMIT)
             if settings.ASPECT_KNN_TASK_W2V_PATH
             and os.path.exists(settings.ASPECT_KNN_TASK_W2V_PATH)
             else None)
method_vecs = (KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_METHOD_W2V_PATH,
                                                 limit=settings.ASPECT_KNN_LIMIT)
               if settings.ASPECT_KNN_METHOD_W2V_PATH
               and os.path.exists(settings.ASPECT_KNN_METHOD_W2V_PATH)
               else None)
import string

from nltk import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from whoosh import index
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED

p_s = PorterStemmer()
s_s = SnowballStemmer("english")
kgram_numbers = [4, 5]  # int(sys.argv[1])
translate_tab = string.maketrans("", "")
stops = stopwords.words("english")
stopset = set()
for each in stops:
    stopset.add(each)

my_schema = Schema(docId=ID(stored=True),
                   title=TEXT(stored=True),
                   body=TEXT(),
                   tags=KEYWORD(stored=True))


def strip_content(content):
    sb = ""
    title = ""
    c = content.split("\n")
    for line in c:
        if line != "\n" and line != "" and len(line) > 1:
            title = line.rstrip()
            print(title)
            break
    for line in c:
        try:
            q = line.decode('ascii')
def test_build_attrs(self):
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    assert not adapter.indexable
    assert adapter.doc_attrs == {}

    adapter = SAAdapter(Entity, schema)
    assert not adapter.indexable

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        'object_key', 'id', 'name', 'slug', 'object_type', 'text',
        'created_at', 'updated_at', 'name_prefix', 'owner', 'owner_name',
        'creator_name', 'creator', 'allowed_roles_and_users', 'tag_ids',
        'tag_text',
    }
    assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))

    assert set(schema.names()) == {
        'object_key', 'id', 'object_type', 'name', 'slug', 'text',
        'created_at', 'updated_at', 'name_prefix', 'owner', 'owner_name',
        'creator_name', 'creator', 'allowed_roles_and_users', 'tag_ids',
        'tag_text',
    }

    schema = Schema(
        id=NUMERIC(numtype=int, bits=64, signed=False, stored=True,
                   unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {'id', 'text', 'num', 'name'}
    assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))

    assert set(schema.names()) == {'id', 'text', 'num', 'name'}
    assert isinstance(schema['text'], TEXT)
    assert isinstance(schema['num'], NUMERIC)
                                     countryOfOrigin text, overview text)''')
c.close()

# initialise sentic net
sn = SenticNet()

# stems, strips accents (so words like café and façade match cafe/facade)
# and removes stopwords
hsn_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | StopFilter()

SCHEMA = Schema(
    filename=ID(unique=True, stored=True, analyzer=hsn_analyzer),
    content=TEXT(analyzer=hsn_analyzer, spelling=True),
    price=NUMERIC(sortable=True, stored=True),
    rating=NUMERIC(sortable=True, stored=True),
    noOfReviews=NUMERIC(sortable=True, stored=True),
    savings=NUMERIC(sortable=True, stored=True),
    percentageSavings=NUMERIC(sortable=True, stored=True),
    review=TEXT(analyzer=hsn_analyzer, spelling=True),
    productDesc=TEXT(stored=True),
    reviewPolarity=NUMERIC(sortable=True, stored=True),
    countryOfOrigin=TEXT(sortable=True, stored=True),
    overview=TEXT(stored=True),
)


# check whether a string parses as a float
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
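# --- Illustrative sketch (not from the original snippet): what the analyzer
# chain above does to raw text.
tokens = [t.text for t in hsn_analyzer(u"Accented cafés are reviewed")]
print(tokens)  # roughly ['accent', 'cafe', 'review'] -- stemmed,
               # accent-folded, stopwords dropped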
from whoosh.filedb.filestore import RamStorage from whoosh.qparser import QueryParser from whoosh.query import Term from whoosh.fields import Schema, TEXT, ID, KEYWORD from corpint.core import project from corpint.model.entity import Entity schema = Schema(uid=ID(stored=True), fingerprint=TEXT, country=KEYWORD, name=TEXT(stored=True)) class EntityIndex(object): def __init__(self): storage = RamStorage() self.index = storage.create_index(schema) def build(self): project.log.info("Building entity search index...") writer = self.index.writer() q = Entity.find_by_origins(origins=[]) q = q.filter(Entity.active == True) # noqa count = 0 for entity in q: for fp in entity.fingerprints: writer.add_document(uid=entity.uid, fingerprint=fp, country=entity.country, name=entity.name)
def get_response_schema(): return Schema(link_tema=ID(stored=True), fecha=DATETIME(stored=True), texto=TEXT(stored=True), autor=TEXT(stored=True))
from bs4 import BeautifulSoup
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser
import re, os, codecs, sys


# Skip elements inside style/script/[document]/head/title tags and HTML
# comments, keeping only the human-visible text
def visible(element):
    if element.parent.name in [
            'style', 'script', '[document]', 'head', 'title'
    ]:
        return False
    elif re.match('<!--.*-->', element.encode('utf-8')):
        return False
    return True


schema = Schema(id=ID(stored=True), content=TEXT(stored=True))
dir = os.listdir('sample')
ix = create_in("database", schema)
for l in dir:
    print l
    p = "sample/" + l
    html = codecs.open(p, "r", "utf-8").read()
    soup = BeautifulSoup(html, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(visible, texts)
    s = u''
    for elem in visible_texts:
        if (elem != u''):
            s += elem.strip(" \n\t\r") + " "
    writer = ix.writer()
    writer.add_document(id=l.rsplit(".html", 1)[0].decode('utf-8'), content=s)
def test_build_attrs_2() -> None:
    schema = Schema()
    adapter = SAAdapter(Entity, schema)
    assert not adapter.indexable
def test_build_attrs_1() -> None: schema = Schema() adapter = SAAdapter(SANotIndexable, schema) assert not adapter.indexable assert adapter.doc_attrs == {}
def __init__(self, options, columns): super(WhooshFDW, self).__init__(options, columns) self.columns = columns self.indexdir = options["indexdir"] self.schema = Schema(title=NGRAM(stored=True))
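# --- Illustrative sketch (not from the original snippet): why NGRAM suits a
# foreign data wrapper serving substring-style matches on title -- the field
# indexes every 2-4 character gram of the text by default, so a query for
# any fragment of at least minsize characters can hit.
from whoosh.analysis import NgramAnalyzer

print([t.text for t in NgramAnalyzer(2, 4)(u"whoosh")])
# grams such as 'wh', 'who', 'whoo', 'ho', 'hoo', ...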
from whoosh.fields import Schema, KEYWORD, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

SCHEMA = Schema(title=TEXT(stored=True), keywords=KEYWORD)


class MovieStore:
    """Interface for searching movies by keyword."""

    def __init__(self, data_source):
        self.index = RamStorage().create_index(SCHEMA)
        self.data_source = data_source

    def initialize(self):
        writer = self.index.writer()
        for doc in self.data_source.get_documents():
            writer.add_document(**doc)
        writer.commit()

    def query_for_titles(self, keywords):
        with self.index.searcher() as searcher:
            query = QueryParser("keywords", self.index.schema).parse(" ".join(keywords))
            # materialize the results before the with-block closes the
            # searcher; a lazy map would be consumed after it is released
            return [str(res['title']) for res in searcher.search(query)]
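# --- Illustrative sketch (not from the original snippet): wiring MovieStore
# to a trivial in-memory data source. ListDataSource and the documents are
# hypothetical.
class ListDataSource:
    def __init__(self, docs):
        self.docs = docs

    def get_documents(self):
        return self.docs


store = MovieStore(ListDataSource([
    {"title": u"Alien", "keywords": u"space horror"},
    {"title": u"Toy Story", "keywords": u"animation toys"},
]))
store.initialize()
print(store.query_for_titles(["space"]))  # ['Alien']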
def get_schema(): return Schema(titulo=TEXT(stored=True), fecha=DATETIME(stored=True), enlace=TEXT(stored=True), resumen=TEXT(stored=True), nombrefichero=ID(stored=True))
publisher.channel = "Script"

config_section = 'Indexer'
p = Process(config_section)

# Indexer configuration - index dir and schema setup
baseindexpath = join(os.environ['AIL_HOME'],
                     p.config.get("Indexer", "path"))
indexRegister_path = join(os.environ['AIL_HOME'],
                          p.config.get("Indexer", "register"))
indexertype = p.config.get("Indexer", "type")
INDEX_SIZE_THRESHOLD = int(p.config.get("Indexer", "index_max_size"))

if indexertype == "whoosh":
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True, unique=True),
                    content=TEXT)
    if not os.path.exists(baseindexpath):
        os.mkdir(baseindexpath)

    # create the index register if not present
    time_now = int(time.time())
    if not os.path.isfile(indexRegister_path):  # indexes are not organised
        print("Indexes are not organized")
        print("moving all files in folder 'old_index' ")
        # move all files to the old_index folder
        move_index_into_old_index_folder(baseindexpath)
        print("Creating new index")
        # create all_index.txt
        with open(indexRegister_path, 'w') as f:
            f.write(str(time_now))
import flask
from flask import request, jsonify
from whoosh import index, scoring
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.qparser import QueryParser
from whoosh.analysis import SimpleAnalyzer, CharsetFilter, KeywordAnalyzer
#from whoosh.support.charset import accent_map

# ===== GLOBAL VARIABLES ========
file_dir = "files"
index_dir = "index"
response = []
schema_name = "search"
schema = Schema(path=ID(unique=True, stored=True),
                time=STORED,
                content=TEXT(analyzer=KeywordAnalyzer()))
#my_analyzer = SimpleAnalyzer() | CharsetFilter(accent_map)
#schema_fuzzy = Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT(analyzer=my_analyzer))

# ========= REST API =============
app = flask.Flask(__name__)
app.config["DEBUG"] = True


# Home
@app.route('/', methods=['GET'])
def home():
    return "<h1>Home</h1><p>Try to pass search terms with: http://127.0.0.1:5000/search?q=comma,separated,search,terms</p>"
import json
import os
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.fields import Schema

# Create the schema; stored=True means the value is saved in the index and
# can be returned with search results
schema = Schema(link=ID(stored=True),      # link to the file
                filetype=ID(stored=True)   # file type
                )


def getFiles(path):
    file = open(path, 'r')
    content = file.read()
    files = json.JSONDecoder(strict=False).decode(content)
    return files


def createIndex():
    global ix
    if not os.path.exists("index_for_imag"):
        os.mkdir("index_for_imag")
        ix = create_in("index_for_imag", schema)
    else:
        ix = open_dir('index_for_imag')
    writer = ix.writer()
    # School of Electronic and Optical Engineering
    paths = ['/Users/lww/PycharmProjects/WebSearchingSystem/Spider/history_images.json']
    for path in paths:
def get_schema(): return Schema(remitente=TEXT(stored=True), destinatarios=KEYWORD(stored=True), asunto=TEXT(stored=True), contenido=TEXT(stored=True))
from whoosh.qparser import MultifieldParser
from whoosh.query import And, Every, Term

from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
from galaxy.util.search import parse_filters

log = logging.getLogger(__name__)

schema = Schema(
    id=NUMERIC(stored=True),
    name=TEXT(field_boost=1.7, stored=True),
    description=TEXT(field_boost=1.5, stored=True),
    long_description=TEXT(stored=True),
    homepage_url=TEXT(stored=True),
    remote_repository_url=TEXT(stored=True),
    repo_owner_username=TEXT(stored=True),
    categories=KEYWORD(stored=True, commas=True, scorable=True),
    times_downloaded=STORED,
    approved=STORED,
    last_updated=STORED,
    repo_lineage=STORED,
    full_last_updated=STORED)


class RepoWeighting(scoring.BM25F):
    """
    Affect the BM25F scoring model through the final method.
    source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ
    """
    use_final = True
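# --- Illustrative sketch (not from the original snippet): the shape of the
# final() hook that use_final = True enables on the class above. The boost
# formula here is hypothetical, not the Tool Shed's actual weighting.
import math

class DownloadBoostWeighting(scoring.BM25F):
    use_final = True

    def final(self, searcher, docnum, score):
        # fold the stored download count into the BM25F text score
        downloads = searcher.stored_fields(docnum).get("times_downloaded") or 0
        return score * (1.0 + math.log10(1.0 + downloads))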
class Index: schema = Schema(content=TEXT(stored=True, analyzer=Analyzer()), user=ID(stored=True), mentionsUsers=KEYWORD(stored=True), mentionsRoles=KEYWORD(stored=True), time=DATETIME) def __init__(self, dir, authorIds={}, context=None, start=True, baseDir=None): if not os.path.isdir(dir): os.mkdir(dir) if not list(os.listdir(dir)): self.ix = create_in(dir, Index.schema) if not baseDir: baseDir = os.path.join(os.path.split(dir)[0]) self.ix = whoosh.index.open_dir(dir) self.searchers = [] self.failedDir = os.path.join(baseDir, "failed") utils.ensureDir(self.failedDir) self.incomingDir = os.path.join(baseDir, "incoming") utils.ensureDir(self.incomingDir) self.indexer = threading.Thread(target=Index.indexLoop, args=[self]) self.logger = open(os.path.join(baseDir, "index.log"), "a") self.stopping = False if start: self.startIndexer() self.counts = {} def getCounts(self, uid): if uid in self.counts: return self.counts[uid] else: with self.getSearcher() as searcher: userNode = whoosh.query.Term("user", uid) # userId in the user field results = searcher.search(userNode) self.counts[uid] = len(results) return len(results) def getLast(self, uid, number): with self.getSearcher() as searcher: userNode = whoosh.query.Term("user", uid) # userId in the user field results = searcher.search( userNode, #sortedby="time", limit=number) return deduper(results, dedupe=True) def startIndexer(self): self.indexer.start() self.stopping = False def __del__(self): self.stopping = True if self.indexer.is_alive(): self.indexer.join() def log(self, text): print(text) self.logger.write(text) self.logger.write("\n") self.logger.flush() def indexLoop(self): print("Beginning index loop") writer = self.ix.writer() while not self.stopping: path = None try: for file in os.listdir(self.incomingDir): self.log("indexing {0}\n".format(file)) path = os.path.join(self.incomingDir, file) with open(path, "r", encoding="utf-8") as f: for line in f: doc = json.loads(line) ts = doc["timestamp"] t = datetime.datetime.fromtimestamp(ts) userid = "{0}".format(doc["user"]) writer.add_document( content=doc["content"].strip(), user=userid, mentionsUsers=",".join(doc["mentions"]), mentionsRoles=",".join(doc["role_mentions"]), time=t) writer.commit() self.log("committed {0}\n".format(file)) writer = self.ix.writer() for i in range(0, 5): try: os.remove(path) break except: pass self.searchers = [] self.getSearcher() else: time.sleep(10) except Exception as e: print(str(e)) self.log(str(e)) try: if path: if not os.path.isdir(self.failedDir): os.mkdir(self.failedDir) shutil.move(path, os.path.join(self.failedDir, file)) except Exception as ee: print(str(ee)) self.log(str(ee)) raise class ScopedSearcher: def __init__(self, parent, **kwargs): self.parent = parent self.handle = None self.args = kwargs self.time = time.time() def __enter__(self): try: self.handle = self.parent.searchers.pop() if (time.time() - self.handle.time) > 60: self.handle = self.handle.refresh() setattr(self.handle, 'time', time.time()) self.parent.log("Refreshed handle") return self.handle except: self.handle = self.parent.ix.searcher(**self.args) setattr(self.handle, 'time', time.time()) return self.handle def __exit__(self, a, b, c): self.parent.searchers.append(self.handle) self.handle = None def getSearcher(self, **kwargs): return Index.ScopedSearcher(self, **kwargs) def queryStats(self, text, expand=False, timer=NoTimer()): """ Returns a sorted tuple of (count, userName) """ with timer.sub_timer("query-stats") as t: with self.getSearcher() as searcher: from whoosh.qparser 
import QueryParser if expand: qp = QueryParser("content", schema=self.ix.schema, termclass=whoosh.query.Variations) else: qp = QueryParser("content", schema=self.ix.schema) q = qp.parse(text) with t.sub_timer("searcher.search") as s: results = searcher.search(q, limit=100000) with t.sub_timer("results") as s: counts = defaultdict(lambda: 0) with s.sub_timer("counts") as r: for r in results: u = r["user"] counts[u] += 1 with s.sub_timer("reverse") as r: counts = [(count, id) for id, count in counts.items() if count > 0] sc = reversed(sorted(counts)) return [v for v in sc] def deDupeResults(self, text, ret): exists = set([text.lower()]) i = len(ret) - 1 while i >= 0: r = ret[i] if not r[1].lower() in exists: exists.add(r[1].lower()) else: del ret[i] i = i - 1 return ret def queryLong(self, text, max=3, user=None, expand=False, timer=NoTimer()): with timer.sub_timer("query-long") as t: for attempt in range(0, 3): with t.sub_timer(attempt) as s: results = self.query(text, max * (2 + attempt), user, expand=(expand or (attempt > 0)), timer=t, dedupe=True) ret = list(results) if len(ret) >= max: ret = ret[:max] break return ret def queryUserOrI(self, text, max=3, userId=None, userName=None, expand=False, dedupe=False): with self.getSearcher(weighting=whoosh.scoring.TF_IDF) as searcher: from whoosh.qparser import QueryParser qp = QueryParser("content", schema=self.ix.schema) i_node = qp.parse("I") i_node.fieldname = "content" # "I" in content userNode = whoosh.query.Term("user", userId) # userId in the user field user_i_node = whoosh.query.And([userNode]) #, i_node]) userTextNode = qp.parse(userName) userTextNode.fieldname = "content" subjectNode = whoosh.query.Or([userTextNode, user_i_node]) qp2 = QueryParser("content", schema=self.ix.schema, termclass=whoosh.query.Variations) textNode = qp2.parse(text) textNode.fieldname = "content" q = whoosh.query.And([textNode, subjectNode]) results = searcher.search(q, limit=max) return Results(results) def query(self, text, max=3, user=None, expand=False, userNames=[], dedupe=False, timer=Timer("index.query")): """ text: the main text query of the content. expand=bool applies to this. 
if user or userNames are supplied, text is restricted to content (else no field res) user: id of a user to restrict to userNames: ORed with 'user', but a text search in content :/ """ with timer.sub_timer("query") as ot: with self.getSearcher(weighting=whoosh.scoring.TF_IDF) as searcher: with ot.sub_timer("inner-q") as t: with t.sub_timer("query-parse") as s: if expand: qp = QueryParser("content", schema=self.ix.schema, termclass=whoosh.query.Variations) else: qp = QueryParser("content", schema=self.ix.schema) nonExpandQP = QueryParser("content", schema=self.ix.schema) userNodes = [] # Massive spaghetti here textNode = qp.parse(text) textNode.fieldname = "content" if user: userNode = whoosh.query.Term("user", user) userNodes.append(userNode) if userNames: q2 = nonExpandQP.parse(" OR ".join(userNames)) q2.field = "content" userNodes.append(q2) q = textNode if userNodes: u = whoosh.query.Or(userNodes) q = whoosh.query.And([q, u]) with t.sub_timer("searcher.search") as s: results = searcher.search(q, limit=max) return deduper(results, dedupe=dedupe) async def collect_terms(self, t, usernames, corpusThresh, freq, minScore, corpusSize, filters={}, timer=NoTimer()): with timer.sub_timer("collect_terms outer") as t_: with self.getSearcher() as s: ret = [] q = whoosh.query.Term("content", t) for u in usernames: uq = filters.get(u, whoosh.query.Term("user", u)) with t_.sub_timer("search") as tt_: res = s.search(q, limit=100000000, filter=uq) with t_.sub_timer("length") as tt_: occs = res.scored_length() with t_.sub_timer("counting") as tt_: if occs and occs > corpusThresh * freq: score = (occs / self.getCounts(u)) / (freq / corpusSize) if score > 0: score = math.log(score) * 10 if score > minScore: ret.append((u, t, score)) return ret async def terms_async(self, usernames, corpusThresh=0.6, corpusNorm=False, minScore=450, timer=NoTimer()): ret = [] with timer.sub_timer("getCounts") as t: totalCounts = {u: self.getCounts(u) for u in usernames} num = re.compile(r"^\d+$") with timer.sub_timer("getReader") as t: reader = self.ix.reader() with timer.sub_timer("numDocs") as t: numDocs = reader.doc_count() with timer.sub_timer("initFilters") as t: with self.getSearcher() as s: filters = { u: s.search(whoosh.query.Term("user", u)) for u in usernames } with timer.sub_timer("termLoop") as t_: for t in reader.field_terms("content"): if num.match(t): continue if len(t) < 3: continue with t_.sub_timer("termFreq") as t__: freq = reader.frequency("content", t) if freq > 50 and freq < numDocs / 100: try: ret += await self.collect_terms(t, usernames, corpusThresh, freq, minScore, numDocs, filters=filters, timer=t_) except Exception as e: print( "Error while iterating through terms: {0}".format( e)) return ret def terms(self, usernames, corpusThresh=0.6, corpusNorm=False, minScore=450): ret = [] totalCounts = {u: self.getCounts(u) for u in usernames} num = re.compile(r"^\d+$") reader = self.ix.reader() numDocs = reader.doc_count() for t in reader.field_terms("content"): if num.match(t): continue if len(t) < 3: continue freq = reader.doc_frequency("content", t) if freq > 50 and freq < numDocs / 100: # print("{0}: {1}".format(t, freq)) with self.getSearcher() as s: q = whoosh.query.Term("content", t) uq = whoosh.query.Or( [whoosh.query.Term("user", u) for u in usernames]) qry = whoosh.query.And([q, uq]) res = s.search(qry, groupedby="user") res.estimated_length() d = res.groups("user") for u, ary in d.items(): if len(ary) > corpusThresh * freq: score = 1000000 * len(ary) / totalCounts[u] / freq if score > minScore: 
ret.append((u, t, score)) break return ret def countUserMentionsOthers(self, uid): with self.getSearcher() as s: uq = whoosh.query.Term("mentionsUsers", "*") qp = QueryParser("mentionsUsers", schema=self.ix.schema) tq = qp.parse("*") uq = whoosh.query.Term("user", uid) q = whoosh.query.And([uq, tq]) res = s.search(q, limit=10000000) return res def getMentionGraph(self, coreUsers: list): qp = QueryParser("mentionsUsers", schema=self.ix.schema) tq = qp.parse("*") # something in mentionUsers ret = defaultdict(lambda: defaultdict(int)) with self.getSearcher() as s: for uid in coreUsers: uq = whoosh.query.Term("user", uid) # limit to uid q = whoosh.query.And([uq, tq]) res = s.search(q, limit=10000000) thisUserContrib = ret[uid] for r in res: mentions = r["mentionsUsers"] mentions = mentions.split(",") for m in mentions: thisUserContrib[m] += 1 return ret def whoMentions(self, target: str, names: set): if type(names) != set: names = set(names) with self.getSearcher() as s: q = whoosh.query.Or( [whoosh.query.Term("content", n) for n in names]) uq = whoosh.query.Term("mentionsUsers", target) qry = whoosh.query.Or([q, uq]) res = s.search(qry, limit=10000000) counts = defaultdict(int) for r in res: counts[r["user"]] += 1 return counts def getTimes(self, userId): import whoosh.sorting from datetime import datetime, timedelta uq = whoosh.query.Term("user", userId) end = datetime.utcnow() end = datetime(end.year, end.month, end.day) gap = timedelta(hours=6) start = end - 180 * gap facet = whoosh.sorting.DateRangeFacet("time", start, end, gap) with self.getSearcher() as s: r = s.search(uq, groupedby=facet) g = r.groups() return g
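# --- Illustrative sketch (not from the original class): the per-user term
# affinity computed in collect_terms() above, with made-up numbers. A term
# scores high when a user writes it far more often than the corpus at large.
import math

occs, user_msgs = 40, 2000        # user's messages containing the term / their total
freq, corpus_size = 500, 1000000  # corpus messages containing the term / corpus total
ratio = (occs / user_msgs) / (freq / corpus_size)  # 0.02 / 0.0005 == 40.0
score = math.log(ratio) * 10                       # ~36.9; kept if > minScore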
# Note: Has to be run from Code folder # Decrypt files import decrypt # removing the .py fixed the error, but I don't know why? # Create/open index import os.path from whoosh.fields import Schema, TEXT, ID, KEYWORD from whoosh import index schema = Schema(content=TEXT, path=ID(stored=True), tags=KEYWORD(scorable=True)) #print(os.getcwd()) homeDir = os.path.realpath(__file__).replace('Code/search.py','') if not os.path.exists('Index'): os.mkdir('Index') ix = index.create_in("Index", schema) else: ix = index.open_dir("Index") # Add new files to index import json database = os.path.realpath(__file__).replace('Code/search.py','database.json') writer = ix.writer() with open(database,'r') as f: data = json.load(f) for item in data: if item['indexed'] == False:
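# --- Illustrative sketch (not from the original snippet): how the loop above
# plausibly continues. The 'path' and 'tags' keys are assumptions based on
# the schema fields.
#
#         with open(item['path']) as doc:
#             writer.add_document(content=doc.read(),
#                                 path=item['path'],
#                                 tags=item.get('tags', ''))
#         item['indexed'] = True
#
# writer.commit()
# with open(database, 'w') as f:
#     json.dump(data, f)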
from galaxy.exceptions import ObjectNotFound

import logging
log = logging.getLogger(__name__)

eggs.require("Whoosh")

import whoosh.index
from whoosh import scoring
from whoosh.fields import Schema, STORED, TEXT
from whoosh.qparser import MultifieldParser

schema = Schema(id=STORED,
                name=TEXT(field_boost=1.7, stored=True),
                description=TEXT(field_boost=1.5, stored=True),
                long_description=TEXT(stored=True),
                homepage_url=TEXT(stored=True),
                remote_repository_url=TEXT(stored=True),
                repo_owner_username=TEXT(stored=True),
                times_downloaded=STORED,
                approved=STORED,
                last_updated=STORED,
                full_last_updated=STORED)


class RepoWeighting(scoring.BM25F):
    """
    Affect the BM25F scoring model through the final method.
    source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ
    """
    use_final = True

    def final(self, searcher, docnum, score):
from galaxy.webapps.tool_shed import config from galaxy.webapps.tool_shed import model from galaxy.tools.loader_directory import load_tool_elements_from_path from galaxy import eggs eggs.require( "SQLAlchemy" ) eggs.require( "Whoosh" ) from whoosh.filedb.filestore import FileStorage from whoosh.fields import Schema, STORED, TEXT repo_schema = Schema( id=STORED, name=TEXT( stored=True ), description=TEXT( stored=True ), long_description=TEXT( stored=True ), homepage_url=TEXT( stored=True ), remote_repository_url=TEXT( stored=True ), repo_owner_username=TEXT( stored=True ), times_downloaded=STORED, approved=STORED, last_updated=STORED, full_last_updated=STORED ) tool_schema = Schema( name=TEXT( stored=True ), description=TEXT( stored=True ), owner=TEXT( stored=True ), id=TEXT( stored=True ), help=TEXT( stored=True ), version=TEXT( stored=True), repo_owner_username=TEXT( stored=True ), repo_id=STORED )
import os, os.path
import re
import pickle
from whoosh.analysis import StemmingAnalyzer
from whoosh import index
from whoosh.fields import Schema, ID, TEXT, STORED, IDLIST

if not os.path.exists('indexdir'):
    os.mkdir("indexdir")

schema = Schema(name=TEXT(stored=True),
                award_list=TEXT(stored=True),
                track_list=TEXT(stored=True),
                wikilink=TEXT(stored=True))

ix = index.create_in("indexdir", schema)
writer = ix.writer()

# loading dictionaries
DICT_artist_awards = open("FINAL_artist_awards_dict.pkl", "rb")
artist_awards = pickle.load(DICT_artist_awards)
DICT_artist_awards.close()

DICT_artist_tracks = open("FINAL_artist_tracks_dict.pkl", "rb")
artist_tracks = pickle.load(DICT_artist_tracks)
DICT_artist_tracks.close()

DICT_wikipage = open("FINAL_wikipage_dict.pkl", "rb")
wikipage_links = pickle.load(DICT_wikipage)
DICT_wikipage.close()

print('LEN - awards > ', len(artist_awards))
from solvertools.wordlist import WORDS from solvertools.normalize import slugify, sanitize from solvertools.util import data_path, corpus_path from whoosh.fields import Schema, ID, TEXT, KEYWORD, NUMERIC from whoosh.analysis import StandardAnalyzer from whoosh.index import create_in import nltk import os from tqdm import tqdm schema = Schema(slug=ID, text=TEXT(stored=True, analyzer=StandardAnalyzer()), definition=TEXT(stored=True, analyzer=StandardAnalyzer()), length=NUMERIC) def init_search_index(): nltk.download('wordnet') from nltk.corpus import wordnet get_synset = wordnet._synset_from_pos_and_offset def get_adjacent(synset): return [ name for pointer_tuples in synset._pointers.values() for pos, offset in pointer_tuples for name in get_synset(pos, offset).lemma_names() ] os.makedirs(data_path('search'), exist_ok=True) ix = create_in(data_path('search'), schema) writer = ix.writer(procs=4)
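# --- Illustrative sketch (not from the original snippet): what get_adjacent()
# above collects -- the lemma names of every synset the given synset points
# to (hypernyms, hyponyms, holonyms, ...), via WordNet's pointer table.
#
#   dog = wordnet.synset('dog.n.01')
#   get_adjacent(dog)  # e.g. ['canine', 'canid', 'puppy', 'pooch', ...]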
import cgi from google.appengine.ext import webapp from google.appengine.ext.webapp.util import run_wsgi_app from whoosh import store from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT from whoosh.index import getdatastoreindex from whoosh.qparser import QueryParser, MultifieldParser import logging SEARCHSCHEMA = Schema(content=TEXT(stored=True)) class MainPage(webapp.RequestHandler): def get(self): self.response.out.write('<html><body>') self.response.out.write(""" <form action="/search" method="get"> <div><input name="query" type="text" value=""><input type="submit" value="Search"></div> </form> </body> </html>""") # Write the submission form and the footer of the page self.response.out.write(""" <form action="/sign" method="post"> <div><textarea name="content" rows="3" cols="60"></textarea></div> <div><input type="submit" value="Sign Guestbook"></div> </form> </body>
    'note': TEXT(stored=True),
    'cdrom': TEXT(stored=True),
    'cite': TEXT(stored=True),
    'pages': TEXT(stored=True),
    'volume': TEXT(stored=True),
    'number': TEXT(stored=True),
    'journal': TEXT(stored=True),
    'publisher': TEXT(stored=True),
    'booktitle': TEXT(stored=True),
    'isbn': TEXT(stored=True),
    'series': TEXT(stored=True),
    'school': TEXT(stored=True),
    'type': TEXT(stored=True)
}

schema = Schema(**fields)
indexdir = tempfile.mkdtemp()
ix = create_in(indexdir, schema)
writer = ix.writer()


def add_document(doc):
    # copy every schema attribute the document actually carries
    attrs = {}
    for attrname in fields.keys():
        if hasattr(doc, attrname) and doc.__getattribute__(attrname):
            attrs[attrname] = doc.__getattribute__(attrname)
    writer.add_document(**attrs)


def commit():