def searchModule(qq):
    global results_bm25, results_tfidf, results_tf, results, idx, time_bm25, time_tfidf, time_tf
    results_bm25 = results_tfidf = results_tf = results = []
    time_bm25 = time_tfidf = time_tf = None
    if flag == 1:
        if doStem != "false" and doStop != "false":
            idx = open_dir("Indexed/Index_stsw")
        elif doStem != "false" and doStop == "false":
            idx = open_dir("Indexed/Index_st")
        elif doStem == "false" and doStop != "false":
            idx = open_dir("Indexed/Index_sw")
        else:
            idx = open_dir("Indexed/Index")
    qp = qparser.QueryParser("content", schema=idx.schema)
    q = qp.parse(unicode(qq))  # by default, the parser treats the words as if they were connected by AND
    s = idx.searcher()
    results = s.search(q)
    results.fragmenter.surround = 50

    start = time.time()
    results_bm25 = idx.searcher().search(q)
    time_bm25 = time.time() - start
    results_bm25.fragmenter.surround = 50

    start = time.time()
    results_tfidf = idx.searcher(weighting=scoring.TF_IDF()).search(q)
    time_tfidf = time.time() - start
    results_tfidf.fragmenter.surround = 50

    start = time.time()
    results_tf = idx.searcher(weighting=scoring.Frequency()).search(q)
    time_tf = time.time() - start
    results_tf.fragmenter.surround = 50
def open_index(self, idx_dir):
    global index_error_given
    if os.path.exists(idx_dir):
        try:
            return open_dir(idx_dir)
        except ValueError:
            if not index_error_given:
                index_error_given = True
                g.es_print('bigdash.py: exception in whoosh.open_dir')
                g.es_print('please remove this directory:', g.os_path_normpath(idx_dir))
            return None
            # Doesn't work: open_dir apparently leaves resources open,
            # so shutil.rmtree(idx_dir) fails.
            # g.es_print('re-creating', repr(idx_dir))
            # try:
            #     import shutil
            #     shutil.rmtree(idx_dir)
            #     os.mkdir(idx_dir)
            #     self.create()
            #     return open_dir(idx_dir)
            # except Exception as why:
            #     g.es_print(why)
            #     return None
    else:
        try:
            os.mkdir(idx_dir)
            self.create()
            return open_dir(idx_dir)
        except Exception:
            g.es_exception()
            return None
def __load__(region=None):
    """Load or create the index.

    :param region: index scope; None loads every index, while "news" or "blog" loads that index only
    :return: whether loading succeeded
    """
    # load the index
    if region:
        if region in Indexer.__index__:
            return True
        else:
            if region not in index_dir:
                return False
            if not os.path.exists(index_dir[region]):
                os.makedirs(index_dir[region])
                Indexer.__index__[region] = index.create_in(index_dir[region], schema, indexname=region)
            else:
                Indexer.__index__[region] = index.open_dir(index_dir[region], indexname=region)
            return True
    else:
        # load all indexes
        for reg in index_dir.keys():
            if reg in Indexer.__index__:
                return True
            else:
                if not os.path.exists(index_dir[reg]):
                    os.mkdir(index_dir[reg])
                    Indexer.__index__[reg] = index.create_in(index_dir[reg], schema, indexname=reg)
                else:
                    Indexer.__index__[reg] = index.open_dir(index_dir[reg], indexname=reg)
        return True
def init():
    # Setting my schema ...
    schema_email = Schema(
        path=TEXT(stored=True),
        sender_email=TEXT(stored=True),
        recipient_emails=TEXT,
        date=DATETIME,
        subject=TEXT(stored=True),
        body=TEXT,
    )
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    schemas = {"index_emails": schema_email, "index_book": schema_book}
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    indexes = {}
    for ixname, schema in schemas.items():
        """
        This part could be improved: it only indexes when no index exists yet.
        It does not account for indexed files that have been modified or
        deleted, as explained here:
        @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing
        """
        exists = index.exists_in(index_path, indexname=ixname)
        if not exists:
            ix = index.create_in(index_path, schema, indexname=ixname)
            # Indexing ...
            ix = index.open_dir(index_path, indexname=ixname)
            writer = ix.writer()
            if ixname == "index_emails":
                files = read_dir()
                index_emails(files, writer)
            elif ixname == "index_book":
                index_book(writer)
        else:
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix

    # Main routine
    while True:
        ix = indexes.get("index_emails")
        with ix.searcher() as searcher:
            input_user = str(raw_input("Introduzca una palabra del asunto o cuerpo (p.e. contrato): "))
            mparser = MultifieldParser(["subject", "body"], schema=ix.schema)
            myquery = mparser.parse(unicode(input_user))
            results = searcher.search(myquery)
            print "=================================================="
            for result in results:
                # read_file(result.get("path"))
                print ("Remitente: " + findNameBySender(indexes, result.get("sender_email")))
                print ("Asunto: " + result.get("subject"))
                print "=================================================="
def getIndex(self):
    module_dir = self.indexdir + "/" + self.sword_module_name
    if os.path.exists(module_dir):
        ix = index.open_dir(module_dir)
    else:
        indexer = Indexer(self.sword_module_name, self.Reader)
        indexer.buildIndex()
        ix = index.open_dir(module_dir)
    return ix
def init():
    # Setting my schema ...
    schema_email = Schema(path=TEXT(stored=True),
                          sender_email=TEXT(stored=True),
                          recipient_emails=TEXT(stored=True),
                          date=DATETIME,
                          subject=TEXT(stored=True),
                          body=TEXT)
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    schemas = {"index_emails": schema_email, "index_book": schema_book}
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    indexes = {}
    for ixname, schema in schemas.items():
        '''
        This part could be improved: it only indexes when no index exists yet.
        It does not account for indexed files that have been modified or
        deleted, as explained here:
        @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing
        '''
        exists = index.exists_in(index_path, indexname=ixname)
        if not exists:
            ix = index.create_in(index_path, schema, indexname=ixname)
            # Indexing ...
            ix = index.open_dir(index_path, indexname=ixname)
            writer = ix.writer()
            if ixname == "index_emails":
                files = read_dir()
                index_emails(files, writer)
            elif ixname == "index_book":
                index_book(writer)
        else:
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix

    # Main routine
    while True:
        options = ['a', 'b', 'c']
        input_user = str(raw_input("Elija el apartado que desea ejecutar (A, B o C): "))
        input_user_lower = string.lower(input_user)
        if input_user_lower in options:
            def apartadoA():
                input_user = raw_input("Introduzca una palabra del asunto o cuerpo (p.e. contrato): ")
                findMailByWordInSubjectOrBody(indexes, input_user)

            def apartadoB():
                input_user = raw_input("Buscar emails posteriores a la fecha (con formato YYYYMMDD): ")
                findMailbyDate(indexes, input_user)

            def apartadoC():
                input_user = raw_input("Introduzca palabras spam (p.e. 'Contrato Gracias compraventa'): ")
                findMailBySpamWords(indexes, input_user)

            options = {'a': apartadoA, 'b': apartadoB, 'c': apartadoC}
            options[input_user_lower]()
        else:
            print "Opción no válida."
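# The docstring in the two init() variants above points at Whoosh's
# incremental-indexing recipe but leaves it unimplemented. Below is a
# minimal, hedged sketch of that mtime-based pattern -- NOT the original
# authors' code. It assumes a schema with path=ID(unique=True, stored=True)
# and time=STORED, which the snippets above do not actually have.
import os


def incremental_index(ix, all_paths, add_doc):
    """Reindex only new/changed files and drop deleted ones.

    ix        -- an open whoosh index (schema assumed as described above)
    all_paths -- iterable of the paths that currently exist on disk
    add_doc   -- callback that writes one document: add_doc(writer, path)
    """
    indexed_paths = set()
    to_reindex = set()
    with ix.searcher() as searcher:
        writer = ix.writer()
        for fields in searcher.all_stored_fields():
            path = fields['path']
            indexed_paths.add(path)
            if not os.path.exists(path):
                # file was deleted since the last run
                writer.delete_by_term('path', path)
            elif os.path.getmtime(path) > fields['time']:
                # file changed since the last run: delete now, re-add below
                writer.delete_by_term('path', path)
                to_reindex.add(path)
        for path in all_paths:
            if path in to_reindex or path not in indexed_paths:
                add_doc(writer, path)
        writer.commit()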
def update(self, tmp=False):
    """
    Make sure index reflects current backend state, add missing stuff,
    remove outdated stuff.

    This is intended to be used:
    * after a full rebuild that was done at tmp location
    * after wiki is made read-only or taken offline
    * after the index was moved to the normal index location

    Reason: new revisions that were created after the rebuild started
    might be missing in new index.

    :returns: index changed (bool)
    """
    index_dir = self.index_dir_tmp if tmp else self.index_dir
    index_all = open_dir(index_dir, indexname=ALL_REVS)
    try:
        # NOTE: self.backend iterator gives (mountpoint, revid) tuples, which is NOT
        # the same as (name, revid), thus we do the set operations just on the revids.
        # first update ALL_REVS index:
        revids_mountpoints = dict((revid, mountpoint) for mountpoint, revid in self.backend)
        backend_revids = set(revids_mountpoints)
        with index_all.searcher() as searcher:
            ix_revids_names = dict((doc[REVID], doc[NAME]) for doc in searcher.all_stored_fields())
        revids_mountpoints.update(ix_revids_names)  # this is needed for stuff that was deleted from storage
        ix_revids = set(ix_revids_names)
        add_revids = backend_revids - ix_revids
        del_revids = ix_revids - backend_revids
        changed = add_revids or del_revids
        add_revids = [(revids_mountpoints[revid], revid) for revid in add_revids]
        del_revids = [(revids_mountpoints[revid], revid) for revid in del_revids]
        self._modify_index(index_all, self.schemas[ALL_REVS], self.wikiname, add_revids, 'add')
        self._modify_index(index_all, self.schemas[ALL_REVS], self.wikiname, del_revids, 'delete')

        backend_latest_names_revids = set(self._find_latest_names_revids(index_all))
    finally:
        index_all.close()

    index_latest = open_dir(index_dir, indexname=LATEST_REVS)
    try:
        # now update LATEST_REVS index:
        with index_latest.searcher() as searcher:
            ix_revids = set(doc[REVID] for doc in searcher.all_stored_fields())
        backend_latest_revids = set(revid for name, revid in backend_latest_names_revids)
        upd_revids = backend_latest_revids - ix_revids
        upd_revids = [(revids_mountpoints[revid], revid) for revid in upd_revids]
        self._modify_index(index_latest, self.schemas[LATEST_REVS], self.wikiname, upd_revids, 'update')
        self._modify_index(index_latest, self.schemas[LATEST_REVS], self.wikiname, del_revids, 'delete')
    finally:
        index_latest.close()
    return changed
def _init_index(self, reset=False):
    index_path = os.path.join(jupyter_data_dir(), "index")

    # clear out old index if requested
    if reset:
        shutil.rmtree(index_path, True)

    # make sure there's a path to store the index data
    if not os.path.exists(index_path):
        os.makedirs(index_path)

    if not exists_in(index_path):
        # create an index with the current schema
        analyzer = ChineseAnalyzer()
        schema = Schema(
            basename=TEXT(stored=True, field_boost=5.0, analyzer=analyzer),
            dirname=ID(stored=True, analyzer=analyzer),
            path=ID(stored=True, unique=True, analyzer=analyzer),
            content=TEXT(stored=False, analyzer=analyzer),
            time=STORED,
        )
        self.ix = create_in(index_path, schema)
    else:
        # open the existing index
        self.ix = open_dir(index_path)

    # build a query parser based on the current schema
    self.query_parser = MultifieldParser(["content", "basename", "dirname"], self.ix.schema)
def get(self):
    wikiResults = None
    jobResults = None
    projectResults = None
    if 'searchScope' in request.args and 'searchTerm' in request.args:
        searchTerm = request.args.get('searchTerm')
        searchScope = request.args.get('searchScope')
        index = open_dir('app/search/index')
        parser = QueryParser("content", schema=index.schema)
        with index.searcher() as searcher:
            if searchScope in ['everything', 'wiki']:
                wikiResults = [{'title': result['title'],
                                'url': 'http://jhcwiki.jhc.co.uk/wiki/index.php/' + result['title'].replace(' ', '_')}
                               for result in searcher.search(parser.parse(searchTerm), limit=200)
                               if result['type'] == 'WIKI']
            if searchScope in ['everything', 'jobs']:
                jobResults = [{'title': result['title'], 'url': ''}
                              for result in searcher.search(parser.parse(searchTerm), limit=200)
                              if result['type'] == 'JOB']
            if searchScope in ['everything', 'projects']:
                projectResults = [{'title': result['title'],
                                   'url': url_for('projects.projectDetail',
                                                  projectCode=result['title'].split('-')[0].strip())}
                                  for result in searcher.search(parser.parse(searchTerm), limit=200)
                                  if result['type'] == 'PROJECT']
    else:
        searchTerm = ''
        searchScope = 'everything'
    return render_template('search/search.html', wikiResults=wikiResults, jobResults=jobResults,
                           projectResults=projectResults, searchTerm=searchTerm,
                           searchScope=searchScope, title="Search")
def add_batch(posts):
    ix = index.open_dir(settings.WHOOSH_INDEX)
    wr = ix.writer(limitmb=50)
    for post in posts:
        update(post, handler=wr)
    wr.commit()
    ix.close()
def update_index(self, offering):
    """
    Update the document of a concrete offering in the search index
    """
    if not os.path.exists(self._index_path) or os.listdir(self._index_path) == []:
        raise Exception('The index does not exist')

    index = open_dir(self._index_path)
    index_writer = index.writer()

    text = self._aggregate_text(offering)
    purchasers_text = self._aggregate_purchasers(offering)

    in_date = None
    if offering.state == 'uploaded':
        in_date = offering.creation_date
    else:
        in_date = offering.publication_date

    # Update the document
    index_writer.update_document(
        id=unicode(offering.pk),
        owner=unicode(offering.owner_organization.pk),
        content=unicode(text),
        name=unicode(offering.name),
        popularity=Decimal(offering.rating),
        date=in_date,
        state=unicode(offering.state),
        purchaser=purchasers_text
    )
    index_writer.commit()
def batch_query(querypath, indexpath):
    ix = index.open_dir(indexpath)
    with open(querypath, 'r') as fh:
        with open('output.txt', 'w') as out_file:
            rawdata = fh.read()
            document = get_section2(rawdata, "<top>", "</top>")
            for d in document:
                topicnum = get_section(d, "<num> Number:", "<title>")[0].strip(" ").strip("\n")
                title = get_section(d, "<title>", "<desc>")[0].strip(" ").strip("\n")
                desc = get_section(d, "<desc>", "<narr>")[0].replace("Description:", "").strip(" ")
                narr = get_section(d, "<narr>", "</top>")[0].replace("Narrative:", "").strip(" ")
                print topicnum, title, desc, narr
                with ix.searcher() as searcher:
                    parser = qparser.QueryParser("content", schema=ix.schema, group=qparser.OrGroup)
                    query = parser.parse(desc + " " + title)
                    # query = QueryParser("content", ix.schema)
                    results = searcher.search(query, limit=1000)
                    print results[0]
                    print results[1]
                    # return
                    for i in range(1000):
                        out_file.write("%s\tQ0\t%s\t%s\t%s\t jack\n" %
                                       (topicnum, results[i].values()[1], results[i].rank + 1, results[i].score))
def run(self):
    # open index
    self.buffer = deque(maxlen=BUFFERLINES)
    if not exists(self.indexdir):
        makedirs(self.indexdir)
        self.ix = create_in(self.indexdir, SCHEMA)
    else:
        if exists_in(self.indexdir):
            self.ix = open_dir(self.indexdir)
        else:
            self.ix = create_in(self.indexdir, SCHEMA)
    self.qp = QueryParser("content", self.ix.schema)
    self.searcher = self.ix.searcher()

    index_p = self.index_p
    while True:
        try:
            # check index_p
            try:
                type, data = index_p.recv()
            except EOFError:
                break
            try:
                if type == QUERY:
                    self._processSearch(data)
                elif type == LOG:
                    self._processLog(data)
                elif type == RENAME:
                    self._processRename(data)
                else:
                    prnt("Unexpected data in logindexsearch.")
            except:
                print_exc()
                prnt("EXCEPTION in logindexsearch process.")
        except KeyboardInterrupt:
            break

    self._dumpBuffer(self.buffer)
    self.searcher.close()
    self.ix.close()
def perform_category_query(query_words):
    """
    Perform a query using the supplied words on the product category index
    """
    indexdir = settings.WHOOSH_INDEX_DIR
    ix = open_dir(indexdir)
    query = QueryParser("sub_category", ix.schema).parse(query_words)
    categories = {}
    suggestion = None
    with ix.searcher() as searcher:
        results = searcher.search(query)
        for result in results:
            if result['category'] not in categories:
                categories[result['category']] = [result['sub_category']]
            else:
                categories[result['category']] += [result['sub_category']]
        corrected = searcher.correct_query(query, query_words)
        if hasattr(corrected.query, 'text') and corrected.query.text != query_words:
            suggestion = corrected.string
    return categories, suggestion
def query(indexpath):
    ix = index.open_dir(indexpath)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("test")
        results = searcher.search(query)
        print results[0]
def __init__(self, idx_dir):
    self.idx_dir = idx_dir
    if not os.path.exists(idx_dir):
        os.mkdir(idx_dir)
        self.create()
    else:
        self.ix = open_dir(idx_dir)
def indexer(self, create=True):
    schema = self.bench.spec.whoosh_schema()
    path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)
    if not os.path.exists(path):
        os.mkdir(path)
    if create:
        ix = index.create_in(path, schema)
    else:
        ix = index.open_dir(path)

    poolclass = None
    if self.options.pool:
        poolclass = find_object(self.options.pool)
    kwargs = dict(limitmb=int(self.options.limitmb), poolclass=poolclass,
                  dir=self.options.tempdir, procs=int(self.options.procs),
                  batchsize=int(self.options.batch))

    if self.options.expw:
        from whoosh.filedb.multiproc import MultiSegmentWriter
        self.writer = MultiSegmentWriter(ix, **kwargs)
    else:
        self.writer = ix.writer(**kwargs)

    self._procdoc = None
    if hasattr(self.bench.spec, "process_document_whoosh"):
        self._procdoc = self.bench.spec.process_document_whoosh
def __init__(self, **kwargs):
    super(WhooshEngine, self).__init__()
    analyzer = (
        StemmingAnalyzer()
        | CharsetFilter(accent_map)
        | NgramFilter(minsize=4, maxsize=10)
    )
    self.schema = Schema(
        id=ID(stored=True),
        title=TEXT(stored=True, field_boost=5.0, analyzer=analyzer),
        firstname=TEXT(stored=True, field_boost=2.0, analyzer=analyzer),
        lastname=TEXT(stored=True, field_boost=2.0, analyzer=analyzer),
        type=ID(stored=True),
        description=TEXT(stored=True, analyzer=analyzer),
        creators=TEXT(stored=False, analyzer=analyzer),
        tags=TEXT(stored=False, analyzer=analyzer),
        business_unit=TEXT(stored=False, analyzer=analyzer),
        position=TEXT(stored=False, analyzer=analyzer),
        competencies=TEXT(stored=False, analyzer=analyzer),
        text=TEXT(stored=True, analyzer=analyzer))
    self.dir = kwargs['dir']
    if not os.path.exists(self.dir):
        os.makedirs(self.dir)
    try:
        self._index = open_dir(self.dir)
    except EmptyIndexError:
        self._index = create_in(self.dir, self.schema)
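# A hedged illustration of what the chained analyzer above emits for a
# single token: stem first, fold accents ("é" -> "e") via CharsetFilter,
# then emit 4..10-character n-grams, so accented and unaccented partial
# matches land on the same index terms. The exact stems and n-grams
# depend on the installed Whoosh version; treat the output as indicative.
from whoosh.analysis import StemmingAnalyzer, CharsetFilter, NgramFilter
from whoosh.support.charset import accent_map

analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(minsize=4, maxsize=10)
print([t.text for t in analyzer(u"référencement")])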
def home(request):
    template = 'main_content.html'
    ix = index.open_dir(settings.WHOOSH_INDEX)
    hits = []
    newsfound = []
    query = request.GET.get('q', None)
    if query is not None and query != u"":
        # Whoosh doesn't understand '+' or '-' but we can replace
        # them with 'AND' and 'NOT'.
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        parser = QueryParser("content", schema=ix.schema, group=OrGroup)
        try:
            qry = parser.parse(query)
        except:
            # don't show the user weird errors only because we don't
            # understand the query.
            # parser.parse("") would return None
            qry = None
        if qry is not None:
            searcher = ix.searcher()
            hits = searcher.search(qry)
            for h in hits:
                if News.objects.filter(pk=int(h["id"])).exists():
                    newsfound.append(News.objects.get(pk=int(h["id"])))
    return render(request, template, {'query': query, 'hits': newsfound})
def __init__(self, parent=None):
    super(MikiEdit, self).__init__(parent)
    self.parent = parent
    self.settings = parent.settings
    self.setFontPointSize(12)
    self.setVisible(False)
    self.ix = open_dir(self.settings.indexdir)

    # Spell checker support
    try:
        import enchant
        enchant.Dict()
        self.speller = enchant.Dict()
    except ImportError:
        print("Spell checking unavailable. Need to install pyenchant.")
        self.speller = None
    except enchant.errors.DictNotFoundError:
        print("Spell checking unavailable. Need to install dictionary (e.g. aspell-en).")
        self.speller = None

    self.imageFilter = ""
    self.documentFilter = ""
    for ext in self.settings.attachmentImage:
        self.imageFilter += " *" + ext
    for ext in self.settings.attachmentDocument:
        self.documentFilter += " *" + ext
    self.imageFilter = "Image (" + self.imageFilter.strip() + ")"
    self.documentFilter = "Document (" + self.documentFilter.strip() + ")"
    self.downloadAs = ""

    self.networkManager = QNetworkAccessManager()
    self.networkManager.finished.connect(self.downloadFinished)
def _get_index(index_path, schema):
    if index.exists_in(index_path):
        return index.open_dir(index_path)
    else:
        if not os.path.exists(index_path):
            os.mkdir(index_path)
        return index.create_in(index_path, schema)
def search(self, ix=None):
    if ix is None:
        ix = open_dir(self.dir_name)
    self.searcher = ix.searcher()
    fields = []
    qs = ''

    # We use parentheses to prevent operators like OR used in source
    # from affecting target
    if self.source is not None and len(self.source) > 0:
        qs += u' source:({0})'.format(self.source)
        fields.append("source")

    if self.target is not None and len(self.target) > 0:
        qs += u' target:({0})'.format(self.target)
        fields.append("target")

    if self.project is not None and self.project != 'tots' and len(self.project) > 0:
        if self.project == 'softcatala':
            qs += u' softcatala:true'
            fields.append("softcatala")
        else:
            if ',' in self.project:
                projects = self.project.split(',')
                val = ''.join(["'{0}',".format(project) for project in projects])
                val = val[:-1].replace(',', ' OR ')
                qs += u' project:({0})'.format(val)
            else:
                qs += u' project:(\'{0}\')'.format(self.project)
            fields.append("project")

    self.query = MultifieldParser(fields, ix.schema).parse(qs)
def search_toc(words, index_path):
    words = _strip_diacritics(words)
    words = " ".join(words.split())  # remove multiple spaces (the join result must be kept)
    ix = open_dir(index_path, indexname="toc-index")
    parser = qparser.QueryParser("title", ix.schema, group=qparser.AndGroup)
    words = suffix_query(words)  # prefix and suffix
    query = parser.parse(words)
    final_result = []
    with ix.searcher() as searcher:
        results = searcher.search(query)  # int(page_number), pagelen=10
        total_count = len(results)
        for hit in results:
            title = hit["title"]
            parent_title = hit["parent_title"]
            full_title = shorten(title, 10)
            if len(parent_title) > 0:
                full_title = shorten(parent_title, 10) + " : " + shorten(title, 10)
            final_result.append(
                {
                    "title": full_title,
                    "serial": hit["serial"].encode("utf-8"),
                    # "None" in case it is not a leaf
                    "topicid": hit["topicid"].encode("utf-8"),
                }
            )
    return json.dumps({"count": total_count, "result": final_result}, ensure_ascii=False)
def search_index(search_model, query, fields=[], limit=None):
    ix = index.open_dir(search_model.get_path())
    fields = fields or search_model.fields
    hits = []
    query = smart_unicode(query)
    limit = limit or getattr(settings, 'DJOOSH_SEARCH_LIMIT', 100)
    if query and fields:
        query = query.replace('+', ' AND ').replace('|', ' OR ')
        parser = qparser.MultifieldParser(fields, schema=ix.schema)
        # parse only inside try/except; an unguarded parse here would
        # raise before the fallback to None could ever run
        try:
            qry = parser.parse(query)
        except:
            qry = None
        if qry:
            searcher = ix.searcher()
            try:
                hits = searcher.search(qry, limit=limit)
            except:
                hits = []
    ix.close()
    return hits
def __init__(self, db_path):
    ensuredir(db_path)
    if index.exists_in(db_path):
        self.index = index.open_dir(db_path)
    else:
        self.index = index.create_in(db_path, schema=self.schema)
    self.qparser = QueryParser('text', self.schema)
def update_index(search_model, obj=None, created=True):
    ixpath = search_model.get_path()
    ix = index.open_dir(ixpath)
    if obj:
        objects = [obj]
    else:
        objects = search_model.model.objects.all()
    writer = ix.writer()
    for obj in objects:
        fields = {}
        for field in search_model.fields:
            fields[field] = smart_unicode(getattr(obj, field, ''))
        if created:
            try:
                writer.update_document(**fields)
            except:
                pass
        else:
            try:
                writer.add_document(**fields)
            except:
                pass
    writer.commit()
    ix.close()
def init(self):
    """Index and writer object initialization."""
    target = os.path.dirname(__file__) + "/../data/indexcopy"
    self.index = open_dir(target)
    self.writer = self.index.writer()
def __init__(self):
    self.directory = os.path.join(edocuments.root_folder, '.index')
    self.dirty = False
    schema = Schema(**{
        PATH: ID(stored=True, unique=True),
        CONTENT: TEXT(stored=True),
        DATE: STORED,
        DIRECTORY: STORED,
        MD5: TEXT(stored=True),
    })
    self.parser_path = QueryParser("path_id", schema)
    self.parser_content = QueryParser("content", schema)
    if not exists_in(self.directory):
        os.makedirs(self.directory)
        self.index = create_in(self.directory, schema)
    else:
        self.index = open_dir(self.directory)

    if 'path' in self.index.schema.names():
        with self.index.writer() as writer:
            writer.remove_field('path')
    if 'directory' not in self.index.schema.names():
        with self.index.writer() as writer:
            writer.add_field('directory', STORED)
    if 'md5' not in self.index.schema.names():
        with self.index.writer() as writer:
            writer.add_field('md5', TEXT(stored=True))

    print(
        'Field length:\npath: %i\ncontent: %i\nmd5: %i' % (
            self.index.field_length("path_id"),
            self.index.field_length("content"),
            self.index.field_length("md5"),
        )
    )
def _get_index(cli_ctx):
    from whoosh import index

    # create index if it does not exist already
    if not os.path.exists(INDEX_PATH):
        _create_index(cli_ctx)
    return index.open_dir(INDEX_PATH)
def delete_from_index(oblist):
    ix = index.open_dir(app.config['SEARCH_INDEX_PATH'])
    writer = ix.writer()
    for item in oblist:
        mapping = item.search_mapping()
        writer.delete_by_term('idref', mapping['idref'])
    writer.commit()
    return Schema(related_vars=TEXT(stored=True),
                  name=NGRAMWORDS(stored=True, minsize=3, maxsize=12, at='start', queryor=True),
                  description=TEXT(stored=True),
                  section=TEXT(stored=True),
                  section_title=TEXT(stored=True),
                  related_attrs=TEXT(stored=True),
                  params=TEXT(stored=True))


if __name__ == '__main__':
    print("Building index...")
    if not os.path.exists(VAR_INDEX_DIR):
        os.mkdir(VAR_INDEX_DIR)
        ix = index.create_in(VAR_INDEX_DIR, get_schema())

    print("Creating variables index...")
    ix = index.open_dir(VAR_INDEX_DIR)
    writer = ix.writer()

    all_vars = [
        [u'adult_obesity,diabetes', u'obesity', u'Obesity Prevalence,Diabetes Prevalence', u'conditions_diseases', u'Healthcare', u'geo', None],
        [u'adult_obesity,diabetes', u'diabetes', u'Obesity Prevalence,Diabetes Prevalence', u'conditions_diseases', u'Healthcare', u'geo', None],
        [u'adult_obesity,diabetes', u'healthcare', u'Obesity Prevalence,Diabetes Prevalence', u'conditions_diseases', u'Healthcare', u'geo', None],
        [u'motor_vehicle_crash_deaths', u'car crashes', u'Motor Vehicle Crash Deaths', u'risky', u'Crime', u'geo', None],
        [u'motor_vehicle_crash_deaths', u'accidents', u'Motor Vehicle Crash Deaths', u'risky', u'Crime', u'geo', None],
        [u'adult_smoking', u'smokers', u'Adult Smoking Prevalence', u'risky', u'Healthcare', u'geo', None],
        [u'adult_smoking', u'cigarettes', u'Adult Smoking Prevalence', u'risky', u'Healthcare', u'geo', None],
        # [u'infant_mortality', u'infant mortality', u'Infant mortality', u'health', u'geo'],
        # [u'teen_births', u'teen births', u'Teen births', u'health', u'geo'],
        [u'mean_commute_minutes', u'commuters', u'Average Travel Time', u'commute_time', u'Transportation', u'geo', None],
import whoosh.index as index
from whoosh import columns, fields, index, sorting
from whoosh.qparser import QueryParser

# ix = index.open_dir("./")
# facet = sorting.FieldFacet("id", reverse=True)
# searcher = ix.searcher()
#
# searchwords = "新西兰"
# qp = QueryParser("gtitle", schema=ix.schema)
# q = qp.parse(searchwords)
# results = searcher.search(q, sortedby=facet)
# for each in results:
#     print(each)

from whoosh.qparser import QueryParser
from whoosh.index import open_dir
from whoosh.sorting import FieldFacet

new_list = []
index = open_dir("./index/", indexname='goods')  # open the prebuilt index
with index.searcher() as searcher:
    parser = QueryParser("gtitle", index.schema)  # the field to search, e.g. a product title
    myquery = parser.parse("鸭蛋")
    facet = FieldFacet("id", reverse=True)  # sort the search results by id, descending
    results = searcher.search(
        myquery, limit=None, sortedby=facet)  # limit caps the result count (default 10); see the official docs
    for result1 in results:
        print(dict(result1))
        new_list.append(dict(result1))
# coding=utf-8
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser

idx_dir = 'lagou_idx'
ix = open_dir(idx_dir)
searcher = ix.searcher()

parser = MultifieldParser(["name", "desc"], schema=ix.schema)
# Single field parser.
k = u'搜索 OR Python city:上海'
q = parser.parse(k)

results = searcher.search_page(q, 1, pagelen=5)
print(u'{0} results found for keyword {1}, {2} returned: '.format(
    len(results), k, results.scored_length()))
for hit in results[:50]:
    print(hit['id'])
    print(hit['name'])
    # print(hit['city'])
    print(hit['com_name'])
    print('************')
#!/usr/bin/env python
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("index_dir")

with ix.searcher() as searcher:
    text = input("Dime:")
    while len(text) > 0:
        query = QueryParser("content", ix.schema).parse(text)
        results = searcher.search(query)
        for r in results:
            print(r)
        # print(dir(results))
        # print(results.docs)
        text = input("Dime más:")
def search_tweets(search_term, limit=5, restrict_to_user=None):
    index = open_dir("indexdir")
    if restrict_to_user is not None:
        restrict_to_user = QueryParser('user', index.schema).parse(restrict_to_user)
    return _do_search(index, search_term, limit, restrict_to_user)
query = ["t1"] # query is given as a list of terms class MyBM25Scorer(MyBaseScorer): def __init__(self, index_dir): MyBaseScorer.__init__(self, index_dir) def score_term(self, field, t, ftq, qlen, ftd, doclen): # TODO return 0 if __name__ == "__main__": # Self scorer print "Our ranking:" scorer = MyBM25Scorer(index_dir) scorer.score_all(query, field) scorer.close() # Whoosh scorer print "Whoosh ranking:" ix = index.open_dir(index_dir) with ix.searcher(weighting=scoring.BM25F()) as searcher: qp = qparser.QueryParser(field, schema=ix.schema) q = qp.parse(" ".join(query)) # we contatenate query terms results = searcher.search(q) for r in results: print r['id'], str(r.score)[0:6] ix.close()
def index(self):
    self.indexer = open_dir("index")
    self.review_indexer = open_dir("reviews_index")
import jieba
import random
from whoosh.query import Term
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.index import open_dir
import requests
from flask import Flask, jsonify, request
from config import CONFIG

app = Flask(__name__)

# load the index files
IDX = open_dir(dirname=CONFIG.get('IR_DIR'))


def search(context='你喜欢看什么电影?', topic='电影', method='sampling', limit=10, data=None, rtype='test'):
    """Retrieval function.

    :param context: dialogue history
    :param topic: dialogue topic
    :param method: how dialogue pairs are constructed
    :param limit: number of results to return
    """
    with IDX.searcher() as searcher:
indexpath = Configuration.getIndexdir()

from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID

schema = Schema(title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT)

if not os.path.exists(indexpath):
    os.mkdir(indexpath)
if not exists_in(indexpath):
    ix = create_in(indexpath, schema)
else:
    ix = open_dir(indexpath)


def dumpallcveid(entry=None):
    cveid = []
    if entry is None:
        for x in collection.find({}).sort('_id', 1):
            cveid.append(x['id'])
    else:
        for x in collection.find({}).sort("Modified", -1).limit(int(entry)):
            cveid.append(x['id'])
    return cveid


def getcve(cveid=None):
    if cveid is None:
def ChineseAnalyzer():
    return ChineseTokenizer()


analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("index"):
    os.mkdir("index")
ix = index.create_in("index", schema)
ix = index.open_dir("index")

writer = ix.writer()
for pid in xrange(len(pid_p_r)):
    writer.add_document(title=str(pid).decode("utf-8"),
                        path=u"/" + str(pid).decode("utf-8"),
                        content=pid_p_r[pid][0].decode("utf-8"))
writer.commit()


def find(text):
    og = qparser.OrGroup.factory(0.9)
    parser = qparser.QueryParser("content", schema, group=og)
    with ix.searcher() as searcher:
def SearchConcept(Concept, Parser):
    s = myindex.searcher()
    Found = []
    for V in Variants(Concept):
        q = Parser.parse(V)
        Found += [
            str(x)[:-3].split('wiki')[-1].split("'")[0]
            for x in list(s.search(q, limit=None))
        ]
    return Found


Concepts = readtxt('DutchDataUMLS/Concepts_UMLS.txt')[1:153862]
myindex = index.open_dir("WikiIndex")
qp = QueryParser("content", schema=myindex.schema)

FoundConcepts = []
c = 1
OutFile = open('FoundConcepts.txt', 'a')
for C in Concepts:
    print(str(c) + " of " + str(len(Concepts)) + " processed.\n")
    Found = SearchConcept(C[1], qp)
    if Found != []:
        FoundConcepts.append((C[0], C[1], Found))
        OutFile.write(C[0] + '\t' + C[1] + '\t' + str(Found) + '\n')
    c += 1
OutFile.close()
writer = ix.writer()
for fn in traverseFile("shegongku_db"):
    with open(fn, 'r', encoding='utf-8') as f:
        print(fn, "...")
        lines = 0
        while True:
            line1 = f.readline()
            if line1:
                writer.add_document(full_line=line1)
                lines += 1
            else:
                break
        print(fn, lines, "added")
writer.commit()
print("index finished")
# everything above builds the index

index1 = open_dir("shegongku_idx", indexname='allin1line')
parser1 = QueryParser("full_line", index1.schema)
while True:
    with index1.searcher() as searcher:
        print("pls input what u want to search:")
        key = input()
        myquery = parser1.parse(key)
        resultss = searcher.search(myquery, limit=2000)
        # print(type(resultss))
        for result1 in resultss:
            d1 = dict(result1)['full_line']
            print(d1)
        lastTitle = child
        textCur = []
    addDocument(f, section, lastTitle, textCur)
    writer.commit()

#---------------------------------------------------------------------------------------------

if __name__ == '__main__':
    BuildHelpIndex()

    from whoosh.qparser import QueryParser
    ix = open_dir(indexDir, readonly=True)
    with ix.searcher() as searcher, open('search.html', 'w') as f:
        query = QueryParser('content', ix.schema).parse(u'fastest lap')
        results = searcher.search(query, limit=20)
        f.write('<table><tr><th></th><th align="left">Section</th><th align="left">Match</th></tr>\n')
        for i, hit in enumerate(results):
            f.write('<tr><td align="left">%d.</td><td><a href="%s">%s</a></td><td>%s</td></tr>\n' %
                    ((i + 1), hit['path'], hit['section'], hit.highlights('content')))
        f.write('</table>\n')
    ix.close()
def __init__(self):
    self.ix = index.open_dir("indexdir")
        text = '\n'.join(chunk for chunk in chunks if chunk)
        text = text.replace('\n', ' ').lower()
        hash_text = hashlib.md5(text.encode())
        return (text, hash_text.hexdigest(), ' ', str(soup.title.string), html['url'])
    if len(keys) == 3:
        text = html['body']
        hash_text = hashlib.md5(text.encode())
        return (text, hash_text.hexdigest(), html['tags'], ' ', html['url'])
    if len(keys) == 4:
        return (html['body'], html['hash'], html['tags'], ' ', html['url'])
    return (html['body'], html['hash'], html['tags'], html['title'], html['url'])


ix = index.open_dir("mesh_index")

# qp = QueryParser("body", schema=ix.schema)
# query = "Cell"
# q = qp.parse(query)

searcher = ix.searcher().documents()
writer = ix.writer()
i = 0
for doc in searcher:
    text, hash, tags, title, url = parse_html(doc)
    writer.update_document(url=url, body=text, tags=tags, hash=hash, title=title)
    print(str(i + 1) + ' completed.')
    i += 1
url=u"https://media.giphy.com/media/czwo5mMtaknhC/giphy.gif", tags=u"done finally free react") writer.add_document( url=u"https://media.giphy.com/media/qnOBmH70CGSVa/giphy.gif", tags=u"clap welldone goodjob applaud") writer.commit() def search_query(ix, q): with ix.searcher() as searcher: query = QueryParser("tags", ix.schema).parse(q) results = searcher.search(query) file_path = [(r["url"], i) for i, r in enumerate(results)] #print(file_path) if len(file_path) > 1: file_path = [random.choice(file_path)] #return [(r["path"],i) for i,r in enumerate(results)] return (file_path) if __name__ == '__main__': s = make_index() ix = open_dir("index") add_docs(ix) result = search_query(ix, 'react') print(result) #print(ix.schema) #results = ix.searcher().search(Every('tags')) #for result in results: # print (result['url'])
def get_item_count(dirs):
    ix = index.open_dir(os.path.join(baseindexpath, dirs))
    return ix.doc_count_all()
# -*- coding: UTF-8 -*-
import sys
import os

sys.path.append("../")

from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()

schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("tmp"):
    os.mkdir("tmp")

ix = open_dir("tmp")
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)

for keyword in (u"水果小姐", u"你", u"first", u"中文", u"交换机", u"交换", u"少林", u"乔峰"):
    print "result of ", keyword
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:
        print hit.highlights("content")
    print "=" * 10
def phone_search(request):
    if request.method == "POST":
        print(request.body)
        data = json.loads(request.body)
        sessionId = data.get("sessionId")
        print(sessionId)
        decision_arr = securer(sessionId, request.headers["user_agent"])
        if decision_arr[0] == False:
            return JsonResponse(decision_arr[1])
        else:
            phone_name = data.get("phone_name")
            if phone_name == None:
                return JsonResponse({"error": "no phone_name"})
            ix = index.open_dir("index")
            qp = QueryParser("phone_name", schema=ix.schema)
            q = qp.parse(phone_name)
            retrieved_phones_ = []
            with ix.searcher() as s:
                retrieved_phones = s.search(q)
                if len(retrieved_phones) != 0:
                    for retrieved_phone in retrieved_phones:
                        retrieved_phones_.append(retrieved_phone["phone_name"])
            retrieved_phones__ = []
            q = qp.parse(' '.join(phone_name.split(" ")[1:]))
            with ix.searcher() as s:
                retrieved_phones = s.search(q)
                if len(retrieved_phones) != 0:
                    for retrieved_phone in retrieved_phones:
                        retrieved_phones__.append(retrieved_phone["phone_name"])
            retrieved_phones = []
            total_phones = retrieved_phones__ + retrieved_phones_
            print(retrieved_phones_)
            print(retrieved_phones__)
            for phone_ in total_phones:
                if phone_ not in retrieved_phones:
                    retrieved_phones.append(phone_)
            print(len(retrieved_phones))

            # special case for iPhone X: drop iPhone 8/7/6 hits.
            # Note: the original condition read `elif "7" or "iPhone\xa07" in ...`,
            # which is always true; the membership test must apply to each
            # literal, and we iterate over a copy since we mutate the list.
            if phone_name == "Apple iPhone X":
                for phone_ in list(retrieved_phones):
                    if "8" in phone_.split(" "):
                        retrieved_phones.pop(retrieved_phones.index(phone_))
                    elif "7" in phone_.split(" ") or "iPhone\xa07" in phone_.split(" "):
                        retrieved_phones.pop(retrieved_phones.index(phone_))
                    elif "6" in phone_.split(" "):
                        retrieved_phones.pop(retrieved_phones.index(phone_))
            retrieved_phones = retrieved_phones[:3]

            phones_json = [{"image": phone_name.lower().replace(" ", "_") + ".jpg"}, [], {}]
            for i, phone in enumerate(retrieved_phones):
                phones = ScrapedSmartphone.objects.filter(data__name=phone)
                for search_phone in phones:
                    not_there = True
                    for phone_json in phones_json[1]:
                        if phone_json["url"] == search_phone.data["url"]:
                            not_there = False
                    if not_there == True:
                        phones_json[1].append(search_phone.data)
                        try:
                            vendor_index = vendor.index(search_phone.data["vendor"])
                            phones_json[1][-1]["brandLogoUrl"] = "http://localhost:8001/brand_images/" + vendor_images[vendor_index]
                        except ValueError:
                            phones_json[1][-1]["brandLogoUrl"] = "http://localhost:8001/brand_images/daraz.png"
                        if len(phones_json[1][-1]["name"]) > 32:
                            phones_json[1][-1]["name"] = phones_json[1][-1]["name"][:33] + "..."
                        phones_json[1][-1]["price"] = int(float(phones_json[1][-1]["price"]))

            original = OriginalSmartphone.objects.filter(data__DeviceName=phone_name)
            print(original)
            if len(original) > 0:
                original = original[0]
            else:
                return JsonResponse({"error": "invalid phone name"})
            specs = original.data
            specs_to_send = {}
            try:
                specs_to_send["camera"] = specs["triple"]
            except KeyError:
                try:
                    specs_to_send["camera"] = specs["dual_"]
                except KeyError:
                    try:
                        specs_to_send["camera"] = specs["single"]
                    except KeyError:
                        specs_to_send["camera"] = "Photo/Video"
            specs_to_send["ram"] = specs["internal"]
            try:
                specs_to_send["processor"] = specs["cpu"]
            except KeyError:
                try:
                    specs_to_send["processor"] = specs["chipset"]
                except:
                    specs_to_send["processor"] = "No Info"
            specs_to_send["battery"] = specs["battery_c"]
            phones_json[2] = specs_to_send
            phones_json[1] = sorted(phones_json[1], key=itemgetter('price'))
            for specific_phone in phones_json[1]:
                specific_phone["price"] = locale.currency(specific_phone["price"], grouping=True)[1:-3]
            return JsonResponse(phones_json, safe=False)
    else:
        return JsonResponse({"error": "method not allowed"})
def searcher(self):
    """
    Returns a searcher for this index.
    """
    self._index = index.open_dir(self.index_path)
    return self._index.searcher()
if __name__ == "__main__": args = parse_args() with open(args.data_src_set, "r", encoding="utf-8") as data_src_r: with open(args.data_tgt_set, "r", encoding="utf-8") as data_tgt_r: with open(args.test_set, "r", encoding="utf-8") as test_r: with open(args.test_set + ".retrive.top-{}".format(args.top_K), "w", encoding="utf-8") as test_w: schema = Schema(source_tok=TEXT(stored=True), target_tok=TEXT(stored=True), source=TEXT(stored=True), target=TEXT(stored=True)) if os.path.exists(args.indexdir): print("Loading index dir {}".format(args.indexdir)) ix = open_dir(args.indexdir) else: print("reading source data file") data_src_lines = data_src_r.readlines() print("reading target data file") data_tgt_lines = data_tgt_r.readlines() os.mkdir(args.indexdir) print("Crating index dir {}".format(args.indexdir)) ix = create_in(args.indexdir, schema) writer = ix.writer() print("Index Build Start") for data_src_line, data_tgt_line in tqdm.tqdm( zip(data_src_lines, data_tgt_lines)): data_src_line_tok = data_src_line.strip().replace( "@@ ", "") data_tgt_line_tok = data_tgt_line.strip().replace(
def update_file_index(self):
    log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
               'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

    idx = open_dir(self.index_location, indexname=self.indexname)
    # The set of all paths in the index
    indexed_paths = set()
    # The set of all paths we need to re-index
    to_index = set()

    writer = idx.writer()
    writer_is_dirty = False
    try:
        with idx.reader() as reader:
            # Loop over the stored fields in the index
            for fields in reader.all_stored_fields():
                indexed_path = fields['path']
                indexed_repo_path = fields['repository']
                indexed_paths.add(indexed_path)

                if indexed_repo_path not in self.filtered_repo_update_paths:
                    continue

                repo = self.repo_paths[indexed_repo_path]

                try:
                    node = self.get_node(repo, indexed_path)
                    # Check if this file was changed since it was indexed
                    indexed_time = fields['modtime']
                    mtime = self.get_node_mtime(node)
                    if mtime > indexed_time:
                        # The file has changed, delete it and add it to
                        # the list of files to reindex
                        log.debug('adding to reindex list %s mtime: %s vs %s'
                                  % (indexed_path, mtime, indexed_time))
                        writer.delete_by_term('fileid', indexed_path)
                        writer_is_dirty = True
                        to_index.add(indexed_path)
                except (ChangesetError, NodeDoesNotExistError):
                    # This file was deleted since it was indexed
                    log.debug('removing from index %s' % indexed_path)
                    writer.delete_by_term('path', indexed_path)
                    writer_is_dirty = True

        # Loop over the files in the filesystem
        # Assume we have a function that gathers the filenames of the
        # documents to be indexed
        ri_cnt_total = 0  # indexed
        riwc_cnt_total = 0  # indexed with content
        for repo_name, repo in self.repo_paths.items():
            # skip indexing if there aren't any revisions
            if len(repo) < 1:
                continue
            ri_cnt = 0  # indexed
            riwc_cnt = 0  # indexed with content
            for path in self.get_paths(repo):
                path = safe_unicode(path)
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    i, iwc = self.add_doc(writer, path, repo, repo_name)
                    writer_is_dirty = True
                    log.debug('re indexing %s' % path)
                    ri_cnt += i
                    ri_cnt_total += 1
                    riwc_cnt += iwc
                    riwc_cnt_total += iwc
            log.debug('added %s files %s with content for repo %s'
                      % (ri_cnt + riwc_cnt, riwc_cnt, repo.path))
        log.debug('indexed %s files in total and %s with content'
                  % (ri_cnt_total, riwc_cnt_total))
    finally:
        if writer_is_dirty:
            log.debug('>> COMMITING CHANGES TO FILE INDEX <<')
            writer.commit(merge=True)
            log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
        else:
            log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
            writer.cancel()
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

runPath = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(runPath, ".."))

from lib.Config import Configuration
from lib.DatabaseLayer import DatabaseLayer

indexpath = Configuration.getIndexdir()
#basepath = os.path.join(os.sep, *os.path.dirname(os.path.realpath(__file__)).rsplit('/')[:-1])
#print (os.path.split(os.path.join(basepath,indexpath)))

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

ix = index.open_dir(indexpath)

argParser = argparse.ArgumentParser(description='Full text search for cve-search')
argParser.add_argument('-q', action='append', help='query to lookup (one or more)')
argParser.add_argument('-o', action='store_true', help='OR of the query to lookup (default is AND)')
argParser.add_argument('-t', action='store_true', help='output title of the match CVE(s)')
argParser.add_argument('-f', action='store_true', help='output matching CVE(s) in JSON')
def Load(indexdir):
    ix = open_dir(indexdir)
    return ix
def available_checker(request):
    scraped = 0
    original = 0
    all_original_phones = OriginalSmartphone.objects.all()
    ix = index.open_dir("index")
    qp = QueryParser("phone_name", schema=ix.schema)
    for original_phone in all_original_phones:
        q = qp.parse(original_phone.data["DeviceName"])
        another_trial = False
        with ix.searcher() as s:
            results = s.search(q)
            if len(results) != 0:
                original_phone.available = True
                original_phone.save()
                original += 1
                if original_phone.data["DeviceName"] == "vivo Y65":
                    for i in range(30):
                        print("")
                    print("In First")
                    print(len(results))
                    for result in results:
                        print(result["phone_name"])
                    for i in range(30):
                        print("")
                for result in results:
                    scraped_phones = ScrapedSmartphone.objects.filter(data__name=result["phone_name"])
                    for scraped_phone in scraped_phones:
                        scraped_phone.available = True
                        scraped_phone.belongs_to = original_phone
                        scraped_phone.save()
                        scraped += 1
            else:
                another_trial = True
        if another_trial == True:
            keyword = original_phone.data["DeviceName"].split(" ")[1:]
            q = qp.parse(" ".join(keyword))
            with ix.searcher() as s:
                results = s.search(q)
                if len(results) != 0:
                    original_phone.available = True
                    original_phone.save()
                    original += 1
                    if original_phone.data["DeviceName"] == "vivo Y65":
                        for i in range(30):
                            print("")
                        print("In Second")
                        for result in results:
                            print(result["phone_name"])
                        for i in range(30):
                            print("")
                    for result in results:
                        scraped_phones = ScrapedSmartphone.objects.filter(data__name=result["phone_name"])
                        for scraped_phone in scraped_phones:
                            scraped_phone.available = True
                            scraped_phone.belongs_to = original_phone
                            scraped_phone.save()
                            scraped += 1
        i = list(all_original_phones).index(original_phone)
        percentage = i * 100 / len(all_original_phones)
        print(percentage)
    print("Original ", original)
    print("Scraped ", scraped)
def query_thread(queue, database, g_minus_d, e1_type, e2_type, index):
    idx = open_dir(index)
    regex_tokenize = re.compile('\w+|-|<[A-Z]+>[^<]+</[A-Z]+>', re.U)
    tokenizer = RegexTokenizer(regex_tokenize)
    stopper = StopFilter()
    count = 0
    with idx.searcher() as searcher:
        while True:
            r = queue.get_nowait()
            count += 1
            if count % 25000 == 0:
                print multiprocessing.current_process(), count, queue.qsize()
            if len(database[(r.ent1, r.ent2)]) == 0:
                # if it's not in the database, calculate the PMI
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                terms = list()
                for token in stopper(tokenizer((r.between.decode("utf8")), renumber=True)):
                    terms.append(query.Term("sentence", token.text))
                #print terms
                t1 = query.Term("sentence", entity1)
                t3 = query.Term("sentence", entity2)
                query_terms = list()
                query_terms.append(t1)
                for t in terms:
                    query_terms.append(t)
                query_terms.append(t3)
                q1 = spans.SpanNear2(query_terms, slop=2, ordered=True)
                q2 = spans.SpanNear2([t1, t3], slop=8, ordered=True)
                entities_r = searcher.search(q1)
                entities = searcher.search(q2)
                """
                print query_terms, len(entities_r)
                print [t1, t3], len(entities)
                print "\n"
                """
                #print entity1, '\t', r.between, '\t', entity2, len(entities_r), len(entities)
                try:
                    assert not len(entities_r) > len(entities)
                except AssertionError, e:
                    print e
                    print r.sentence
                    print r.ent1
                    print r.ent2
                    print query_terms
                    print [t1, t3]
                if len(entities) > 0:
                    pmi = float(len(entities_r)) / float(len(entities))
                    if pmi >= 0.5:
                        #print entity1, '\t', r.between, '\t', entity2, pmi
                        g_minus_d.append(r)
            if queue.empty() is True:
                break
from whoosh.index import open_dir
from whoosh.index import create_in
from whoosh.query import Term, And, Or
from whoosh.qparser import QueryParser

my_schema = Schema(id=ID(unique=True, stored=True),
                   lang=TEXT(),
                   screenname=TEXT(),
                   tweettext=TEXT(),
                   hashtags=TEXT(),
                   datetime=DATETIME())

if not os.path.exists("tweets_index"):
    os.mkdir("tweets_index")
    index = create_in("tweets_index", my_schema)
index = open_dir("tweets_index")
writer = index.writer()

df = pd.read_csv('tweets/tweets.csv', header=None,
                 names=['id', 'language', 'screenname', 'tweettext', 'hashtags', 'timestamp'])
for row in df.iterrows():
    try:
        dt = datetime.datetime.fromtimestamp(int(row[1].timestamp.rstrip("L")))
    except:
        dt = None
    try:
        tt = unicode(row[1].tweettext, errors="ignore")
def openIndex(self, dir):
    self.ix = index.open_dir(dir)
#! /usr/bin/env python
# -*- coding:utf-8 -*-
import whoosh.index as index
from config import indexdir_path
from whoosh import columns, fields, index, sorting
from whoosh.qparser import QueryParser, MultifieldParser, query
import json

ix = index.open_dir(indexdir_path)
facet = sorting.FieldFacet("comment_num", reverse=True)
searcher = ix.searcher()

# give the index a name, then open it by that name
# ix = index.create_in("indexdir", schema=schema, indexname="usages")
# ix = index.open_dir("indexdir", indexname="usages")

# test whether an index file exists
# exists = index.exists_in("indexdir")
# usages_exists = index.exists_in("indexdir", indexname="usages")

# delete documents
# delete by field term
# ix.delete_by_term('path', u'/a/b/c')
# delete by query
# delete_by_query(query)
#
# ix.commit()

# updating the index requires a unique field (a field marked unique must be indexed)
# schema = Schema(path=ID(unique=True), content=TEXT)
# writer.add_document(path=u"/a", content=u"the first document")
def search_combined(search_term, limit=3):
    return _do_search(open_dir("indexcomb"), search_term, limit)