def update_index(session, index):
    print("Calculating query size...")
    query = session.query(Message)  # .filter(Message.sent_at > '2016-01-01').filter_by(list_id='python-dev')
    count = query.count()
    writer = index.writer()
    with tqdm(total=count) as pbar:
        for idx, message in enumerate(query.yield_per(100)):
            pbar.update(1)
            if not message.text:
                continue
            writer.add_document(
                list_id=message.list_id,
                message_id=message.message_id,
                content=clean_message(message.text),
                author=message.author,
                sent_at=message.sent_at,
                thread_parent=message.thread_parent,
                thread_idx=message.thread_idx,
                thread_indent=message.thread_indent,
                page=message.page,
                subject=message.subject,
            )
            if idx % 10000 == 0 and idx != 0:
                pbar.write("Committing at doc {}...".format(idx))
                writer.commit()
                writer = index.writer()
        pbar.write("Committing at doc {}...".format(idx + 1))
        writer.commit()
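# A minimal self-contained sketch of the batch-commit pattern used above:
# commit every N documents and open a fresh writer, keeping memory bounded on
# large loads. RamStorage and the field names here are illustrative.
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import RamStorage

def batch_commit_demo(n=25, batch=10):
    ix = RamStorage().create_index(Schema(id=ID(stored=True), body=TEXT))
    writer = ix.writer()
    for i in range(n):
        writer.add_document(id=u'%d' % i, body=u'document %d' % i)
        if i % batch == 0 and i != 0:
            writer.commit()       # flush this batch to the index
            writer = ix.writer()  # a committed writer cannot be reused
    writer.commit()
    return ix.doc_count()  # == n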
def update(self, force_rebuild=False):
    """ Adds/updates all items in repo to index. Note: querying will call this automatically."""

    # if we've already updated the index during this script run, we're done!
    if self.index_updated:
        return False

    # if the index is not based on the current commit, rebuild from scratch
    if not self.index_based_on_current_commit():
        force_rebuild = True

    if force_rebuild:
        # get a new clean/empty index
        index = self.get_index(force_rebuild)
        index_writer = index.writer()
        # index all documents
        documents = self.document_iterator()
        activity_description = 'Rebuilding'
    else:
        # use the current index
        index = self.get_index()
        index_writer = index.writer()

        # delete uncommitted files that are in the index already
        for filepath in self.get_indexed_uncommitted_files():
            index_writer.delete_by_term('path', filepath)

        # get the list of uncommitted files and persist it
        uncommitted_files = lib_git.get_uncommitted_oval()
        self.set_indexed_uncommitted_files(uncommitted_files)

        # if there are no uncommitted files to index, we're done
        if not uncommitted_files:
            index_writer.commit()
            return False

        # index only uncommitted files
        documents = self.document_iterator(uncommitted_files)
        activity_description = 'Updating'

    # add all definition files to the index
    counter = 0
    for document in documents:
        counter = counter + 1
        self.status_spinner(counter, '{0} {1} index'.format(activity_description, self.index_name), self.item_label)
        if 'deleted' in document and document['deleted']:
            index_writer.delete_by_term('path', document['path'])
            #self.message('debug', 'Deleting from index:\n\t{0} '.format(document['path']))
        else:
            index_writer.add_document(**document)
            #self.message('debug', 'Upserting to index:\n\t{0} '.format(document['path']))
    index_writer.commit()
    self.status_spinner(counter, '{0} {1} index'.format(activity_description, self.index_name), self.item_label, True)

    # update indexed commit
    self.set_indexed_commit_hash()
    self.index_updated = True
def _get_index(self, index_path, index_name):
    if not whoosh.index.exists_in(index_path, index_name):
        print 'creating %s index at %s' % (index_name, index_path)
        if not os.path.exists(index_path):
            os.makedirs(index_path)
        schema = whoosh.fields.Schema(
            id=whoosh.fields.ID(stored=True, unique=True),
            artist=whoosh.fields.TEXT(stored=True),
            title=whoosh.fields.TEXT(stored=True),
            lyrics=whoosh.fields.TEXT(stored=True),
        )
        index = whoosh.index.create_in(index_path, schema, index_name)
        index.writer().commit()
    return whoosh.index.open_dir(index_path, index_name)
def create_index_writer(index_path):
    '''
    Constructs a whoosh index writer, which has ID, artist and title fields

    :parameters:
        - index_path : str
            Path to whoosh index to be written

    :returns:
        - index : whoosh.writing.IndexWriter
            Whoosh index writer
    '''
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    A = (whoosh.analysis.StandardAnalyzer(stoplist=None, minsize=1)
         | whoosh.analysis.CharsetFilter(accent_map))
    Schema = whoosh.fields.Schema(id=whoosh.fields.ID(stored=True),
                                  path=whoosh.fields.TEXT(stored=True),
                                  artist=whoosh.fields.TEXT(stored=True, analyzer=A),
                                  title=whoosh.fields.TEXT(stored=True, analyzer=A))
    index = whoosh.index.create_in(index_path, Schema)
    return index.writer()
def after_commit(self, session):
    """
    Any db updates go through here. We check if any of these models have
    ``__searchable__`` fields, indicating they need to be indexed. With these
    we update the whoosh index for the model. If no index exists, it will be
    created here; this could impose a penalty on the initial commit of a
    model.
    """
    for typ, values in self.to_update.iteritems():
        model_class = values[0][1].__class__
        index = self.index_for_model_class(model_class)
        with index.writer() as writer:
            primary_field = model_class.search_query.primary
            searchable = model_class.__searchable__
            for change_type, model in values:
                # delete everything. stuff that's updated or inserted will get
                # added as a new doc. Could probably replace this with a whoosh
                # update.
                writer.delete_by_term(primary_field, unicode(getattr(model, primary_field)))
                if change_type in ("new", "changed"):
                    attrs = dict((key, getattr(model, key)) for key in searchable)
                    attrs[primary_field] = unicode(getattr(model, primary_field))
                    writer.add_document(**attrs)
    self.to_update = {}
def _after_flush(app, changes):
    # Any db updates go through here. We check if any of these models have
    # ``__searchable__`` fields, indicating they need to be indexed. With these
    # we update the whoosh index for the model. If no index exists, it will be
    # created here; this could impose a penalty on the initial commit of a
    # model.
    bytype = {}  # sort changes by type so we can use per-model writer
    for change in changes:
        update = change[1] in ("update", "insert")
        if hasattr(change[0].__class__, __searchable__):
            bytype.setdefault(change[0].__class__.__name__, []).append((update, change[0]))

    for model, values in bytype.items():
        index = whoosh_index(app, values[0][1].__class__)
        with index.writer() as writer:
            primary_field = values[0][1].pure_whoosh.primary_key_name
            searchable = values[0][1].__searchable__
            for update, v in values:
                if update:
                    attrs = {}
                    for key in searchable:
                        try:
                            attrs[key] = unicode(getattr(v, key))
                        except AttributeError:
                            raise AttributeError("{0} does not have {1} field {2}".format(model, __searchable__, key))
                    attrs[primary_field] = unicode(getattr(v, primary_field))
                    writer.update_document(**attrs)
                else:
                    writer.delete_by_term(primary_field, unicode(getattr(v, primary_field)))
def build_index(sa_session, toolshed_whoosh_index_dir):
    storage = FileStorage(toolshed_whoosh_index_dir)
    index = storage.create_index(schema)
    writer = index.writer()

    def to_unicode(a_basestr):
        if type(a_basestr) is str:
            return unicode(a_basestr, 'utf-8')
        else:
            return a_basestr

    repos_indexed = 0
    for (id, name, description, long_description, homepage_url,
         remote_repository_url, repo_owner_username, times_downloaded,
         approved, last_updated, full_last_updated) in get_repos(sa_session):
        writer.add_document(
            id=id,
            name=to_unicode(name),
            description=to_unicode(description),
            long_description=to_unicode(long_description),
            homepage_url=to_unicode(homepage_url),
            remote_repository_url=to_unicode(remote_repository_url),
            repo_owner_username=to_unicode(repo_owner_username),
            times_downloaded=times_downloaded,
            approved=approved,
            last_updated=last_updated,
            full_last_updated=full_last_updated)
        repos_indexed += 1
    writer.commit()
    print "Number of repos indexed: ", repos_indexed
def index_one_record(record, delete=False, writer=None, index_parent=False):
    index = whoosh_index(current_app, record.__class__)
    close = False
    if not writer:
        writer = index.writer()
        close = True
    if index_parent:
        # index parent class
        parent_writer = whoosh_index(
            current_app, record.__class__.__base__).writer()
    primary_field = record.pure_whoosh.primary_key_name
    searchable = index.schema.names()
    if not delete:
        attrs = {}
        for key in searchable:
            attrs[key] = str(getattr(record, key))
        attrs[primary_field] = str(getattr(record, primary_field))
        writer.update_document(**attrs)
        if index_parent:
            parent_writer.update_document(**attrs)
    else:
        writer.delete_by_term(primary_field, str(getattr(record, primary_field)))
        if index_parent:
            parent_writer.delete_by_term(primary_field, str(getattr(record, primary_field)))
    if close:
        writer.commit()
def update_documentation_index():
    from flask_website.docs import DocumentationPage
    writer = index.writer()
    for page in DocumentationPage.iter_pages():
        page.remove_from_search_index(writer)
        page.add_to_search_index(writer)
    writer.commit()
def after_commit(self, session):
    """
    Any db updates go through here. We check if any of these models have
    ``__searchable__`` fields, indicating they need to be indexed. With these
    we update the whoosh index for the model. If no index exists, it will be
    created here; this could impose a penalty on the initial commit of a
    model.
    """
    for typ, values in self.to_update.iteritems():
        model_class = values[0][1].__class__
        index = self.index_for_model_class(model_class)
        with index.writer() as writer:
            primary_field = model_class.search_query.primary
            searchable = model_class.__searchable__
            for change_type, model in values:
                if change_type == "deleted":
                    writer.delete_by_term(primary_field, unicode(getattr(model, primary_field)))
                else:
                    attrs = dict((key, getattr(model, key)) for key in searchable)
                    attrs[primary_field] = unicode(getattr(model, primary_field))
                    if change_type == "new":
                        writer.add_document(**attrs)
                    elif change_type == "changed":
                        writer.update_document(**attrs)
    self.to_update = {}
def _after_flush(app, changes):
    # Any db updates go through here. We check if any of these models have
    # ``__searchable__`` fields, indicating they need to be indexed. With these
    # we update the whoosh index for the model. If no index exists, it will be
    # created here; this could impose a penalty on the initial commit of a
    # model.
    if app.config.get('WHOOSH_DISABLED') is True:
        return
    bytype = {}  # sort changes by type so we can use per-model writer
    for change in changes:
        update = change[1] in ('update', 'insert')
        if hasattr(change[0].__class__, __searchable__):
            bytype.setdefault(change[0].__class__.__name__, []).append(
                (update, change[0]))
    if not bytype:
        return
    try:
        for model, values in list(bytype.items()):
            index = whoosh_index(app, values[0][1].__class__)
            with index.writer() as writer:
                for update, v in values:
                    has_parent = isinstance(
                        v.__class__.__base__, DeclarativeMeta) and \
                        hasattr(v.__class__.__base__, '__searchable__')
                    index_one_record(
                        v, not update, writer, index_parent=has_parent)
    except Exception as ex:
        logging.error("FAIL updating index of %s msg: %s" % (model, str(ex)))
def create_search_index():
    """
    Set up a Whoosh search index based on the keys in the given spellbook.
    """
    # Try to open the index if it already exists and is recent.
    if os.path.exists(config['search_index_path']):
        idx_modified = os.path.getmtime(config['search_index_path'])
        spellbook_modified = os.path.getmtime(env['spellbook_path'])
        if spellbook_modified < idx_modified:
            return whoosh.index.open_dir(config['search_index_path'])
    else:
        os.makedirs(config['search_index_path'])

    schema = whoosh.fields.Schema(name=whoosh.fields.NGRAMWORDS(stored=True),
                                  contents=whoosh.fields.STORED)
    index = whoosh.index.create_in(config['search_index_path'], schema)
    writer = index.writer()
    for spell_name, spell_contents in env['flat_spellbook'].items():
        writer.add_document(name=spell_name, contents=spell_contents)
    writer.commit()
    return index
def reindex_snippets():
    from flask_website.database import Snippet
    writer = index.writer()
    for snippet in Snippet.query.all():
        snippet.remove_from_search_index(writer)
        snippet.add_to_search_index(writer)
    writer.commit()
def after_flush(app, changes):
    '''
    Any db updates go through here. We check if any of these models have
    ``__searchable__`` fields, indicating they need to be indexed. With these
    we update the whoosh index for the model. If no index exists, it will be
    created here; this could impose a penalty on the initial commit of a
    model.
    '''
    bytype = {}  # sort changes by type so we can use per-model writer
    for change in changes:
        update = change[1] in ('update', 'insert')
        if hasattr(change[0].__class__, '__searchable__'):
            bytype.setdefault(change[0].__class__.__name__, []).append((update, change[0]))

    for typ, values in bytype.iteritems():
        index = whoosh_index(app, values[0][1])
        with index.writer() as writer:
            primary_field = values[0][1].search_query.primary
            searchable = values[0][1].__searchable__
            for update, v in values:
                # delete everything. stuff that's updated or inserted will get
                # added as a new doc. Could probably replace this with a whoosh
                # update.
                writer.delete_by_term(primary_field, unicode(getattr(v, primary_field)))
                if update:
                    attrs = dict((key, getattr(v, key)) for key in searchable)
                    attrs[primary_field] = unicode(getattr(v, primary_field))
                    writer.add_document(**attrs)
def update_model_based_indexes(session, flush_context):
    """Called by a session event, updates the model based documents."""
    to_delete = []
    to_add = []

    for model in session.new:
        if isinstance(model, Indexable):
            to_add.append(model)

    for model in session.dirty:
        if isinstance(model, Indexable):
            to_delete.append(model)
            to_add.append(model)

    # deleted models only need removing from the index (the original code
    # iterated session.dirty a second time here, which was clearly a bug)
    for model in session.deleted:
        if isinstance(model, Indexable):
            to_delete.append(model)

    if not (to_delete or to_add):
        return

    writer = index.writer()
    for model in to_delete:
        model.remove_from_search_index(writer)
    for model in to_add:
        model.add_to_search_index(writer)
    writer.commit()
def index_one_record(record, delete=False, writer=None, index_parent=False):
    index = whoosh_index(current_app, record.__class__)
    close = False
    if not writer:
        writer = index.writer()
        close = True
    if index_parent:
        # index parent class
        parent_writer = whoosh_index(
            current_app, record.__class__.__base__).writer()
    primary_field = record.pure_whoosh.primary_key_name
    searchable = index.schema.names()
    if not delete:
        attrs = {}
        for key in searchable:
            attrs[key] = unicode(getattr(record, key))
        attrs[primary_field] = unicode(getattr(record, primary_field))
        writer.update_document(**attrs)
        if index_parent:
            parent_writer.update_document(**attrs)
    else:
        writer.delete_by_term(primary_field, unicode(getattr(record, primary_field)))
        if index_parent:
            parent_writer.delete_by_term(primary_field, unicode(getattr(record, primary_field)))
    if close:
        writer.commit()
def index_objects(self, objects, index='default'):
    """Bulk index a list of objects."""
    if not objects:
        return
    index_name = index
    index = self.app_state.indexes[index_name]
    indexed = set()
    with index.writer() as writer:
        for obj in objects:
            document = self.get_document(obj)
            if document is None:
                continue
            object_key = document['object_key']
            if object_key in indexed:
                continue
            writer.delete_by_term('object_key', object_key)
            try:
                writer.add_document(**document)
            except ValueError:
                # the logger is here to give us more info, in order to catch a
                # weird bug that happens regularly on CI but is not reliably
                # reproducible.
                logger.error(
                    'writer.add_document(%r)',
                    document,
                    exc_info=True,
                )
                raise
            indexed.add(object_key)
def scrape_profiles(index):
    for profile_type in PROFILE_TYPES:
        print(f'Processing profile type: {profile_type}')
        profiles = scrape_profiles_of_type(profile_type)
        with index.writer() as index_writer:
            for profile in profiles:
                index_writer.add_document(**profile)
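# Note that `with index.writer() as writer:` commits automatically on clean
# exit and cancels on exception, which is why scrape_profiles above has no
# explicit commit(). A minimal self-contained sketch of that behavior:
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage

ix = RamStorage().create_index(Schema(name=TEXT(stored=True)))
with ix.writer() as w:
    w.add_document(name=u'alpha')
    w.add_document(name=u'beta')
print(ix.doc_count())  # 2: the context manager committed for us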
def create_index_writer(index_path):
    '''
    Constructs a whoosh index writer, which has ID, artist and title fields

    :parameters:
        - index_path : str
            Path to whoosh index to be written

    :returns:
        - index : whoosh.writing.IndexWriter
            Whoosh index writer
    '''
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    A = (whoosh.analysis.StandardAnalyzer(stoplist=None, minsize=1)
         | whoosh.analysis.CharsetFilter(accent_map))
    Schema = whoosh.fields.Schema(
        id=whoosh.fields.ID(stored=True),
        path=whoosh.fields.TEXT(stored=True),
        artist=whoosh.fields.TEXT(stored=True, analyzer=A),
        title=whoosh.fields.TEXT(stored=True, analyzer=A))
    index = whoosh.index.create_in(index_path, Schema)
    return index.writer()
def _after_flush(app, changes):
    # Any db updates go through here. We check if any of these models have
    # ``__searchable__`` fields, indicating they need to be indexed. With these
    # we update the whoosh index for the model. If no index exists, it will be
    # created here; this could impose a penalty on the initial commit of a
    # model.
    if app.config.get('WHOOSH_DISABLED') is True:
        return
    bytype = {}  # sort changes by type so we can use per-model writer
    for change in changes:
        update = change[1] in ('update', 'insert')
        if hasattr(change[0].__class__, __searchable__):
            bytype.setdefault(change[0].__class__.__name__, []).append(
                (update, change[0]))
    if not bytype:
        return
    try:
        for model, values in bytype.items():
            index = whoosh_index(app, values[0][1].__class__)
            with index.writer() as writer:
                for update, v in values:
                    has_parent = isinstance(
                        v.__class__.__base__, DeclarativeMeta)
                    index_one_record(
                        v, not update, writer, index_parent=has_parent)
    except Exception as ex:
        logging.warning("FAIL updating index of %s msg: %s" % (model, str(ex)))
def create_index_writer(index_path):
    '''Create a new whoosh index in the given directory path.

    Input: directory in which to create the index
    Output: `whoosh.index` writer object
    '''
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    analyzer = (whoosh.analysis.StemmingAnalyzer() |
                whoosh.analysis.CharsetFilter(accent_map))
    schema = whoosh.fields.Schema(
        track_id=whoosh.fields.STORED,
        title=whoosh.fields.TEXT(stored=True, analyzer=analyzer),
        artist=whoosh.fields.TEXT(stored=True, analyzer=analyzer),
        album=whoosh.fields.TEXT(stored=True, analyzer=analyzer),
        collection=whoosh.fields.KEYWORD(stored=True),
        collection_id=whoosh.fields.NUMERIC(stored=True))
    index = whoosh.index.create_in(index_path, schema)
    return index.writer()
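# A usage sketch for create_index_writer above, assuming `accent_map` is
# whoosh.support.charset.accent_map (as the CharsetFilter chain implies); the
# path and field values are illustrative.
writer = create_index_writer('/tmp/music_index')
writer.add_document(track_id=1,
                    title=u'Señorita',
                    artist=u'Some Artist',
                    album=u'Some Album',
                    collection=u'demo',
                    collection_id=1)
writer.commit()  # persist the document and release the writer lock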
def _get_writer(index):
    writer = None
    while writer is None:
        try:
            writer = index.writer()
        except whoosh.index.LockError:
            time.sleep(0.25)
    return writer
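# _get_writer simply polls until Whoosh's single writer lock is free; with an
# uncontended index it returns immediately. A self-contained sketch (note that
# whoosh.writing.AsyncWriter is a library-provided alternative to polling):
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage

ix = RamStorage().create_index(Schema(title=TEXT(stored=True)))
w = _get_writer(ix)
w.add_document(title=u'hello')
w.commit()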
def add_email(index, base, email_id):
    email_bytes = get_from_wikileaks_by_index(base, email_id)
    content = analyzer.retrieve_email_content(email_bytes, base + str(email_id))
    subject = analyzer.retrieve_subject(email_bytes)
    writer = index.writer()
    writer.add_document(url=base + str(email_id), content=content, subject=subject)
    writer.commit()
def rebuild_index_model(self, model_class, session):
    index = self.index_for_model_class(model_class)
    with index.writer() as writer:
        primary_field = model_class.search_query.primary
        searchable = model_class.__searchable__
        for i in session.query(model_class):
            attrs = dict((key, getattr(i, key)) for key in searchable)
            attrs[primary_field] = unicode(getattr(i, primary_field))
            writer.delete_by_term(primary_field, unicode(getattr(i, primary_field)))
            writer.add_document(**attrs)
def appendtextindex(table, index_or_dirname, indexname=None, merge=True,
                    optimize=False):
    """
    Load all rows from `table` into a Whoosh index, adding them to any existing
    data in the index.

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are
        stored in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    """
    import whoosh.index

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname, indexname=indexname,
                                      readonly=False)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    writer = index.writer()
    try:
        for d in dicts(table):
            writer.add_document(**d)
        writer.commit(merge=merge, optimize=optimize)
    except Exception:
        writer.cancel()
        raise
    finally:
        if needs_closing:
            index.close()
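# A usage sketch for appendtextindex, following petl conventions (a table
# container whose header row names the fields). 'indexdir' and the schema are
# illustrative; the index must already exist with matching fields.
import os
import whoosh.index
from whoosh.fields import Schema, ID, TEXT

if not os.path.exists('indexdir'):
    os.mkdir('indexdir')
    whoosh.index.create_in('indexdir', Schema(id=ID(stored=True),
                                              title=TEXT(stored=True)))
table = [['id', 'title'],
         [u'1', u'Once upon a time'],
         [u'2', u'Happily ever after']]
appendtextindex(table, 'indexdir')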
def handle_postupdate(item_id):  # pragma: no cover
    """Insert item data into indexer."""
    item = icecrate.items.by_item_id(item_id)
    writer = index.writer()
    writer.update_document(
        upc=item_id,
        name=item.get("name"),
        tags=list(icecrate.tags._split_tags(item.get("tags", ""))))
    writer.commit()
def createIndexWriter(indexPath):
    if not os.path.exists(indexPath):
        os.mkdir(indexPath)
    A = whoosh.analysis.FancyAnalyzer() | whoosh.analysis.CharsetFilter(accent_map)
    Schema = whoosh.fields.Schema(
        song_id=whoosh.fields.ID(stored=True),
        artist=whoosh.fields.TEXT(stored=True, analyzer=A),
        title=whoosh.fields.TEXT(stored=True, analyzer=A))
    index = whoosh.index.create_in(indexPath, Schema)
    return index.writer()
def write_db():
    index = storage.create_index(schema)
    writer = index.writer()
    # read docs from the database using django
    for dou in Preview.objects.all():
        doc = {}
        doc['id'] = _from_python(str(dou.id))
        text = dou.description
        doc[index_fieldname] = text
        try:
            writer.update_document(**doc)
        except Exception, e:
            raise
    writer.commit()  # commit once at the end so the updates are persisted
def write_db(storage, schema):
    index = storage.create_index(schema)
    writer = index.writer()
    # for dou in DoubanMovie.objects.filter(id__lte=4000).annotate(cnt=Count('movielink')).filter(cnt__gt=0).order_by('-cnt'):
    for obj in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
        doc = {}
        doc['id'] = _from_python(str(obj.id))
        doc['title'] = obj.title
        doc['content'] = obj.content
        try:
            writer.update_document(**doc)
        except Exception, e:
            raise
    writer.commit()  # commit once at the end so the updates are persisted
def reindex(self):
    """Reindex all data

    This method retrieves all the data from the registered models and
    calls the ``update_<model>()`` function for every instance of such
    model.
    """
    for wh in self.whoosheers:
        index = type(self).get_or_create_index(_get_app(self), wh)
        with index.writer(timeout=_get_config(self)['writer_timeout']) as writer:
            for model in wh.models:
                method_name = "{0}_{1}".format(UPDATE_KWD, model.__name__.lower())
                for item in model.query.all():
                    getattr(wh, method_name)(writer, item)
def write_db(storage, schema):
    index = storage.create_index(schema)
    writer = index.writer()
    # for dou in DoubanMovie.objects.filter(id__lte=4000).annotate(cnt=Count('movielink')).filter(cnt__gt=0).order_by('-cnt'):
    for obj in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
        doc = {}
        doc['id'] = _from_python(str(obj.id))
        doc['title'] = obj.title
        doc['content'] = obj.content
        try:
            writer.update_document(**doc)
        except Exception, e:
            raise
    writer.commit()  # commit once at the end so the updates are persisted
def add_snippet():
    title = request.form['title'].strip()
    content = request.form['content'].strip()
    tag = request.form['tag']
    language = request.form['language']
    snippet_id = unicode(uuid.uuid4())
    if not title or not content:
        raise Exception("Empty title or snippet content")
    writer = index.writer()
    writer.update_document(id=snippet_id, content=content, tag=tag,
                           title=title, language=language)
    writer.commit()
    return '{"success": true, "message": "Snippet added successfully", "snippet_id": "%s"}' % snippet_id
def reindex(self):
    """Reindex all data

    This method retrieves all the data from the registered models and
    calls the ``update_<model>()`` function for every instance of such
    model.
    """
    for wh in self.whoosheers:
        index = type(self).get_or_create_index(_get_app(self), wh)
        writer = index.writer(timeout=_get_config(self)['writer_timeout'])
        for model in wh.models:
            method_name = "{0}_{1}".format(UPDATE_KWD, model.__name__.lower())
            for item in model.query.all():
                getattr(wh, method_name)(writer, item)
        writer.commit()
def _store(session, context):
    # Any db updates go through here. We check if any of these models have
    # ``__searchable__`` fields, indicating they need to be indexed. With these
    # we update the whoosh index for the model. If no index exists, it will be
    # created here; this could impose a penalty on the initial commit of a
    # model.
    app = session.app
    changes = session._model_changes.values()

    bytype = {}  # sort changes by type so we can use per-model writer
    for obj, operation in changes:
        if hasattr(obj.__class__, __searchable__):
            bytype.setdefault(obj.__class__, []).append((obj, operation))

    for cls, values in bytype.iteritems():
        index = whoosh_index(app, cls)
        with index.writer() as writer:
            primary_field = cls.pure_whoosh.primary_key_name
            searchable = cls.__searchable__
            for obj, operation in values:
                if operation in ('update', 'insert'):
                    attrs = {}
                    for item in searchable:
                        if isinstance(item, tuple):
                            key, field = item
                        else:
                            key = item
                            field = None
                        try:
                            value = getattr(obj, key)
                        except AttributeError:
                            raise AttributeError('{0} does not have {1} field {2}'
                                                 .format(cls.__name__, __searchable__, key))
                        if callable(value):
                            value = value()
                        if field is None or field.__class__ in UNICODE_TYPES:
                            value = unicode(value)
                        attrs[key] = value
                    attrs[primary_field] = unicode(getattr(obj, primary_field))
                    #print "update_document", attrs
                    writer.update_document(**attrs)
                elif operation == 'delete':
                    writer.delete_by_term(primary_field, unicode(getattr(obj, primary_field)))
def loader(index, col):
    """Takes collection data as input and writes it to the given index."""
    writer = index.writer()
    feed_data = get_data(col)
    for doc in feed_data:
        idx = doc["_id"]
        data = doc
        # data = json.dumps(doc)
        # print(data)
        body = dict_values_to_text(doc)
        writer.add_document(idx=idx, data=data, body=body)
    writer.commit()
    print(f"{index} loaded successfully")
def createIndexWriter(indexPath):
    if not os.path.exists(indexPath):
        os.mkdir(indexPath)
    A = whoosh.analysis.StemmingAnalyzer() | whoosh.analysis.CharsetFilter(accent_map)
    Schema = whoosh.fields.Schema(
        song_id=whoosh.fields.ID(stored=True),
        artist_id=whoosh.fields.STORED,
        artist=whoosh.fields.TEXT(stored=True, field_boost=8.0, analyzer=A),
        title=whoosh.fields.TEXT(stored=True, field_boost=4.0, analyzer=A),
        terms=whoosh.fields.KEYWORD(stored=True, scorable=True, commas=True))
    index = whoosh.index.create_in(indexPath, Schema)
    return index.writer()
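# field_boost multiplies the score contribution of matches in that field, so
# artist hits (8.0) outweigh title hits (4.0) in the schema above. A
# self-contained, illustrative sketch:
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import MultifieldParser

ix = RamStorage().create_index(Schema(artist=TEXT(stored=True, field_boost=8.0),
                                      title=TEXT(stored=True, field_boost=4.0)))
with ix.writer() as w:
    w.add_document(artist=u'nirvana', title=u'lithium')
    w.add_document(artist=u'lithium', title=u'nirvana')
with ix.searcher() as s:
    q = MultifieldParser(['artist', 'title'], ix.schema).parse(u'lithium')
    for hit in s.search(q):
        print(hit['artist'], hit['title'])  # the artist match should rank first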
def _flush_set(self, _set):
    for instance in _set:
        mapping = instance.__class__
        if mapping not in self.mappings:
            continue
        index = self.indexes[mapping.__name__]
        primary_field = mapping.search_query.primary
        primary_value = text_type(getattr(instance, primary_field))
        with index.writer() as writer:
            attrs = {}
            writer.delete_by_term(primary_field, primary_value)
            attrs[primary_field] = primary_value
            attrs['body'] = instance.index()
            writer.add_document(**attrs)
def _after_flush(app, changes):
    # Any db updates go through here. We check if any of these models have
    # ``__searchable__`` fields, indicating they need to be indexed. With these
    # we update the whoosh index for the model. If no index exists, it will be
    # created here; this could impose a penalty on the initial commit of a
    # model.
    bytype = {}  # sort changes by type so we can use per-model writer
    for change in changes:
        update = change[1] in ('update', 'insert')
        if hasattr(change[0].__class__, __searchable__):
            bytype.setdefault(change[0].__class__.__name__, []).append((update, change[0]))

    for model, values in bytype.iteritems():
        index = whoosh_index(app, values[0][1].__class__)
        with index.writer() as writer:
            primary_field = values[0][1].pure_whoosh.primary_key_name
            searchable = values[0][1].__searchable__
            for update, v in values:
                if update:
                    attrs = {}
                    for key in searchable:
                        # Dive into related models before going down the
                        # well-worn, original path. Adventure awaits!
                        if '.' in key:
                            # Let's assume only one-level relationships,
                            # i.e. names contain only one period.
                            related_model, field_name = key.split('.')
                            attrs[key] = unicode(getattr(getattr(v, related_model), field_name))
                        else:
                            try:
                                attrs[key] = unicode(getattr(v, key))
                            except AttributeError:
                                raise AttributeError('{0} does not have {1} field {2}'
                                                     .format(model, __searchable__, key))
                    attrs[primary_field] = unicode(getattr(v, primary_field))
                    writer.update_document(**attrs)
                else:
                    writer.delete_by_term(primary_field, unicode(getattr(v, primary_field)))
def update_snippet(snippet_id):
    title = request.form['title']
    content = request.form['content']
    language = request.form['language']
    tag = request.form['tag']
    if not title or not content:
        raise Exception("Empty title or content")
    if not get_snippet_by_id(snippet_id):
        abort(make_response('{"message": "The snippet you are trying to update doesn\'t exist"}', 404))
    writer = index.writer()
    writer.update_document(id=snippet_id, content=content, tag=tag,
                           title=title, language=language)
    writer.commit()
    return '{"success": true, "message": "Snippet updated successfully", "snippet_id": "%s"}' % snippet_id
def scrape_selections(index):
    selections = []
    for (dirpath, _, file_names) in os.walk(ROSTER_DATA_DIR):
        for file_name in file_names:
            file_path = os.path.join(dirpath, file_name)
            _, ext = os.path.splitext(file_name)
            if ext != '.cat':
                print(f'Ignoring file: {file_path}')
                continue
            print(f'Parsing file: {file_path}')
            cat_file = RosterFile(file_path)
            selections += cat_file.scrape_selections()
    with index.writer() as index_writer:
        for selection in selections:
            index_writer.add_document(**selection)
    return selections
def _after_flush(app, changes):
    # Any db updates go through here. We check if any of these models have
    # ``__searchable__`` fields, indicating they need to be indexed. With these
    # we update the whoosh index for the model. If no index exists, it will be
    # created here; this could impose a penalty on the initial commit of a
    # model.
    bytype = {}  # sort changes by type so we can use per-model writer
    for change in changes:
        update = change[1] in ('update', 'insert')
        if hasattr(change[0].__class__, __searchable__):
            bytype.setdefault(change[0].__class__.__name__, []).append(
                (update, change[0]))

    for model, values in bytype.items():
        index = whoosh_index(app, values[0][1].__class__)
        with index.writer() as writer:
            primary_field = values[0][1].pure_whoosh.primary_key_name
            searchable = values[0][1].__searchable__
            for update, v in values:
                if update:
                    attrs = {}
                    for key in searchable:
                        try:
                            attrs[key] = str(getattr(v, key))
                        except AttributeError:
                            raise AttributeError(
                                '{0} does not have {1} field {2}'.format(
                                    model, __searchable__, key))
                    attrs[primary_field] = str(getattr(v, primary_field))
                    writer.update_document(**attrs)
                else:
                    writer.delete_by_term(primary_field,
                                          str(getattr(v, primary_field)))
def index_objects(self, objects):
    """
    Bulk index a list of objects, which must not be indexed yet and must all
    be of the same class.
    """
    if not objects:
        return

    model_class = objects[0].__class__
    assert all(m.__class__ is model_class for m in objects), \
        "All objects must be of the same class."

    index = self.index_for_model_class(model_class)
    with index.writer() as writer:
        primary_field = model_class.search_query.primary
        indexed_fields = model_class.whoosh_schema.names()
        for model in objects:
            document = self.make_document(model, indexed_fields, primary_field)
            writer.add_document(**document)
def create_index_writer(index_path):
    '''
    Constructs a whoosh index writer, which has an ID field as well as artist
    and title

    Input:
        index_path - Path to whoosh index to be written
    Output:
        index - Whoosh index writer
    '''
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    A = whoosh.analysis.StemmingAnalyzer() | whoosh.analysis.CharsetFilter(accent_map)
    Schema = whoosh.fields.Schema(
        track_id=whoosh.fields.ID(stored=True),
        artist=whoosh.fields.TEXT(stored=True, analyzer=A),
        title=whoosh.fields.TEXT(stored=True, analyzer=A))
    index = whoosh.index.create_in(index_path, Schema)
    return index.writer()
def create_index(self):
    pdf_schema = Schema(id=ID(unique=True, stored=True),
                        path=ID(stored=True),
                        title=TEXT(stored=True),
                        text=TEXT,
                        textdata=TEXT(stored=True))
    if not os.path.exists(self.path_index):
        os.mkdir(self.path_index)
    index = create_in(self.path_index, pdf_schema)
    # index = open_dir('paper-index')
    paper_writer = index.writer()
    files_txt = [f for f in os.listdir(self.path_text) if f.endswith('.txt')]
    print('total papers:', len(files_txt))
    for i, f in enumerate(files_txt):
        print('{}/{} - {}'.format(i, len(files_txt), f))
        paper_writer.add_document(
            id=f,
            textdata=open(os.path.join(self.path_text, f), encoding='utf-8').read())
    paper_writer.commit()
def build_hybrid_index(index, repo, ref='HEAD'):
    head = repo.refs[ref]

    def get_revisions(path):
        from posixpath import dirname
        return Walker(
            store=repo.object_store,
            include=[head],
            paths=[dirname(path)],
            follow=True,
        )

    head_pages_tree = git_storage.get_pages_tree(repo, ref)
    pages = git_storage.find_pages(repo, head_pages_tree)
    pages_data = git_storage.load_pages_with_attachments(repo, pages)

    w = index.writer()
    try:
        for path, page, attachments in pages_data:
            with w.group():
                write_page(repo, w, path, page, attachments)
                revisions = get_revisions(path)
                for revision in revisions:
                    write_revision(repo, w, revision.commit, path)
        w.commit(optimize=True)
    except:
        w.cancel()
        raise
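# writer.group() keeps each page and its revision documents contiguous in the
# index, which Whoosh's nested (parent/child) queries rely on; commit with
# optimize=True additionally merges all segments into one. A minimal sketch:
from whoosh.fields import Schema, ID
from whoosh.filedb.filestore import RamStorage

ix = RamStorage().create_index(Schema(kind=ID(stored=True), path=ID(stored=True)))
w = ix.writer()
with w.group():  # parent document first, then its children
    w.add_document(kind=u'page', path=u'/home')
    w.add_document(kind=u'revision', path=u'/home')
w.commit(optimize=True)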
def after_commit(self, session):
    """
    Any db updates go through here. We check if any of these models have
    ``__searchable__`` fields, indicating they need to be indexed. With these
    we update the whoosh index for the model. If no index exists, it will be
    created here; this could impose a penalty on the initial commit of a
    model.
    """
    if not self.running:
        return

    for typ, values in self.to_update.iteritems():
        model_class = values[0][1].__class__
        index = self.index_for_model_class(model_class)
        with index.writer() as writer:
            primary_field = model_class.search_query.primary
            indexed_fields = model_class.whoosh_schema.names()
            for change_type, model in values:
                # delete everything. stuff that's updated or inserted will get
                # added as a new doc. Could probably replace this with a whoosh
                # update.
                writer.delete_by_term(primary_field, unicode(getattr(model, primary_field)))
                if change_type in ("new", "changed"):
                    attrs = {}
                    for key in indexed_fields:
                        value = getattr(model, key)
                        if hasattr(value, "name"):
                            value = value.name
                        if isinstance(value, (str, int)):
                            value = unicode(value)
                        attrs[key] = value
                    attrs[primary_field] = unicode(getattr(model, primary_field))
                    writer.add_document(**attrs)
    self.to_update = {}
def main(library_path, music_path):
    if os.path.exists(library_path):
        index = whoosh.index.open_dir(library_path)
    else:
        os.makedirs(library_path)
        index = whoosh.index.create_in(library_path, song_schema)

    songs = []
    with index.writer() as writer:
        for song in song_walker(os.path.expanduser(music_path)):
            # We index by the original text, but store parsed values in
            # our own index.
            writer.update_document(**song)
            for key, parser in key_parsers.items():
                if key in song:
                    song[key] = parser(song[key])
                    if song[key] is None:
                        del song[key]
            songs.append(song)
    pickle.dump(songs, open(os.path.join(library_path, 'songs'), 'w'))
def add_doc():
    index = storage.open_index(schema=schema)
    writer = index.writer()
    #parser = QueryParser(index_fieldname, schema=schema)
    #parsed_query = parser.parse('%s:%s' % ('id', qq_id))
    #writer.delete_by_query(query)
    #writer.commit()
    content, names = get_content('qq7')
    doc = {}
    doc['id'] = _from_python(qq_id)
    doc[index_fieldname] = content
    try:
        writer.add_document(**doc)
        writer.commit()
        #writer.update_document(**doc)
    except Exception, e:
        raise
def add_doc():
    index = storage.open_index(schema=schema)
    writer = index.writer()
    #parser = QueryParser(index_fieldname, schema=schema)
    #parsed_query = parser.parse('%s:%s' % ('id', qq_id))
    #writer.delete_by_query(query)
    #writer.commit()
    content, names = get_content('qq7')
    doc = {}
    doc['id'] = _from_python(qq_id)
    doc[index_fieldname] = content
    try:
        writer.add_document(**doc)
        writer.commit()
        #writer.update_document(**doc)
    except Exception, e:
        raise
def update_snippet(snippet_id):
    title = request.form['title']
    content = request.form['content']
    language = request.form['language']
    tag = request.form['tag']
    if not title or not content:
        raise Exception("Empty title or content")
    if not get_snippet_by_id(snippet_id):
        abort(
            make_response(
                '{"message": "The snippet you are trying to update doesn\'t exist"}',
                404))
    writer = index.writer()
    writer.update_document(id=snippet_id, content=content, tag=tag,
                           title=title, language=language)
    writer.commit()
    return '{"success": true, "message": "Snippet updated successfully", "snippet_id": "%s"}' % snippet_id
def update(self, force_rebuild=False):
    """ Adds/updates all items in repo to index. Note: querying will call this automatically."""

    # if we've already updated the index during this script run, we're done!
    if self.index_updated:
        return False

    # we only need to do this once per script lifetime
    self.index_updated = True

    # if the index is not based on the current commit, rebuild from scratch
    if not self.index_based_on_current_commit():
        force_rebuild = True

    if force_rebuild:
        # get a new clean/empty index
        index = self.get_index(force_rebuild)
        # disabled high-performance writer (https://pythonhosted.org/Whoosh/batch.html), causing thread/lock issues
        # index_writer = index.writer(procs=4, multisegment=True)
        index_writer = index.writer()
        # index all documents
        documents = self.document_iterator()
        activity_description = 'Rebuilding'
        # update indexed commit
        self.set_indexed_commit_hash()
    else:
        # use the current index
        index = self.get_index()
        index_writer = index.writer()

        # delete uncommitted files that are in the index already
        for filepath in self.get_indexed_uncommitted_files():
            index_writer.delete_by_term('path', filepath)

        # get the list of uncommitted files and persist it
        uncommitted_files = lib_git.get_uncommitted_oval()
        self.set_indexed_uncommitted_files(uncommitted_files)

        # nothing to update? done!
        if not uncommitted_files:
            index_writer.commit()
            return

        # index only uncommitted files
        documents = self.document_iterator(uncommitted_files)
        activity_description = 'Updating'

    # add all definition files to the index
    counter = 0
    try:
        for document in documents:
            counter = counter + 1
            self.status_spinner(
                counter,
                '{0} {1} index'.format(activity_description, self.index_name),
                self.item_label)
            if 'deleted' in document and document['deleted']:
                try:
                    index_writer.delete_by_term(
                        'oval_id', self.whoosh_escape(document['oval_id']))
                except:
                    self.message(
                        'debug',
                        "Something was marked as needing to be deleted but it wasn't in the index")
                #self.message('debug', 'Deleting from index:\n\t{0} '.format(self.whoosh_escape(document['oval_id'])))
            else:
                index_writer.add_document(**document)
                #self.message('debug', 'Upserting to index:\n\t{0} '.format(document['path']))
    except lib_xml.InvalidXmlError as e:
        # abort: cannot build index
        self.message(
            'ERROR CANNOT BUILD INDEX',
            'Invalid xml fragment\n\tFile: {0}\n\tMessage: {1}'.format(e.path, e.message))
        self.message('ERROR', 'deleting index and aborting execution')
        index_writer.commit()
        self.index.close()
        shutil.rmtree(self.get_index_path())
        sys.exit(1)

    self.status_spinner(
        counter,
        '{0} {1} index'.format(activity_description, self.index_name),
        self.item_label, True)
    index_writer.commit()