def delete_documents(self, doc_set, paths):
    """Delete documents from the index."""
    index = open_dir(self.index_path)
    writer = AsyncWriter(index)
    query = And([
        Term('set', doc_set),
        Or([Term('path', path) for path in paths]),
    ])
    writer.delete_by_query(query)
    writer.commit()
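# All of the snippets in this section share one pattern: obtain an AsyncWriter
# for an open index, make changes, then commit. AsyncWriter tries to take the
# index lock immediately and, if another writer holds it, queues the changes
# and replays them from a background thread once the lock frees, so callers
# are not blocked. A minimal sketch of the pattern (the index path and field
# names here are illustrative, not from any snippet below):
from whoosh.index import open_dir
from whoosh.writing import AsyncWriter

ix = open_dir("indexdir")
writer = AsyncWriter(ix)
writer.add_document(title=u"Hello", content=u"Hello, world.")
writer.commit()  # returns immediately if the lock was unavailable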
def add():
    d = request.get_json(force=True)
    url = d.get("url")
    content = d.get("content")
    if not url or not content:
        return jsonify({"status": "missing parameters"})
    if urlparse.urlparse(url).netloc.startswith("localhost"):
        return jsonify({"status": "ignored"})
    ix = get_index()
    writer = AsyncWriter(ix)
    soup = BeautifulSoup(content)
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    writer.update_document(title=d.get("title", "Untitled"),
                           url=url,
                           content=text,
                           modified=datetime.datetime.now())
    writer.commit()
    return jsonify({"status": "ok"})
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        try:
            writer.update_document(**doc)
        except Exception as e:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(u"%s while preparing object for update" % e,
                           exc_info=True,
                           extra={"data": {"index": index,
                                           "object": get_identifier(obj)}})
def store_page(user, url):
    writer = AsyncWriter(idx)
    resp = requests.get(url)
    content = parse(resp.content)
    now = datetime.now()
    writer.add_document(ts=now, user=unicode(user), url=unicode(url), content=content)
    writer.commit()
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        writer.update_document(**doc)

    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()

        # If spelling support is desired, add to the dictionary.
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            sp = SpellChecker(self.storage)
            sp.add_field(self.index, self.content_field_name)
def incremental_index(t, l, c, dirname):
    id = (Searcher().getcount() + 1)
    ix = index.open_dir(dirname)
    # The set of all paths in the index
    # with ix.searcher() as searcher:
    indexed_feeds = set()
    with ix.searcher() as searcher:
        writer = AsyncWriter(ix)
        # Loop over the stored fields in the index
        for fields in searcher.all_stored_fields():
            indexed_feed = fields['title']
            indexed_feeds.add(indexed_feed)
    # Loop over the files in the filesystem
    # Assume we have a function that gathers the filenames of the
    # documents to be indexed
    if t not in indexed_feeds:
        # This is either a file that's changed, or a new file
        # that wasn't indexed before. So index it!
        wooshDocuments(id, writer, t, l, c)
    writer.commit()
    return id
def delPage(self, item):
    index = item.childCount()
    while index > 0:
        index = index - 1
        self.dirname = item.child(index).text(0)
        self.delPage(item.child(index))

    # remove attachment folder
    attDir = self.itemToAttachmentDir(item)
    for info in QtCore.QDir(attDir).entryInfoList():
        QtCore.QDir().remove(info.absoluteFilePath())
    QtCore.QDir().rmdir(attDir)

    pagePath = self.itemToPage(item)
    self.ix = open_dir(self.settings.indexdir)
    query = QueryParser("path", self.ix.schema).parse(pagePath)
    # writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    n = writer.delete_by_query(query)
    # n = writer.delete_by_term('path', pagePath)
    writer.commit()
    # self.ix.close()
    b = QtCore.QDir(self.notePath).remove(self.pageToFile(pagePath))
    parent = item.parent()
    if parent is not None:
        parentPage = self.itemToPage(parent)
        index = parent.indexOfChild(item)
        parent.takeChild(index)
        if parent.childCount() == 0:  # if no child, dir not needed
            QtCore.QDir(self.notePath).rmdir(parentPage)
    else:
        index = self.indexOfTopLevelItem(item)
        self.takeTopLevelItem(index)
    QtCore.QDir(self.notePath).rmdir(pagePath)
def update(self, index, document, **options):
    index = base._resolve_index(index)
    ix = self._storage.open_index(indexname=index.get_name())
    writer = AsyncWriter(ix)
    adapted_document = index.adapt_document(document)
    writer.update_document(**adapted_document)
    writer.commit()
def update_bulk(self, index, documents):
    index = base._resolve_index(index)
    ix = self._storage.open_index(indexname=index.get_name())
    writer = AsyncWriter(ix)
    adapted_documents = (index.adapt_document(doc) for doc in documents)
    for doc in adapted_documents:
        writer.update_document(**doc)
    writer.commit()
def addLink(self, url, title, summary, txt):
    # Repeat the title (x10) and summary (x2) so they dominate term frequency
    # in the indexed text.
    title10 = (title + " ") * 10
    sumario2 = (summary + " ") * 2
    text = title10 + sumario2 + " " + txt
    ix = open_dir(self.indexDir, indexname='MAIN', readonly=False)
    writer = AsyncWriter(ix)
    writer.add_document(id=url, content=unicode(text))
    writer.commit()
    ix.close()
def whoosh_task(ids, pool_number, ix, model_class):
    session = sqla['session']
    writer = AsyncWriter(ix)
    for id_ in ids:
        obj = session.query(model_class).filter_by(id=id_).one()
        if obj.title is None or obj.summary is None:
            continue
        writer.add_document(
            title=obj.title,
            summary=obj.summary
        )
    writer.commit()
def whoosh_index(self):
    it = QTreeWidgetItemIterator(
        self.notesTree, QTreeWidgetItemIterator.All)
    print("Starting complete indexing.")
    # writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    while it.value():
        treeItem = it.value()
        name = self.notesTree.itemToPage(treeItem)
        path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
        print(path)
        fileobj = open(path, 'r', encoding='utf-8')
        content = fileobj.read()
        fileobj.close()
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=name, title=parseTitle(content, name),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
        else:
            writer.add_document(path=name, title=parseTitle(content, name),
                                content=content, tags='')
        it += 1
    writer.commit()
    print("Finished completely reindexing.")
def index_documents(self, documents):
    """Add or update documents in the index."""
    index = open_dir(self.index_path)
    writer = AsyncWriter(index)
    needs_commit = False
    for document in documents:
        needs_commit = True
        writer.update_document(
            uid=':'.join((document['set'], document['path'])),
            path=document['path'],
            set=document['set'],
            hash=document['hash'],
            title=document['title'],
            content=document['content'],
            kind=document['kind'],
        )
    if needs_commit:
        writer.commit()
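# Note on the snippet above: update_document() only replaces an existing entry
# if the schema marks at least one of the supplied fields unique=True; with no
# unique field it degenerates to add_document(). A sketch of the schema this
# snippet appears to assume (field types are guesses, not from the original
# source):
from whoosh.fields import ID, TEXT, Schema

schema = Schema(
    uid=ID(unique=True, stored=True),  # "<set>:<path>", the de-duplication key
    path=ID(stored=True),
    set=ID(stored=True),
    hash=ID(stored=True),
    title=TEXT(stored=True),
    content=TEXT,
    kind=ID(stored=True),
)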
def clear(self):
    """Remove all content from indexes, and unregister all classes.

    After clear() the service is stopped. It must be started again to
    create new indexes and register classes.
    """
    logger.info("Resetting indexes")
    state = self.app_state

    for _name, idx in state.indexes.items():
        writer = AsyncWriter(idx)
        writer.commit(merge=True, optimize=True, mergetype=CLEAR)

    state.indexes.clear()
    state.indexed_classes.clear()
    state.indexed_fqcn.clear()
    self.clear_update_queue()

    if self.running:
        self.stop()
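# The mergetype=CLEAR argument used above comes from whoosh.writing: instead of
# merging the existing segments into the commit, it drops them all, so the
# commit leaves an empty index. Minimal sketch, assuming `ix` is an open index:
from whoosh.writing import CLEAR, AsyncWriter

writer = AsyncWriter(ix)
writer.commit(mergetype=CLEAR)  # discard every existing segment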
def createIndex(self): print " Whoosh Loading from SQL " created = self.createIndexDirIfNotExist() if not created: #already exists return conn = sqlite3.connect(self.dbName) c = conn.cursor() c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''') feeds = c.fetchall() conn.close() linkN = 1 schema = Schema(id = TEXT(stored = True), content=TEXT) ix = create_in(self.indexDir, schema, indexname='MAIN') writer = AsyncWriter(ix) for feed in feeds: # Descartar links sem Titulo if( isinstance(feed[3], type(None))): #print "is Null" continue index = feed[0] # print " Whoosh Loaded Titulo " + str(linkN) + ":" + feed[3] linkN += 1 titolo = feed[3] + " " titolo10 = titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo sumario = feed[4] + " " sumario2 = sumario + sumario text = titolo10 + sumario2 + " " +feed[5] writer.add_document(id=index, content=unicode(text)) writer.commit() ix.close() print " Done Loading from SQL"
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        try:
            doc = index.full_prepare(obj)
        except SkipDocument:
            self.log.debug(u"Indexing for object `%s` skipped", obj)
        else:
            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                               exc_info=True,
                               extra={"data": {"index": index,
                                               "object": get_identifier(obj)}})

    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()
def newPageCore(self, item, newPageName):
    pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, '/')
    if not newPageName:
        dialog = LineEditDialog(pagePath, self)
        if dialog.exec_():
            newPageName = dialog.editor.text()
    if newPageName:
        if hasattr(item, 'text'):
            pagePath = os.path.join(self.notePath, pagePath + '/').replace(os.sep, '/')
        if not QDir(pagePath).exists():
            QDir(self.notePath).mkdir(pagePath)
        fileName = pagePath + newPageName + self.settings.fileExt
        fh = QFile(fileName)
        fh.open(QIODevice.WriteOnly)
        savestream = QTextStream(fh)
        savestream << '# ' + newPageName + '\n'
        savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
        fh.close()
        QTreeWidgetItem(item, [newPageName])
        newItem = self.pageToItem(pagePath + newPageName)
        self.sortItems(0, Qt.AscendingOrder)
        self.setCurrentItem(newItem)
        if hasattr(item, 'text'):
            self.expandItem(item)

        # create attachment folder if not exist
        attDir = self.itemToAttachmentDir(newItem)
        if not QDir(attDir).exists():
            QDir().mkpath(attDir)

        # TODO improvement needed, can be reused somehow
        fileobj = open(fileName, 'r')
        content = fileobj.read()
        fileobj.close()

        self.ix = open_dir(self.settings.indexdir)
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        writer.add_document(path=pagePath + newPageName, content=content)
        writer.commit()
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        try:
            writer.update_document(**doc)
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to add documents to Whoosh: %s", e)
def updateIndex(self):
    '''Update the whoosh index, which costs a lot of computing resources.'''
    page = self.parent.notesTree.currentPage()
    content = self.toPlainText()

    try:
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=page, title=parseTitle(content, page),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
        else:
            writer.update_document(
                path=page, title=parseTitle(content, page),
                content=content, tags='')
        writer.commit()
    except Exception:
        print("Whoosh commit failed.")
def handle_document(document_id):
    document = Document.objects.get(id=document_id)

    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    parser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, mime_type)

        if parser.get_archive_path():
            with transaction.atomic():
                with open(parser.get_archive_path(), 'rb') as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                # i'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # we also don't use save() since that triggers the filehandling
                # logic, and we don't want that yet (file not yet in place)
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text()
                )
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(parser.get_archive_path(), document.archive_path)

        with AsyncWriter(index.open_index()) as writer:
            index.update_document(writer, document)
    except Exception as e:
        logger.error(f"Error while parsing document {document}: {str(e)}")
    finally:
        parser.cleanup()
class SearchPipeline(object):
    cleanup = False

    def open_spider(self, spider):
        """ When opening spider, open or create index. """
        index_dir = os.path.expanduser('~/.sitesearcher/index')
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)
        self.indexname = spider.allowed_domains[0]
        if index.exists_in(index_dir, indexname=self.indexname):
            self.index = index.open_dir(index_dir, indexname=self.indexname)
        else:
            self.index = index.create_in(
                index_dir,
                indexname=self.indexname,
                schema=schema,
            )
        self.writer = AsyncWriter(self.index)

    def process_item(self, item, spider):
        """ Add crawled item to index.

        Add items using ``update_document`` to delete any previously
        indexed versions and avoid duplicates.
        """
        self.writer.update_document(
            url=item.get('url'),
            content=item.get('content'))
        return item

    def close_spider(self, spider):
        """ Close index writer on closing of spider and clean up.

        On closing, delete any previously indexed items that have not been
        updated in this crawl, as these are obviously no longer reachable sites.
        """
        with self.index.searcher() as searcher:
            for page in searcher.all_stored_fields():
                if page['url'] not in spider.state['update_list']:
                    self.writer.delete_by_term('url', page['url'])
        self.writer.commit()
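# For context: a Scrapy pipeline like the one above only runs if the project
# enables it in settings.py. The module path below is illustrative, not from
# the original source:
ITEM_PIPELINES = {
    "sitesearcher.pipelines.SearchPipeline": 300,
}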
def creating_searching_ranking(selected_analyzer, name_of_file, scoring_function, path):
    """
    Method that creates a schema and stores an index file based on the
    retrieved 'csv_test.csv' file.

    input:
        selected_analyzer - selected text analyzer from the whoosh library
        name_of_file - name of the .csv file stored from dataframe variable 'files_text'
        scoring_function - selected scoring function from the whoosh library
        path - path where index files are stored
    """
    # creating a Schema with fields id, title and content
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=selected_analyzer),
                    content=TEXT(stored=False, analyzer=selected_analyzer))

    directory_containing_the_index = path
    # creating the index based on the schema in the directory at 'path'
    ix = create_in(directory_containing_the_index, schema)
    # opening the index file
    ix = index.open_dir(directory_containing_the_index)
    # the writer will be used to add content to the fields
    writer = AsyncWriter(ix)

    ALL_DOCUMENTS_file_name = name_of_file  # path to the file
    in_file = open(ALL_DOCUMENTS_file_name, "r", encoding='latin1')
    csv_reader = csv.reader(in_file, delimiter=',')  # reading the file
    csv_reader.__next__()  # skip the header: the first line contains the name of each field

    # num_added_records_so_far = 0
    for record in csv_reader:  # for each row in the 'csv_test' file
        id = record[1]       # read id
        title = record[2]    # read title
        content = record[3]  # read body
        writer.add_document(id=id, content=title + ' ' + content)
        # num_added_records_so_far += 1
        # if num_added_records_so_far % 1000 == 0:
        #     print(" num_added_records_so_far= " + str(num_added_records_so_far))

    writer.commit()  # finish writing to the index file
    in_file.close()
def delete(self, name, purge=True):
    """
    Delete a document by its name. The name is actually a hash.
    If purge is True, the file is also removed from the boxes.
    """
    # Grab a writer on the index
    writer = AsyncWriter(self.index)

    # Delete and commit from the index
    writer.delete_by_term(u'hash', name)
    writer.commit()

    # Delete the document from the boxes if we want to purge them
    if not purge:
        return

    # We can only remove the doc if the box is writable
    for box in self.boxes:
        if box.haskey(name) and not box.readonly:
            del box[name]
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        try:
            doc = index.full_prepare(obj)
        except SkipDocument:
            self.log.debug(u"Indexing for object '%s' skipped", obj)
        else:
            for key in doc:
                doc[key] = self._from_python(doc[key])

            if 'boost' in doc:
                del doc['boost']

            try:
                write.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                               exc_info=True,
                               extra={'data': {'index': index,
                                               'object': get_identifier(obj)}})

    if len(iterable) > 0:
        write.commit()
def add_items(self, model, objs):
    for obj in objs:
        obj._body_ = self.prepare_body(obj)

    self._delete_parent_model_data(model, objs)
    index = self.backend.index.refresh()
    writer = AsyncWriter(index)

    for obj in objs:
        doc = {
            ID: get_identifier(obj),
            DJANGO_CT: get_model_ct(obj),
            DJANGO_ID: force_text(obj.pk),
            'text': force_text(obj._body_),
        }

        try:
            writer.update_document(**doc)
        except Exception as e:
            raise e

    if len(objs) > 0:
        writer.commit()
custom_stops = ['rt', 'ht', 'mt', '@', '#', '!', ':', ';', ',', '.', "'s", "?",
                "\\n", 'http', 'https', "n't", "&", "\\", '...', '-', '"']
stops = list(set(default_stops + custom_stops))

# Set up schema fields
my_schema = Schema(id=ID(unique=True, stored=True),
                   text=TEXT(stored=True),
                   contains_retweet=BOOLEAN(stored=True),
                   screen_name=TEXT(stored=True),
                   keyword=KEYWORD(stored=True),
                   created=DATETIME(stored=True))

# Create index and AsyncWriter object
index = create_in("tweetindex", my_schema)
writer = AsyncWriter(index)

if __name__ == '__main__':
    # Load raw data
    with open("WC2015_headers.csv", 'rb') as to_load:
        data = csv.DictReader(to_load)
        for row in data:
            # Extract required information from date to create python datetime object
            date = row['created_at'][:19] + ' ' + row['created_at'][-4:]
            # Clean text and parse into keywords
            text = row['text'].replace('\\', '')
            keywords = [word for word in word_tokenize(text) if word not in stops]
            # Check for Retweets
            rt = False
class Index(object):
    def __init__(self, directory, persist):
        self.log = logging.getLogger("ftpvista.index")

        self._persist = persist
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s" % directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s" % directory)
            self._idx = index.open_dir(directory)

        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        # self._writer = BufferedWriter(self._idx, 120, 4000)
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted" % server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepares to incrementally update the documents for the given server.

        server_id -- Id of the server to update.
        current_files -- a list of (path, size, mtime) tuples for each file
                         currently on the server.

        Deletes all the outdated files from the index and returns a list of
        files needing to be reindexed.
        """

        def delete_doc(writer, serverid, path):
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {}
        for path, size, mtime in current_files:
            to_index[path] = (size, mtime)

        results = self._searcher.documents(server_id=str(server_id))
        if results:
            for fields in results:
                indexed_path = fields["path"]

                if indexed_path not in to_index:
                    # This file was deleted from the server since it was indexed
                    delete_doc(self._writer, server_id, indexed_path)
                    self.log.debug("%s has been removed" % indexed_path)
                else:
                    size, mtime = to_index[indexed_path]
                    try:
                        if mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S"):
                            # This file has been modified since it was indexed
                            delete_doc(self._writer, server_id, indexed_path)
                        else:
                            # up to date, no need to reindex
                            del to_index[indexed_path]
                    except ValueError:
                        delete_doc(self._writer, server_id, indexed_path)

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(self, server_id, name, path, size, mtime,
                     audio_album=None, audio_artist=None, audio_title=None, audio_year=None):
        """Add a document with the specified fields in the index.
        Changes need to be committed.
        """
        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose
        _, ext = os.path.splitext(name)
        ext = ext.lstrip(".")
        kwargs = {
            "server_id": server_id,
            "name": name,
            "ext": ext,
            "path": path,
            "size": size,
            "mtime": mtime,
            "has_id": "a",
        }

        # Add the optional args
        if audio_album is not None:
            kwargs["audio_album"] = audio_album
        if audio_artist is not None:
            kwargs["audio_artist"] = audio_artist
        if audio_title is not None:
            kwargs["audio_title"] = audio_title
        if audio_year is not None:
            kwargs["audio_year"] = audio_year

        try:
            self._writer.add_document(**kwargs)
        except IndexingError:
            self.open_writer()
            self._writer.add_document(**kwargs)

    def commit(self, optimize=False):
        """Commit the changes in the index and optionally optimize it."""
        self.log.info(" -- Begin of Commit -- ")
        try:
            self._writer.commit(optimize=optimize)
        except IndexingError:
            self.open_writer()
            self._writer.commit(optimize=optimize)
        self.log.info("Index commited")
        self._searcher = self._idx.searcher()
        self.log.info(" -- End of Commit -- ")

    def close(self):
        """Close the index."""
        self.log.info(" -- Closing writer and index -- ")
        # self._writer.close()
        self._idx.close()
def add(self, note):
    writer = AsyncWriter(self.index)
    writer.add_document(note_id=note.id, notebook_id=note.notebook_id,
                        title=note.title, snippet=note.snippet)
    writer.commit()
def index_mail(self, mail):
    with AsyncWriter(self._index) as writer:
        self._index_mail(writer, mail)
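# Used as a context manager, as above, a whoosh IndexWriter (AsyncWriter
# included) commits automatically on normal exit and cancels the pending
# changes if the block raises. Equivalent try/except sketch, assuming `ix`
# is an open index:
writer = AsyncWriter(ix)
try:
    writer.add_document(content=u"...")
except Exception:
    writer.cancel()
    raise
else:
    writer.commit()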
def add_to_index(self, item_id, text):
    from whoosh.writing import AsyncWriter
    writer = AsyncWriter(self.ix)
    writer.update_document(id=item_id, text=text.lower())
    writer.commit()
def remove(self, item_id):
    from whoosh.writing import AsyncWriter
    writer = AsyncWriter(self.ix)
    writer.delete_by_term('id', item_id)
    writer.commit()
def get_writer(self):
    return AsyncWriter(self.index)
def insert_docs(docs):
    ix = open_dir(whoosh_index)
    writer = AsyncWriter(ix)
    for doc in docs:
        writer.add_document(**doc)
    writer.commit()
                        'html': html,
                        'url': entry.link,
                        'tags': get_entry_tags(entry),
                        'when': when})
    if not len(entries):
        return

    log.debug("%s - %d entries in %fs" % (netloc, len(entries), time.time() - now))

    now = time.time()
    records = 0
    now = time.time()
    ix = open_dir(settings.index)
    writer = AsyncWriter(ix)
    for entry in entries:
        try:
            item = Item.get(guid=entry['guid'])
        except Item.DoesNotExist:
            item = Item.create(**entry)
            records += 1
        if len(entry['html']):
            soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
            plaintext = ''.join(soup.find_all(text=True))
            writer.add_document(
                id=item.id,
                guid=unicode(item.guid),
                title=entry['title'],
def load_all_dset_metadata(self, dsetname, create_index=False):
    """ Loads into memory the metadata of a dataset. The metadata is read from a CSV
        file, which should have at least two columns:
        - filename: Paths to the images in the dataset, relative to the image data
          folder. For backward compatibility '#filename' is also accepted.
        - file_attributes: JSON string containing information about the file. The most
          important file attributes are 'caption' and 'keywords'. The 'caption' field
          should be a short string which will be used as the caption of the image in
          result lists. The 'keywords' field must contain a comma-separated list of
          keywords. Each keyword can be used as the source for a search.
        If create_index is True, it builds a search index with the 'keywords' in the
        file_attributes.
        Arguments:
            dsetname: String corresponding to the dataset within the list of supported
                      datasets.
            create_index: Boolean indicating whether or not to build a search index
                          with the metadata.
    """
    metaindex = None
    t = time.time()
    try:
        for afile in os.listdir(os.path.join(self.metadata_dir, dsetname)):
            if afile.endswith(".csv"):
                metadata_file = os.path.join(self.metadata_dir, dsetname, afile)
                print('Found metadata file at', metadata_file)
                if create_index:
                    metaindex = open_dir(self.index_dir)
                with open(metadata_file, 'r') as fin:
                    reader = csv.DictReader(fin)
                    for row in reader:
                        id_field = None
                        if 'filename' in row.keys():
                            id_field = 'filename'
                        elif '#filename' in row.keys():
                            id_field = '#filename'
                        if id_field and 'file_attributes' in row.keys():
                            filename = row[id_field]
                            try:
                                self.fname2meta[dsetname][filename] = json.loads(row['file_attributes'])
                            except:
                                self.fname2meta[dsetname][filename] = None
                            metadata = self.fname2meta[dsetname][filename]
                            keyword_list = None
                            if metadata and 'keywords' in metadata.keys():
                                keyword_list = metadata['keywords']
                            if keyword_list and create_index:
                                keyword_list_splitted = keyword_list.split(',')
                                writer = AsyncWriter(metaindex)
                                for key in keyword_list_splitted:
                                    key = key.strip()
                                    # delete previous entry if found
                                    query = QueryParser('key', metaindex.schema).parse(key)
                                    writer.delete_by_query(query, metaindex.searcher())
                                    # add document
                                    writer.add_document(key=str(key), dataset=str(dsetname))
                                writer.commit()
                            if keyword_list:
                                # we would like to do this, even if the index is not created
                                # register link keyword-file
                                keyword_list_splitted = keyword_list.split(',')
                                for key in keyword_list_splitted:
                                    key = key.strip()
                                    if key in self.keyword2fname[dsetname].keys():
                                        self.keyword2fname[dsetname][key].append(filename)
                                    else:
                                        self.keyword2fname[dsetname][key] = [filename]
                        else:
                            raise Exception(
                                '"filename" and/or "file_attributes" columns not found in '
                                + afile + ' (are you missing the column names?). '
                                'Metadata will not be available!.')
                print('Finished loading metadata for %s in %s' % (dsetname, str(time.time() - t)))
                self.is_all_metadata_loaded = True
                break
    except Exception as e:
        print("load_all_dset_metadata Exception:" + str(e) + '\n')
def index_mail(self, mail):
    if mail is not None:
        with AsyncWriter(self._index) as writer:
            self._index_mail(writer, mail)
def remove(self, instance: Model):
    """Remove an entry from the index. Non-blocking.

    :param instance: instance of ``self.model`` to be removed from the index
    """
    with AsyncWriter(self.index) as writer:
        writer.delete_by_term(self.pk_name, getattr(instance, self.pk_name))
    i = 0
    line = docs.readline()
    pbar = tqdm(total=3_213_835)
    while line != "":
        _, url, _, _ = line.split("\t")
        writer.update_document(url_text=url.replace(".", " "))
        line = docs.readline()
        i += 1
        pbar.update(1)
    writer.commit()
    exit(0)

ix.writer().commit(mergetype=writing.CLEAR)
print(f"Loading documents from {args.data}")
writers = [AsyncWriter(ix) for _ in range(args.threads)]
with open(args.data, "r", encoding="utf-8") as docs:
    i = 0
    line = docs.readline()
    pbar = tqdm(total=args.num_docs if args.num_docs is not None else 3_213_835)
    while line != "" and (args.num_docs is None or i < args.num_docs):
        docid, url, title, body = line.split("\t")
        writers[i % args.threads].add_document(docid=docid, url=url, title=title, body=body)
        line = docs.readline()
        i += 1
        pbar.update(1)
    pbar.set_description("Committing...")
def add_to_fts(cls, content, title=None, id=None, source_hash=None, tags=None):
    ix = open_dir(LOCAL_FTS_INDEX)
    writer = AsyncWriter(ix)
    writer.add_document(content=content, title=title, id=id,
                        source_hash=source_hash, tags=tags)
    writer.commit()
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted
    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = "{}:{}".format(cls_name, pk)
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more infos in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproductible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except BaseException:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
def insert(self, link, title, document):
    writer = AsyncWriter(self.ix)
    writer.add_document(link=link, title=title, document=document + title)
    writer.commit()
def get_writer():
    global WIX
    writer = AsyncWriter(WIX)
    return writer
    t2 = time()
    print 'Years', '-'.join(years)
    print 'Restarting database...'
    # db = create_engine(connection_string, pool_size=proc_num)
    engine = db.connect()
    metadata = MetaData(engine)
    inspector = inspect(engine)
    # os.system('mongod -f /etc/mongodb.conf --shutdown')
    # os.system('mongod -f /etc/mongodb.conf &')
    print 'Done!'
    print
    # client = MongoClient()
    # db = client['pubmed']
    writer = AsyncWriter(index)
    for year in years:
        # collection = db[year]
        # total = collection.count()
        table_name = 'pubmed_sent_' + year
        table = Table(table_name, metadata, autoload=True)
        statement = table.count()
        total = engine.execute(statement).fetchone()[0]
        print 'Doc count', str(total)
        print
        t = time()
        num = 0
        # for post in collection.find():
        statement = table.select()
def index_optimize():
    ix = index.open_index()
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)
def optimize(self):
    writer = AsyncWriter(self.index)
    writer.commit(optimize=True)
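# commit(optimize=True), as used in the two snippets above, merges all index
# segments into one during the commit. With no pending changes it is a pure
# maintenance pass: searches get faster at the cost of rewriting the segment
# files. Sketch, assuming `ix` is an open index that has received many writes:
from whoosh.writing import AsyncWriter

AsyncWriter(ix).commit(optimize=True)  # compact the whole index in one pass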
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi, **kwargs):
    """
    Build two search indexes simultaneously: one for repositories and the
    other for tools.

    Returns a tuple with the number of repos and tools that were indexed.
    """
    model = ts_mapping.init(file_path, dburi, engine_options={}, create_tables=False)
    sa_session = model.context.current
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir, **kwargs):
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get('full_last_updated'):
                    # We're done, since we sorted repos by update time
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)

            repo_index_writer.add_document(**repo)

            # Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1

            repos_indexed += 1

    tool_index_writer.commit()
    repo_index_writer.commit()

    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed
def remove_from_index(self, mail_id):
    with AsyncWriter(self._index) as writer:
        writer.delete_by_term('ident', mail_id)
def get_writer(indexname=None, schema=None):
    return AsyncWriter(Index.get_index(indexname=indexname, schema=schema))
def search(self, query_string, notebook_id=None):
    with AsyncWriter(self.index).searcher() as searcher:
        query_parser = MultifieldParser(["title", "snippet"],
                                        schema=self.index.schema).parse(query_string)
        notebook_filter = query.Term("notebook_id", notebook_id) if notebook_id else None
        results = searcher.search(query_parser, filter=notebook_filter, limit=None)
        return [res['note_id'] for res in results]
def get_writer(ix):
    writer = AsyncWriter(ix)
    # writer = ix.writer()
    return writer
def open_writer(self):
    # self._writer = BufferedWriter(self._idx, 120, 4000)
    self._writer = AsyncWriter(self._idx)
def get_writer(self, parent_path):
    logger.debug(f"Getting index writer for path:{parent_path}")
    return WhooshPathIndexer(AsyncWriter(self.ix), parent_path)
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted
    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more infos in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproductible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
def newPageCore(self, item, newPageName, useTemplate=False, templateTitle=None, templateBody=None):
    pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, "/")
    if not newPageName:
        if useTemplate:
            dialog = mikitemplate.PickTemplateDialog(pagePath, self.settings, parent=self)
            if dialog.exec_():
                curTitleIdx = dialog.titleTemplates.currentIndex()
                curBodyIdx = dialog.bodyTemplates.currentIndex()
                dtnow = datetime.datetime.now()
                if curTitleIdx > -1:
                    titleItem = dialog.titleTemplates.model().item(curTitleIdx)
                    titleItemContent = titleItem.data(TTPL_COL_DATA)
                    titleItemType = titleItem.data(TTPL_COL_EXTRA_DATA)
                    titleParameter = dialog.titleTemplateParameter.text()
                    newPageName = mikitemplate.makeTemplateTitle(
                        titleItemType, titleItemContent,
                        dtnow=dtnow, userinput=titleParameter)
                if curBodyIdx > -1:
                    bodyItemIdx = dialog.bodyTemplates.rootModelIndex().child(curBodyIdx, 0)
                    bodyFPath = dialog.bodyTemplates.model().filePath(bodyItemIdx)
                else:
                    bodyFPath = None
        else:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()

    prevparitem = None

    if newPageName:
        if hasattr(item, "text"):
            pagePath = os.path.join(self.notePath, pagePath + "/").replace(os.sep, "/")
        if not QtCore.QDir(pagePath).exists():
            QtCore.QDir(self.notePath).mkdir(pagePath)

        if not QtCore.QDir(os.path.dirname(newPageName)).exists():
            curdirname = os.path.dirname(newPageName)
            needed_parents = []
            while curdirname != "":
                needed_parents.append(curdirname)
                curdirname = os.path.dirname(curdirname)

            # create the needed hierarchy in reverse order
            for i, needed_parent in enumerate(needed_parents[::-1]):
                paritem = self.pageToItem(needed_parent)
                if paritem is None:
                    if i == 0:
                        self.newPageCore(item, os.path.basename(needed_parent))
                    else:
                        self.newPageCore(prevparitem, os.path.basename(needed_parent))
                    QtCore.QDir(pagePath).mkdir(needed_parent)
                elif not QtCore.QDir(os.path.join(self.notePath, needed_parent).replace(os.sep, "/")).exists():
                    QtCore.QDir(pagePath).mkdir(needed_parent)
                if paritem is not None:
                    prevparitem = paritem
                else:
                    prevparitem = self.pageToItem(needed_parent)

        fileName = pagePath + newPageName + self.settings.fileExt
        fh = QtCore.QFile(fileName)
        fh.open(QtCore.QIODevice.WriteOnly)
        savestream = QtCore.QTextStream(fh)
        if useTemplate and bodyFPath is not None:
            with open(bodyFPath, "r", encoding="utf-8") as templatef:
                savestream << mikitemplate.makeTemplateBody(
                    os.path.basename(newPageName), dtnow=dtnow,
                    dt_in_body_txt=self.tr("Created {}"),
                    body=templatef.read())
        else:
            savestream << mikitemplate.makeDefaultBody(os.path.basename(newPageName), self.tr("Created {}"))
        fh.close()
        if prevparitem is not None:
            QtWidgets.QTreeWidgetItem(prevparitem, [os.path.basename(newPageName)])
        else:
            QtWidgets.QTreeWidgetItem(item, [os.path.basename(newPageName)])
        newItem = self.pageToItem(pagePath + newPageName)
        self.sortItems(0, Qt.AscendingOrder)
        self.setCurrentItem(newItem)
        if hasattr(item, "text"):
            self.expandItem(item)

        # create attachment folder if not exist
        attDir = self.itemToAttachmentDir(newItem)
        if not QtCore.QDir(attDir).exists():
            QtCore.QDir().mkpath(attDir)

        # TODO improvement needed, can be reused somehow
        with open(fileName, "r") as fileobj:
            content = fileobj.read()

        self.ix = open_dir(self.settings.indexdir)
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        writer.add_document(path=pagePath + newPageName, content=content)
        writer.commit()
        # NOTE: this snippet predates Python 3.7, where `async` became a
        # reserved keyword; it is kept as-is.
        with writer as writer:
            writer.update_document(**doc)
        # update, because store_revision() may give us an existing revid
        doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname, backend_name)
        if async:
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            writer.update_document(**doc)

    def remove_revision(self, revid, async=True):
        """
        Remove a single revision from indexes.
        """
        if async:
            writer = AsyncWriter(self.ix[ALL_REVS])
        else:
            writer = self.ix[ALL_REVS].writer()
        with writer as writer:
            writer.delete_by_term(REVID, revid)
        if async:
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            # find out itemid related to the revid we want to remove:
            with self.ix[LATEST_REVS].searcher() as searcher:
                docnum_remove = searcher.document_number(revid=revid)
                if docnum_remove is not None:
                    itemid = searcher.stored_fields(docnum_remove)[ITEMID]
            if docnum_remove is not None: