def handle(self, *directories, **options):
    """Management-command entry point.

    Parses an optional ``wait_until`` timestamp, then — when
    ``search_index`` is requested and indexing is not globally disabled —
    rebuilds the tag search index, rolling back on failure.

    Fix: converted Python-2-only syntax (``print`` statement,
    ``except Exception, e``) to Python 3, and replaced ``raise e`` with a
    bare ``raise`` so the original traceback is preserved.
    """
    from django.db import transaction  # noqa: F401  (kept: imported here in the original)
    self.style = color_style()
    verbose = options.get('verbose')
    import_picture = options.get('import_picture')

    wait_until = None
    if options.get('wait_until'):
        # Convert the requested wall-clock time into seconds since the epoch.
        wait_until = time.mktime(
            time.strptime(options.get('wait_until'), '%Y-%m-%d %H:%M:%S'))
        if verbose > 0:
            print("Will wait until %s; it's %f seconds from now" % (
                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(wait_until)),
                wait_until - time.time()))

    index = None
    if options.get('search_index') and not settings.NO_SEARCH_INDEX:
        index = Index()
        try:
            index.index_tags()
            index.index.commit()
        except Exception:
            # Undo the partial index update, then propagate the error.
            index.index.rollback()
            raise
def handle(self, *args, **opts):
    """Index the books named on the command line (by id or slug), or all
    books when no args are given; commit after each book and offer an
    interactive retry on failure.

    Fix: converted Python-2-only syntax (``print`` statements,
    ``except Exception, e``) to Python 3 and narrowed the inner bare
    ``except:`` to ``except Exception`` so KeyboardInterrupt still works.
    """
    from catalogue.models import Book
    from search.index import Index
    idx = Index()
    if not opts['just_tags']:
        if args:
            books = []
            for a in args:
                # Args are either numeric ids (--book-id) or slugs.
                if opts['book_id']:
                    books += Book.objects.filter(id=int(a)).all()
                else:
                    books += Book.objects.filter(slug=a).all()
        else:
            books = list(Book.objects.all())

        while books:
            try:
                b = books[0]
                print(b.title)
                idx.index_book(b)
                idx.index.commit()
                # Only drop the book from the queue once it indexed cleanly.
                books.pop(0)
            except Exception as e:
                print("Error occured: %s" % e)
                try:
                    # we might not be able to rollback
                    idx.index.rollback()
                except Exception:
                    pass
                retry = query_yes_no("Retry?")
                if not retry:
                    break
def book_delete(sender, instance, **kwargs):
    """Signal handler fired when a Book is deleted: purge the middleware
    cache and per-language SSI includes, and drop the book from the
    search index (unless indexing is globally disabled)."""
    caches[settings.CACHE_MIDDLEWARE_ALIAS].clear()
    language_codes = [code for (code, _name) in settings.LANGUAGES]
    flush_ssi_includes(
        ['/katalog/%s.json' % code for code in language_codes])
    if settings.NO_SEARCH_INDEX:
        return
    # remove the book from search index, when it is deleted.
    from search.index import Index
    search_index = Index()
    search_index.remove_book(instance)
    search_index.index_tags()
def setUp(self):
    """Start each test from an empty search index, then load the two
    fixture books the tests operate on."""
    WLTestCase.setUp(self)
    self.search = Search()
    search_index = Index()
    # Delete every indexed document (uid wildcard) and persist the wipe.
    search_index.delete_query(self.search.index.query(uid="*"))
    search_index.index.commit()
    self.do_doktora = Book.from_xml_file(
        get_fixture('do-doktora.xml', opds))
    self.do_anusie = Book.from_xml_file(
        get_fixture('fraszka-do-anusie.xml', catalogue))
def handle(self, *directories, **options):
    """Management-command entry point: rebuild the tag search index when
    ``search_index`` is requested and indexing is not globally disabled.

    Fix: converted Python-2 ``except Exception, e`` to Python 3 syntax and
    replaced ``raise e`` with a bare ``raise`` to keep the original
    traceback intact.
    """
    self.style = color_style()
    verbose = options.get('verbose')
    import_picture = options.get('import_picture')

    if options.get('search_index') and not settings.NO_SEARCH_INDEX:
        index = Index()
        try:
            index.index_tags()
            index.index.commit()
        except Exception:
            # Undo the partial index update before propagating.
            index.index.rollback()
            raise
def handle(self, **opts):
    """Index books selected by id/slug args (or all books, ordered by slug),
    optionally restricted to a slug range, then reindex tags.

    Fix: replaced the bare ``except:`` clauses with ``except Exception:``
    so KeyboardInterrupt/SystemExit are no longer swallowed; the inner
    best-effort rollback handler is otherwise unchanged.
    """
    from catalogue.models import Book
    from search.index import Index
    idx = Index()
    if not opts['just_tags']:
        if opts['args']:
            books = []
            for a in opts['args']:
                if opts['book_id']:
                    books += Book.objects.filter(id=int(a)).all()
                else:
                    books += Book.objects.filter(slug=a).all()
        else:
            books = list(Book.objects.order_by('slug'))

        # Range boundaries compare with dashes stripped, matching the
        # normalization applied to each book's slug below.
        start_from = opts.get('start_from')
        stop_after = opts.get('stop_after')
        if start_from:
            start_from = start_from.replace('-', '')
        if stop_after:
            stop_after = stop_after.replace('-', '')

        while books:
            try:
                b = books[0]
                slug = b.slug.replace('-', '')
                if stop_after and slug > stop_after:
                    break
                if not start_from or slug >= start_from:
                    print(b.slug)
                    idx.index_book(b)
                    idx.index.commit()
                # Only dequeue once the book indexed (or was skipped) cleanly.
                books.pop(0)
            except Exception:
                traceback.print_exc()
                try:
                    # we might not be able to rollback
                    idx.index.rollback()
                except Exception:
                    pass
                retry = query_yes_no("Retry?")
                if not retry:
                    break
    print('Reindexing tags.')
    idx.index_tags()
    idx.index.commit()
def setUp(self):
    """Wipe the entire search index, then import the fixture books."""
    WLTestCase.setUp(self)
    search_index = Index()
    # Clear all documents and persist the empty state before each test.
    search_index.index.delete_all()
    search_index.index.commit()
    self.do_doktora = Book.from_xml_file(get_fixture('do-doktora.xml'))
    self.do_anusie = Book.from_xml_file(
        get_fixture('fraszka-do-anusie.xml', catalogue))
def search_index(self, book_info=None, index=None, index_tags=True, commit=True):
    """Add this book to the search index.

    Args:
        book_info: optional pre-parsed book metadata passed to the indexer.
        index: an existing Index to reuse; a fresh one is created when None.
        index_tags: also reindex tags after the book.
        commit: commit the index transaction when done.

    Raises: re-raises any indexing error after rolling the index back.

    Fix: converted Python-2 ``except Exception, e`` to Python 3 syntax and
    replaced ``raise e`` with a bare ``raise`` to preserve the traceback.
    """
    if index is None:
        from search.index import Index
        index = Index()
    try:
        index.index_book(self, book_info)
        if index_tags:
            index.index_tags()
        if commit:
            index.index.commit()
    except Exception:
        index.index.rollback()
        raise
import os

import requests
from download import download_wikipedia_abstracts
from load import load_documents
from search.timing import timing
from search.index import Index


@timing
def index_documents(documents, index):
    """Feed every document into *index*, printing progress every 5000 docs.

    Returns the same index object for convenient chaining.
    """
    for i, document in enumerate(documents):
        index.index_document(document)
        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')
    return index


if __name__ == '__main__':
    # this will only download the xml dump if you don't have a copy already;
    # just delete the file if you want a fresh copy
    # Fix: `os` was used here but never imported (NameError at runtime).
    if not os.path.exists('data/enwiki.latest-abstract.xml.gz'):
        download_wikipedia_abstracts()

    index = index_documents(load_documents(), Index())
    print(f'Index contains {len(index.documents)} documents')

    index.search('London Beer Flood', search_type='AND')
    index.search('London Beer Flood', search_type='OR')
    index.search('London Beer Flood', search_type='AND', rank=True)
    index.search('London Beer Flood', search_type='OR', rank=True)
app.add_middleware(
    CORSMiddleware,
    # Fix: the original had a stray '(' before eval(), which unbalanced the
    # parentheses and made this call a SyntaxError.
    # NOTE(review): eval() on config values executes arbitrary code if the
    # config file is not fully trusted -- consider ast.literal_eval instead.
    allow_origins=eval(cfg["middleware"]["ALLOWED_HOSTS"]),
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# search interface
@app.get("/search/")
async def search(
    q: str = Query(None, max_length=280),
    page: Optional[int] = Query(
        None,
        ge=eval(cfg["search"]["pg_range"])["ge"],
        le=eval(cfg["search"]["pg_range"])["le"],
    ),
):
    """Return search results for query *q*, paginated by *page* within the
    configured page range."""
    return Search()._query(q, page)


# Re-queries and populates database at scheduled time
# Use cron expression to set refresh rate
@aiocron.crontab(cfg["CRAWLER"]["refresh_rate"])
async def background_process():
    """Scheduled job: re-crawl and rebuild the index."""
    start_crawl = Crawler()
    index = Index()._create()
def update_index(sender, instance, **kwargs):
    """Signal handler keeping the tag search index in sync with *instance*.

    When the signal carries no 'created' flag (e.g. a delete signal rather
    than a save), only removal from the index is performed.
    """
    from search.index import Index
    search_index = Index()
    remove_only = 'created' not in kwargs
    search_index.index_tags(instance, remove_only=remove_only)
def handle(self, **options):
    """Import book/picture XML files from the given directories.

    Optionally rebuilds the tag search index first (``search_index``
    option), retries files whose children are missing, and prints a
    summary of imported/skipped files.

    Fixes: ``if not ext == '.xml'`` replaced by the idiomatic
    ``ext != '.xml'``; ``raise e`` replaced by a bare ``raise`` to keep
    the original traceback.
    """
    self.style = color_style()
    verbose = options.get('verbose')
    import_picture = options.get('import_picture')

    if options.get('search_index') and not settings.NO_SEARCH_INDEX:
        index = Index()
        try:
            index.index_tags()
            index.index.commit()
        except Exception:
            index.index.rollback()
            raise

    files_imported = 0
    files_skipped = 0

    for dir_name in options['directory']:
        if not os.path.isdir(dir_name):
            print(self.style.ERROR("%s: Not a directory. Skipping." % dir_name))
        else:
            # files queue
            files = sorted(os.listdir(dir_name))
            postponed = {}
            while files:
                file_name = files.pop(0)
                file_path = os.path.join(dir_name, file_name)
                file_base, ext = os.path.splitext(file_path)

                # Skip files that are not XML files
                if ext != '.xml':
                    continue

                if verbose > 0:
                    print("Parsing '%s'" % file_path)
                else:
                    sys.stdout.write('.')
                    sys.stdout.flush()

                # Import book files
                try:
                    if import_picture:
                        self.import_picture(file_path, options)
                    else:
                        self.import_book(file_path, options)
                    files_imported += 1
                except (Book.AlreadyExists, Picture.AlreadyExists):
                    print(self.style.ERROR(
                        '%s: Book or Picture already imported. Skipping. To overwrite use --force.'
                        % file_path))
                    files_skipped += 1
                except Book.DoesNotExist:
                    if file_name not in postponed or postponed[file_name] < files_imported:
                        # push it back into the queue, maybe the missing child will show up
                        if verbose:
                            print(self.style.NOTICE('Waiting for missing children'))
                        files.append(file_name)
                        postponed[file_name] = files_imported
                    else:
                        # we're in a loop, nothing's being imported - some child is really missing
                        raise

    # Print results
    print()
    print("Results: %d files imported, %d skipped, %d total." % (
        files_imported, files_skipped, files_imported + files_skipped))
    print()