def generate_cache(self):
    """Build the Gutenberg cache unless one is already present.

    Progress, success and failure are reported through the info box. After
    a successful build the cache generation button is disabled and the
    query box is enabled.
    """
    # Guard clause: nothing to build when a cache is already there.
    if GutenbergCache.exists():
        self.infoBox.setText("The cache already exists.")
        return

    build_options = dict(
        refresh=True,
        download=True,
        unpack=True,
        parse=True,
        deleteTemp=True,
    )
    try:
        self.infoBox.setText(
            "The cache is being generated. This can take up to 10min.",
            "warning",
        )
        GutenbergCache.create(**build_options)
        self.infoBox.setText("Cache generated!")
        self.cacheGenerationButton.setDisabled(True)
        self.queryBox.setEnabled(True)
    except Exception as error:
        # Keep a trace of the cause on the console, then tell the user.
        print(error)
        self.infoBox.setText(
            "An error occurred while building the cache",
            "error",
        )
def check_cache(self):
    """Adapt the widget layout to whether the Gutenberg cache exists."""
    if GutenbergCache.exists():
        # The cache is already there: building it again is not offered.
        self.cacheGenerationButton.setDisabled(True)
        return

    # No cache yet: block the query box and tell the user what to do.
    self.queryBox.setDisabled(True)
    self.infoBox.setText(
        "Cache must be generated before first launch, it can take up to 10min",
        "warning",
    )
def search(self):
    """Look up titles matching ``self.titleQuery`` in the Gutenberg cache.

    The query is matched case-insensitively as a substring of the ``name``
    column of the ``titles`` table, limited to ``self.nbr_results`` rows.
    The rows are stored in ``self.searchResults`` and the second column of
    each row is appended to ``self.titleLabels``.

    When the query is empty, only a warning is shown in the info box.
    """
    query_string = self.titleQuery
    if not query_string:
        self.infoBox.setText("You didn't search anything", "warning")
        return

    # The statement is assembled as plain text, so a single quote typed by
    # the user (e.g. "Alice's Adventures") would end the SQL literal early.
    # Doubling it is the standard SQL escape.
    safe_query = query_string.replace("'", "''")

    # Parse query and look it up in the Gutenberg cache.
    cache = GutenbergCache.get_cache()
    query_results = cache.native_query(
        sql_query="select * from titles where upper(name) like upper('%{query}%') limit {limit}"
        .format(query=safe_query, limit=self.nbr_results))

    # Get the results.
    self.searchResults = list(query_results)

    # Display an info message; only exactly one result is singular.
    n_results = len(self.searchResults)
    plural = n_results != 1
    self.infoBox.setText("{n} result{s} {verb} been found".format(
        n=n_results,
        s="s" if plural else "",
        verb="have" if plural else "has",
    ))

    # Update the results list with the search results in order to
    # display them.
    for row in self.searchResults:
        self.titleLabels.append(str(row[1]))

    # Self-assignment kept from the original; presumably it makes the
    # widget refresh the displayed list -- TODO confirm.
    self.titleLabels = self.titleLabels

    self.clearButton.setDisabled(False)
    self.addButton.setDisabled(self.selectedTitles == list())
    self.controlArea.setDisabled(False)
def sendData(self):
    """Download the books in the basket and send them to the output.

    For every entry of ``self.myBasket`` the Gutenberg book id is looked up
    in the cache (entry index 2 is the ``books.id`` value), the text is
    fetched and stripped of its headers with gutenbergpy, and the result is
    wrapped in an ``Input``. A single input is sent as is; several inputs
    are merged with ``Segmenter.concatenate``. Each output segment gets a
    ``"title"`` annotation taken from index 1 of the matching basket entry.

    Nothing is sent when the basket is empty (warning in the info box) or
    when retrieval fails (error in the info box).
    """
    # Skip if the basket is empty.
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some books first", "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket),
    )

    text_content = list()
    annotations = list()

    try:
        # Fetching the cache sits inside the try block so that a failure
        # here also re-enables the control area below.
        cache = GutenbergCache.get_cache()

        for text in self.myBasket:
            # Get the Gutenberg id of the text.
            query_id = cache.native_query(
                sql_query="select gutenbergbookid from books where id == {selected_id}"
                .format(selected_id=text[2]))
            gutenberg_id = list(query_id)

            # Get the text with gutenbergpy. An empty lookup raises
            # IndexError here and is reported like any other failure.
            gutenberg_text = gutenbergpy.textget.strip_headers(
                gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))

            text_content.append(gutenberg_text)
            annotations.append(text[1])
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception as exc:
        # Keep a trace of the cause on the console.
        print(exc)
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from Gutenberg",
                             "error")
        # Release the progress bar as well, otherwise it stays active
        # after the failure.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store the downloaded texts in Input objects...
    for text in text_content:
        newInput = Input(text, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one book, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments with the book title...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update({"title": annotations[idx]})
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def generate_cache(self):
    """Create the Gutenberg cache with every creation step switched on."""
    creation_steps = {
        "refresh": True,
        "download": True,
        "unpack": True,
        "parse": True,
        "deleteTemp": True,
    }
    GutenbergCache.create(**creation_steps)
import gutenbergpy.textget
from gutenbergpy.gutenbergcache import GutenbergCache

# Create the cache from scratch.
GutenbergCache.create(
    refresh=True,
    download=True,
    unpack=True,
    parse=True,
    cache=True,
    deleteTemp=True,
)

# Get the default cache (SQLite).
cache = GutenbergCache.get_cache()

# For the query function you can use the following fields:
# languages authors types titles subjects publishers bookshelves
wanted_download_types = [
    'application/plain',
    'text/plain',
    'text/html; charset=utf-8',
]
matching_books = cache.query(downloadtype=wanted_download_types)
print(matching_books)

# Print the text of book 1000 with its headers stripped.
raw_text = gutenbergpy.textget.get_text_by_id(1000)
print(gutenbergpy.textget.strip_headers(raw_text))
def search(self):
    """Search the Gutenberg cache by title, author and language.

    Reads ``self.titleQuery``, ``self.authorQuery`` and ``self.langQuery``
    (a key of ``self.lang_dict``) and runs one SQL query joining titles,
    authors, books and languages, limited to ``self.nbr_results`` rows.
    Each row is cleaned up into ``[title, author, gutenberg id, language]``
    and stored in ``self.searchResults``; a "title — author — language"
    label per row is appended to ``self.titleLabels``.

    A warning is shown, and nothing is searched, when the language is
    'Any' and both text queries are empty. A query failure is printed and
    reported as an error in the info box.
    """
    query_string = self.titleQuery
    query_author = self.authorQuery
    language = self.lang_dict[self.langQuery]

    # Inform the user that no search criterion was given at all.
    if (self.langQuery == 'Any' and query_string == ''
            and self.authorQuery == ''):
        self.infoBox.setText(
            "You can't search only by language, if it's set to Any",
            "warning")
        return

    # Recode a two-word author "first_name name" to "name%, first_name",
    # the order used by the stored author names (see the reverse recoding
    # of the results below).
    author_parts = query_author.split()
    if len(author_parts) == 2 and "," not in query_author:
        query_author = "%, ".join(author_parts[::-1])

    def sql_quote(value):
        # The statement is assembled as plain text, so a quote typed by
        # the user must be doubled to stay inside the SQL literal.
        return str(value).replace("'", "''")

    # Parse query and look it up in the Gutenberg cache.
    cache = GutenbergCache.get_cache()

    # Search the database. The literals are single-quoted: double-quoted
    # string literals are only a legacy SQLite fallback.
    try:
        query_results = cache.native_query(sql_query="""
            /* Creates a new table with one author per book
            by selecting the greatest author id */
            WITH unique_book_author AS
            (SELECT * FROM book_authors
            WHERE authorid IN (SELECT MAX(authorid) FROM book_authors GROUP BY bookid))

            /* Selects title, author, gutenberg id and language */
            SELECT titles.name, authors.name, books.gutenbergbookid, languages.name
            FROM titles

            /* Merges every needed table into one on shared attributes */
            INNER JOIN books ON books.id = titles.bookid
            INNER JOIN unique_book_author ON books.id = unique_book_author.bookid
            INNER JOIN authors ON authors.id = unique_book_author.authorid
            INNER JOIN languages ON books.languageid = languages.id

            /* Matches users query using % wildcard for more permissive query */
            WHERE upper(titles.name) LIKE '%{title}%'
            AND upper(authors.name) LIKE '%{author}%'
            AND languages.name LIKE '%{lang}%'
            LIMIT {limit}
            """.format(title=sql_quote(query_string),
                       author=sql_quote(query_author),
                       lang=sql_quote(language),
                       limit=self.nbr_results))
    except Exception as exc:
        print(exc)
        self.infoBox.setText(
            "An error occurred while interrogating the cache.", "error")
        return

    # Get the results.
    raw_results = list(query_results)

    # Reverse lookup from language abbreviation to its display name, built
    # once instead of once per row. setdefault keeps the first key when two
    # names share one abbreviation.
    name_by_code = {}
    for name, code in self.lang_dict.items():
        name_by_code.setdefault(code, name)

    # Create better results.
    self.searchResults = list()
    for row in raw_results:
        row = list(row)
        # Replace every run of newline characters in the title.
        row[0] = re.sub(r'[\n\r]+', r', ', row[0])
        # Recode the author from "name, first_name" to "first_name name".
        row[1] = " ".join(row[1].split(", ")[::-1])
        # Show the language name; fall back to the raw abbreviation when
        # it is missing from lang_dict instead of raising.
        row[3] = name_by_code.get(row[3], row[3])
        self.searchResults.append(row)

    # Display an info message; only exactly one result is singular.
    n_results = len(self.searchResults)
    plural = n_results != 1
    self.infoBox.setText("{n} result{s} {verb} been found".format(
        n=n_results,
        s="s" if plural else "",
        verb="have" if plural else "has",
    ))

    self.clearResults()

    # Update the results list with the search results in order to
    # display them.
    for row in self.searchResults:
        result_string = "{title} — {author} — {lang}".format(
            title=row[0], author=row[1], lang=row[3])
        self.titleLabels.append(result_string)

    # Self-assignment kept from the original; presumably it makes the
    # widget refresh the displayed list -- TODO confirm.
    self.titleLabels = self.titleLabels

    self.clearButton.setDisabled(False)
    self.addButton.setDisabled(self.selectedTitles == list())
    self.controlArea.setDisabled(False)
from book_openers.gutenberg import main
from gutenbergpy.gutenbergcache import GutenbergCache
from sys import argv

# Command-line entry point:
#   (no argument)  -> main()
#   catalogue      -> create the Gutenberg cache
#   <integer>      -> main(<integer>)
if __name__ == "__main__":
    if len(argv) == 1:
        main()
    elif argv[1] == "catalogue":
        GutenbergCache.create()
    else:
        try:
            # Named book_id rather than id so the builtin is not shadowed.
            book_id = int(argv[1])
        except ValueError:
            # int() raises ValueError for a non-numeric string; a bare
            # except would also swallow KeyboardInterrupt and SystemExit.
            print("invalid options")
        else:
            main(book_id)
import statistics


def fill_book_with_content(book):
    """Fetch the full text of *book* and store it under ``book['content']``.

    The text is fetched by ``book['id']`` with gutenbergpy, stripped of its
    headers and decoded. The dict is modified in place; nothing is
    returned.
    """
    book_id = book['id']
    # NOTE(review): decoding as ASCII with errors='ignore' silently drops
    # every non-ASCII character (accents, typographic quotes). Kept as is
    # because it changes what gets indexed -- confirm whether UTF-8 was
    # intended.
    fulltext = gutenbergpy.textget.strip_headers(
        gutenbergpy.textget.get_text_by_id(book_id)).decode(
            'ASCII', errors='ignore')
    book['content'] = fulltext


NUMBER_OF_BOOKS_TO_INDEX = 10000

# ELASTIC_USER = '******'
# ELASTIC_PASSWORD = '******'
# ELASTIC_ADDRESS = 'localhost:9200'
# es = Elasticsearch([f'http://{ELASTIC_USER}:{ELASTIC_PASSWORD}@{ELASTIC_ADDRESS}/'])
es = Elasticsearch()

# Not used below; kept because get_cache() may have side effects the rest
# of the script relies on -- TODO confirm.
cache = GutenbergCache.get_cache()

# JSON is UTF-8 by specification, so do not rely on the platform default.
with open('parsed.json', encoding='utf-8') as books_json:
    books_list = json.load(books_json)

for book in books_list[:NUMBER_OF_BOOKS_TO_INDEX]:
    book_id = book['id']

    # Fetching and indexing are reported separately so that an
    # Elasticsearch failure is not mistaken for a download failure. Both
    # stay best-effort: a failing book is skipped, the loop goes on.
    try:
        fill_book_with_content(book)
    except Exception as exc:
        print(f'Cannot obtain fulltext for book with id {book_id}.')
        print(exc)
        continue

    try:
        res = es.index(index="books", id=book_id, body=book)
    except Exception as exc:
        print(f'Cannot index book with id {book_id}.')
        print(exc)
    else:
        print(res)