Example #1
0
 def generate_cache(self):
     """Build the Gutenberg cache unless it is already present.

     Progress and outcome are reported through ``self.infoBox``. On
     success the cache generation button is disabled and the query box
     is enabled; on failure the exception is printed and an error
     message is shown instead.
     """
     if GutenbergCache.exists():
         self.infoBox.setText("The cache already exists.")
         return

     # Every build step is switched on.
     build_options = {
         "refresh": True,
         "download": True,
         "unpack": True,
         "parse": True,
         "deleteTemp": True,
     }
     try:
         self.infoBox.setText(
             "The cache is being generated. This can take up to 10min.",
             "warning")
         GutenbergCache.create(**build_options)
         self.infoBox.setText("Cache generated!")
         self.cacheGenerationButton.setDisabled(True)
         self.queryBox.setEnabled(True)
     except Exception as error:
         print(error)
         self.infoBox.setText(
             "An error occurred while building the cache", "error")
Example #2
0
 def check_cache(self):
     """Adapt the widget layout to whether the Gutenberg cache exists.

     Without a cache, the query box is disabled and a warning asks the
     user to generate it. With a cache, the cache generation button is
     disabled.
     """
     if GutenbergCache.exists():
         # Nothing left to generate, so the generation button is disabled.
         self.cacheGenerationButton.setDisabled(True)
         return

     # No cache yet: block querying and tell the user what to do.
     self.queryBox.setDisabled(True)
     self.infoBox.setText(
         "Cache must be generated before first launch, it can take up to 10min",
         "warning")
Example #3
0
    def search(self):
        """Look up the title query in the Gutenberg cache and list the hits.

        Reads ``self.titleQuery`` and ``self.nbr_results``, stores the
        matching rows in ``self.searchResults``, appends the second column
        of every row to ``self.titleLabels`` and reports the number of hits
        through ``self.infoBox``. An empty query only produces a warning.
        """
        query_string = self.titleQuery

        if not query_string:
            self.infoBox.setText("You didn't search anything", "warning")
            return

        # The query is spliced into the SQL text, so single quotes are
        # doubled to keep the value inside its string literal (a title such
        # as "Alice's ..." would otherwise break the statement). The limit
        # is forced to an integer for the same reason.
        escaped_query = str(query_string).replace("'", "''")
        cache = GutenbergCache.get_cache()
        query_results = cache.native_query(
            sql_query=
            "select * from titles where upper(name) like upper('%{query}%') limit {limit}"
            .format(query=escaped_query, limit=int(self.nbr_results)))
        self.searchResults = list(query_results)

        # Report the number of hits, with singular/plural agreement.
        n_results = len(self.searchResults)
        is_single = n_results == 1
        self.infoBox.setText("{n} result{s} {verb} been found".format(
            n=n_results,
            s="" if is_single else "s",
            verb="has" if is_single else "have"))

        # Add one label per hit; the label is the second column of the row.
        for row in self.searchResults:
            self.titleLabels.append(str(row[1]))

        # The widget state is only touched when there is at least one hit,
        # and once instead of once per hit.
        if self.searchResults:
            # NOTE(review): the self-assignment presumably triggers the
            # widget's attribute-change hook so the list view refreshes;
            # confirm against the widget base class.
            self.titleLabels = self.titleLabels
            self.clearButton.setDisabled(False)
            self.addButton.setDisabled(self.selectedTitles == list())
            self.controlArea.setDisabled(False)
Example #4
0
    def sendData(self):
        """Download the books in the basket and send them as a segmentation.

        For every entry of ``self.myBasket`` the Gutenberg book id is looked
        up in the cache, the text is downloaded and stripped of its headers,
        and one ``Input`` is created per text. The output is that single
        input, or the concatenation of all inputs when there are several.
        Each segment is annotated with a ``title`` key and the result is
        sent on the "Gutenberg importation" channel. Progress and outcome
        are reported through ``self.infoBox``.

        Returns early, without sending anything, when the basket is empty
        or when looking up or downloading a text fails.
        """
        # Skip if the basket is empty.
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some books first", "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar: one step per book in the basket.
        progressBar = ProgressBar(
            self,
            iterations=len(self.myBasket),
        )

        text_content = list()
        annotations = list()
        # Get the Gutenberg cache.
        cache = GutenbergCache.get_cache()
        try:
            for text in self.myBasket:

                # Translate the cache's internal book id (third field of the
                # basket entry) into the Gutenberg book id.
                query_id = cache.native_query(
                    sql_query=
                    "select gutenbergbookid from books where id == {selected_id}"
                    .format(selected_id=text[2]))
                gutenberg_id = list(query_id)

                # Download the text and strip the Gutenberg headers.
                gutenberg_text = gutenbergpy.textget.strip_headers(
                    gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))
                text_content.append(gutenberg_text)

                # The second field of the basket entry is used as the title.
                annotations.append(text[1])
                progressBar.advance()

        # If an error occurs (e.g. http error, memory error, unknown id)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Gutenberg",
                                 "error")
            # Close the progress bar here too, as the success path does,
            # so that it is not left running after a failure.
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store the downloaded texts in input objects...
        for text in text_content:
            newInput = Input(text, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there is only one text, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments with the title collected for the same position.
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update({"title": annotations[idx]})
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Gutenberg importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Example #5
0
 def generate_cache(self):
     """Create the Gutenberg cache with every build step switched on."""
     build_steps = ("refresh", "download", "unpack", "parse", "deleteTemp")
     GutenbergCache.create(**{step: True for step in build_steps})
Example #6
0
import gutenbergpy.textget
from gutenbergpy.gutenbergcache import GutenbergCache

# Build the cache from scratch, with every step switched on.
cache_options = {
    'refresh': True,
    'download': True,
    'unpack': True,
    'parse': True,
    'cache': True,
    'deleteTemp': True,
}
GutenbergCache.create(**cache_options)
# Fetch the default cache (SQLite).
cache = GutenbergCache.get_cache()
# The query function accepts the following fields: languages authors types
# titles subjects publishers bookshelves
download_types = [
    'application/plain', 'text/plain', 'text/html; charset=utf-8'
]
print(cache.query(downloadtype=download_types))
# Print the text of book 1000 with its headers stripped.
raw_text = gutenbergpy.textget.get_text_by_id(1000)
print(gutenbergpy.textget.strip_headers(raw_text))
Example #7
0
    def search(self):
        """Search the Gutenberg cache by title, author and language.

        Reads ``self.titleQuery``, ``self.authorQuery``, ``self.langQuery``
        and ``self.nbr_results``. Fills ``self.searchResults`` with
        ``[title, author, gutenberg id, language]`` lists and
        ``self.titleLabels`` with one display string per result. The
        outcome, or an error while querying the cache, is reported through
        ``self.infoBox``.
        """
        query_string = self.titleQuery
        query_author = self.authorQuery
        language = self.lang_dict[self.langQuery]

        # Refuse a search in which no criterion at all was set.
        if self.langQuery == 'Any' and query_string == '' and query_author == '':
            self.infoBox.setText(
                "You can't search only by language, if it's set to Any",
                "warning")
            return

        # Recode a two-word author "first_name name" to "name%, first_name",
        # unless the user already typed a comma.
        author_parts = query_author.split()
        if len(author_parts) == 2 and "," not in query_author:
            query_author = "%, ".join(author_parts[::-1])

        def sql_text(value):
            # The values are spliced into the SQL text, so single quotes are
            # doubled to keep each value inside its string literal (e.g. an
            # author such as O'Brien).
            return str(value).replace("'", "''")

        cache = GutenbergCache.get_cache()

        # Query the database; fetching the rows is part of the guarded
        # section so that a failure there is reported the same way.
        try:
            query_results = cache.native_query(sql_query="""
                    /* Creates a new table with one author per book
                    by selecting the greatest author id */

                    WITH unique_book_author AS
                    (SELECT * FROM book_authors
                    WHERE authorid IN (SELECT MAX(authorid) FROM book_authors GROUP BY bookid))

                    /* Selects title, author, gutenberg id and language */

                    SELECT titles.name, authors.name, books.gutenbergbookid, languages.name
                    FROM titles

                    /* Merges every needed table into one on shared attributes */

                    INNER JOIN books ON books.id = titles.bookid
                    INNER JOIN unique_book_author ON  books.id = unique_book_author.bookid
                    INNER JOIN authors ON authors.id = unique_book_author.authorid
                    INNER JOIN languages ON books.languageid = languages.id

                    /* Matches users query using % wildcard for more permissive query */

                    WHERE upper(titles.name) LIKE '%{title}%'
                    AND upper(authors.name) LIKE '%{author}%'
                    AND languages.name LIKE '%{lang}%'
                    LIMIT {limit}
                    """.format(title=sql_text(query_string),
                               author=sql_text(query_author),
                               lang=sql_text(language),
                               limit=int(self.nbr_results)))
            rows = list(query_results)
        except Exception as exc:
            print(exc)
            self.infoBox.setText(
                "An error occurred while interrogating the cache.",
                "error")
            return

        # Reverse view of lang_dict (abbreviation -> display name), built
        # once. setdefault keeps the first name when several names share an
        # abbreviation.
        language_names = {}
        for name, abbreviation in self.lang_dict.items():
            language_names.setdefault(abbreviation, name)

        self.searchResults = list()

        # Clean up every row for display.
        for row in rows:
            result = list(row)
            # Collapse any run of newlines in the title into ", ".
            result[0] = re.sub(r'[\n\r]+', r', ', result[0])
            # Recode the author from "name, first_name" to "first_name name".
            result[1] = " ".join(result[1].split(", ")[::-1])
            # Show the language name; an abbreviation missing from
            # lang_dict is kept as is instead of raising.
            result[3] = language_names.get(result[3], result[3])

            self.searchResults.append(result)

        # Report the number of hits, with singular/plural agreement.
        n_results = len(self.searchResults)
        is_single = n_results == 1
        self.infoBox.setText("{n} result{s} {verb} been found".format(
            n=n_results,
            s="" if is_single else "s",
            verb="has" if is_single else "have"))

        self.clearResults()
        # Add one display label per hit.
        for result in self.searchResults:
            self.titleLabels.append("{title} — {author} — {lang}".format(
                title=result[0], author=result[1], lang=result[3]))

        # The widget state is only touched when there is at least one hit,
        # and once instead of once per hit.
        if self.searchResults:
            # NOTE(review): the self-assignment presumably triggers the
            # widget's attribute-change hook so the list view refreshes;
            # confirm against the widget base class.
            self.titleLabels = self.titleLabels
            self.clearButton.setDisabled(False)
            self.addButton.setDisabled(self.selectedTitles == list())
            self.controlArea.setDisabled(False)
Example #8
0
from book_openers.gutenberg import main
from gutenbergpy.gutenbergcache import GutenbergCache
from sys import argv

if __name__ == "__main__":
    # Command line handling:
    #   no argument       -> main() is called without arguments
    #   "catalogue"       -> the Gutenberg cache is created
    #   anything else     -> must be an integer, which is passed to main()
    if len(argv) == 1:
        main()
    elif argv[1] == "catalogue":
        GutenbergCache.create()
    else:
        try:
            book_id = int(argv[1])
        except ValueError:
            # Only a failed integer conversion means the option is invalid;
            # other exceptions (e.g. KeyboardInterrupt) are not swallowed.
            print("invalid options")
        else:
            main(book_id)
Example #9
0
import statistics


def fill_book_with_content(book):
    """Store the header-stripped full text of *book* under ``book['content']``.

    The text for ``book['id']`` is fetched through ``gutenbergpy.textget``,
    stripped of its headers and decoded as ASCII; bytes that cannot be
    decoded are dropped. The dict is modified in place and nothing is
    returned.
    """
    raw_text = gutenbergpy.textget.get_text_by_id(book['id'])
    stripped_text = gutenbergpy.textget.strip_headers(raw_text)
    book['content'] = stripped_text.decode('ASCII', errors='ignore')


# Upper bound on how many entries of parsed.json are indexed.
NUMBER_OF_BOOKS_TO_INDEX = 10000

# ELASTIC_USER = '******'
# ELASTIC_PASSWORD = '******'
# ELASTIC_ADDRESS = 'localhost:9200'

# es = Elasticsearch([f'http://{ELASTIC_USER}:{ELASTIC_PASSWORD}@{ELASTIC_ADDRESS}/'])
es = Elasticsearch()
cache = GutenbergCache.get_cache()

# Load the book list first so the file is closed before the long-running
# download-and-index loop starts.
with open('parsed.json', encoding='utf-8') as books_json:
    books_list = json.load(books_json)

for book in books_list[:NUMBER_OF_BOOKS_TO_INDEX]:
    book_id = book['id']
    try:
        fill_book_with_content(book)
        res = es.index(index="books", id=book_id, body=book)
        print(res)
    except Exception:
        # Best effort: a book that fails is reported and skipped.
        # `Exception` (rather than a bare except) keeps Ctrl-C working.
        print(f'Cannot obtain fulltext for book with id {book_id}.')