Example #1
 def test_load_etext(self):
     loaders = (lambda etextno: load_etext(etextno, refresh_cache=True),
                lambda etextno: load_etext(etextno, refresh_cache=False))
     testcases = (
         SampleMetaData.for_etextno(2701),  # newstyle identifier
         SampleMetaData.for_etextno(5),  # oldstyle identifier
         SampleMetaData.for_etextno(14287),  # unicode text
         SampleMetaData.for_etextno(23962)  # UTF-8 text
     )
     for testcase, loader in itertools.product(testcases, loaders):
         text = loader(testcase.etextno)
         self.assertTrue(isinstance(text, str))
Example #2
 def test_load_etext(self):
     loaders = (lambda etextno: load_etext(etextno, refresh_cache=True),
                lambda etextno: load_etext(etextno, refresh_cache=False))
     testcases = (
         SampleMetaData.for_etextno(2701),   # newstyle identifier
         SampleMetaData.for_etextno(5),      # oldstyle identifier
         SampleMetaData.for_etextno(14287),  # unicode text
         SampleMetaData.for_etextno(23962)   # UTF-8 text
     )
     for testcase, loader in itertools.product(testcases, loaders):
         text = loader(testcase.etextno)
         self.assertIsInstance(text, unicode)
Example #3
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        del data_dir
        del tmp_dir
        del dataset_split

        # pylint: disable=g-import-not-at-top
        from gutenberg import acquire
        from gutenberg import cleanup
        # pylint: enable=g-import-not-at-top

        books = [
            # bookid, skip N lines
            (19221, 223),
            (15553, 522),
        ]

        for (book_id, toskip) in books:
            text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
            lines = text.split("\n")[toskip:]
            prev_line = None
            ex_count = 0
            for line in lines:
                # Any line that is all upper case is a title or author name
                if not line or line.upper() == line:
                    prev_line = None
                    continue

                line = re.sub("[^a-z]+", " ", line.strip().lower())
                if prev_line and line:
                    yield {
                        "inputs": prev_line,
                        "targets": line,
                    }
                    ex_count += 1
                prev_line = line
Example #4
def generate_paragraph():
    '''
    Generates a random paragraph from the Gutenberg Project

    :return: Text from the Gutenberg Project with spaces and non-alphabetic characters removed\
    and all characters lower case
    :rtype: str
    '''
    # Get the text from the Gutenberg Project, in this case Moby Dick
    text = strip_headers(load_etext(2701)).strip()
    #text = "Jack and Jill ran up the hill to get a pail of water. " +
    #       "Jack fell down and broke his crown and Jill came tumbling after."
    sentences = []
    paragraph = ""

    for sentence in text.split("."):
        sentences.append(sentence)

    #Select 2 random sentences
    paragraph = random.choice(sentences) + random.choice(sentences)

    paragraph = re.sub(r'\s+', '', paragraph)
    regex = re.compile('[^a-zA-Z]')
    paragraph = regex.sub('', paragraph).lower()
    return paragraph
Example #5
def get_featurelists(book):

    # Preparation for topic features: get 100 most common uni-, bi- and trigrams of the given book
    common_ngrams = get_common_ngrams(book)

    # Extract the features of the given book (book_id, like the IDs list used
    # below, is assumed to be defined at module level)
    features_book = (book_id, extract_features(book, common_ngrams))

    # Create a new file and write the features of the given book to it
    path_feat_book = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_book.txt"
    with open(path_feat_book, 'w', encoding="utf-8") as output_book:
        output_book.write(str(features_book))

    # Create a new file to write the features of the dataset books to
    path_feat_books = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_dataset.txt"
    output_dataset = open(path_feat_books, 'w', encoding="utf-8")

    # Extract the features of the dataset books
    features_dataset = []
    for i in IDs:
        features_dataset.append((i,
                                 extract_features(
                                     strip_headers(load_etext(i)).strip(),
                                     common_ngrams)))

        # Write the features to the output file
        output_dataset.write("\n Book " + str(i) + ": ")
        output_dataset.write(str(features_dataset[len(features_dataset) - 1]))
    output_dataset.close()

    return features_book, features_dataset
Example #6
def get_gutenberg_text(id):
    try:
        text = strip_headers(load_etext(id)).strip()
        return text
    except Exception as ex:
        print(ex)
    return ''
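The bare except above hides the reason a download failed. A narrower variant is sketched below; the UnknownDownloadUriException import path is assumed from the gutenberg package internals, and the exception itself is the one caught in later examples:

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg._domain_model.exceptions import UnknownDownloadUriException

def get_gutenberg_text_strict(etext_id):
    # Return the cleaned text, or '' only when no plain-text download exists.
    try:
        return strip_headers(load_etext(etext_id)).strip()
    except UnknownDownloadUriException:
        return ''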
Example #7
    def load_gutenberg(self, language='en'):
        texts = get_etexts('author', self.author)
        texts = {
            t: list(get_metadata("title", t))[0]
            for t in texts if list(get_metadata("language", t))[0] == language
        }

        new_texts = dict()
        dupes = list()
        for k, d in texts.items():
            d = d.replace("\r\n", " ")
            if d not in dupes:
                dupes.append(d)
                new_texts[k] = d
                try:
                    self.books[d] = strip_headers(
                        load_etext(k)).strip().split("\r\n\r\n")
                except UnknownDownloadUriException:
                    print(
                        f'Book "{d}" does not have a text format and was not loaded.'
                    )
                    del new_texts[k]
                    dupes.remove(d)
                    continue
                self.tokens[d] = [
                    nltk.pos_tag(nltk.word_tokenize(self.books[d][b]))
                    for b in range(len(self.books[d]))
                ]
            else:
                pass

        texts = new_texts

        print(texts)
Example #8
def load_macbeth():
    """
    Sources Macbeth from Project Gutenberg, returns a cleaned dataframe
    of the play split by act, scene, speaker, and sentence.
    """
    raw_text = load_etext(1533)  # Collect the text
    raw_text = strip_headers(raw_text)  # Remove most metadata

    # Remove in-line stage directions
    raw_text = remove_in_line_stage_directions(raw_text)

    # Split the text into sentences
    sentences = separate_sentences(raw_text)

    # Remove introductory data, keeping only the text
    sentences = sentences[110:]

    # Create a dataframe from the sentences
    macbeth = create_play_data_frame(sentences)

    # Clean the dataframe
    macbeth = clean_macbeth(macbeth)

    # Add a token column
    macbeth["tokens"] = create_token_column(macbeth["sentence"])

    # Return the finished dataframe
    return macbeth
Example #9
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        del data_dir
        del tmp_dir
        del dataset_split

        books = [
            # bookid, skip N lines
            (19221, 223),
            (15553, 522),
        ]

        for (book_id, toskip) in books:
            text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
            lines = text.split("\n")[toskip:]
            for line in lines:
                # Any line that is all upper case is a title or author name
                if not line or line.upper() == line:
                    continue

                line = re.sub("[^a-z]+", " ", line.strip().lower())
                if line:
                    l = len(line)
                    if l > 100:
                        l = 100
                    yield {
                        "inputs": line,
                        "label": l,
                    }
Example #10
def main():
    """
    The main method.
    """

    parser = argparse.ArgumentParser(
        description='Word suggestion based on Project Gutenberg books.')
    parser.add_argument('--book-id',
                        dest='book_ids',
                        nargs='+',
                        type=int,
                        required=True,
                        help='the Project Gutenberg book id')
    parser.add_argument('--query',
                        nargs='+',
                        type=str,
                        required=True,
                        help='suggest next word for list of string',
                        action=required_length(1, 5))

    try:
        args = parser.parse_args()
        text_indexer = TextIndexer(len(args.query))

        for book_id in list(dict.fromkeys(args.book_ids)):
            text = strip_headers(load_etext(book_id)).strip()
            text_indexer.add_text(book_id, text)

        print(text_indexer.suggest(*args.query))
    except Exception as exc:  # pylint: disable=W0703
        print(exc)
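The snippet relies on a required_length action whose definition is not shown. One possible implementation of such a min/max-nargs validator (a sketch, not the original helper) is:

import argparse

def required_length(nmin, nmax):
    class RequiredLength(argparse.Action):
        def __call__(self, parser, namespace, values, option_string=None):
            # Reject the option unless it received between nmin and nmax values.
            if not nmin <= len(values) <= nmax:
                parser.error('argument "{}" requires between {} and {} values'
                             .format(self.dest, nmin, nmax))
            setattr(namespace, self.dest, values)
    return RequiredLength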
Example #11
def getSomeBooks(howManyBooks, startingAt=1):
    i = howManyBooks
    ii = startingAt
    nothing = 0
    valError = 0
    otherError = 0
    allTheBooks = []
    while i > len(allTheBooks):  # 54096 ceiling
        try:
            theText = strip_headers(
                load_etext(ii)).strip()  #load the full text into theText
            if len(theText) > 292:
                allTheBooks.append([ii, theText])
                print("one more book in the list, book number:", ii,
                      "book total is:", len(allTheBooks))
            else:
                nothing = nothing + 1
                print("nothing here at number:", ii)
        except ValueError:
            valError = valError + 1
            print("valueError at book number:", ii)
        except:
            otherError = otherError + 1
            print("otherError at book number:", ii)
        ii = ii + 1

    print('all done')
    print(len(allTheBooks))
    return allTheBooks
Example #12
def post_corpora(url, auth_token):
    corpora = acquire_corpora()
    text = strip_headers(load_etext(corpora[0])).strip()

    print(corpora, text[:100])

    authentication_token = {'authentication-token': auth_token}

    # data to post
    files = {'file': io.StringIO(text)}
    data = {
        'label': '{} {}'.format(corpora[1], corpora[3]),
        'source': corpora[2]
    }

    # post
    ru = requests.post(url,
                       headers=authentication_token,
                       files=files,
                       data=data)

    print(ru.url, ru.status_code)
    if ru.ok:
        print(ru.json())
    else:
        print(ru.status_code, ru.reason)
Example #13
def download(cfg):
    print('Downloading Gutenberg data to: ' + cfg.directory)
    # Load language data for all books.
    path = os.path.join('code', 'utils', 'metadata.txt')
    with open(path, encoding='utf-8') as f:
        counter = 0
        for line in f:
            [index, lang, r, author, title] = line.split('\t')

            r = int(r)
            i = int(index)
            if counter < cfg.max_books and r == 1 and lang in cfg.languages:
                # Get the book.
                try:
                    text = strip_headers(load_etext(i)).strip().encode('utf-8')
                except UnknownDownloadUriException:
                    print('Could not download book: ' + str(i))
                    continue

                # Save the file to the correct directory.
                path = os.path.join(cfg.directory, lang)
                if not os.path.exists(path):
                    os.mkdir(path)
                with open(os.path.join(path, str(i) + '.txt'), 'wb') as out:
                    out.write(text)

                counter += 1
                if not counter % 1000:
                    print('Downloaded ' + str(counter) + ' books')
Example #14
def init_books(author_file, json_file):
    """initialize book list with texts and save it to disk"""
    with open(author_file) as f:
        authors = list(f)

    authors = [i.strip() for i in authors]

    books = []
    for author in authors:
        s = get_etexts('author', author)
        for i in s:
            try:
                if list(get_metadata('language', i))[0] == 'en':
                    title, etext = list(get_metadata(
                        'title', i))[0], strip_headers(load_etext(i)).strip()
                    b = Book(i, title, etext)
                    books.append(b)
            except UnknownDownloadUriException:
                # this book does not have a load_etext corresponding to it.
                pass

    with open(json_file, 'wb') as f:
        pickle.dump(books, f)

    print(len(books))
Example #15
def poetry_cleaner(poetry_books=BOOKS):
    with open(INPUT_DATA_WRITE_PATH + OUT_PATH, 'w') as ofp:

        lineno = 0

        for (id_nr, toskip, title) in poetry_books:

            startline = lineno
            text = strip_headers(load_etext(id_nr)).strip()
            lines = text.split('\n')[toskip:]

            for line in lines:

                if (0 < len(line) < 50 and line.upper() != line
                        and not re.match('.*[0-9]+.*', line)):
                    cleaned = re.sub('[^a-z\'\-]+', ' ', line.strip().lower())
                    if lineno < 100:
                        ofp.write(cleaned)
                        ofp.write('\n')
                    lineno = lineno + 1

                else:
                    ofp.write('\n')

        print('Wrote lines {} to {} from {}'.format(startline, lineno, title))
Example #16
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    del data_dir
    del tmp_dir
    del dataset_split

    # pylint: disable=g-import-not-at-top
    from gutenberg import acquire
    from gutenberg import cleanup
    # pylint: enable=g-import-not-at-top

    books = [
        # bookid, skip N lines
        (19221, 223),
        (15553, 522),
    ]

    for (book_id, toskip) in books:
      text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
      lines = text.split("\n")[toskip:]
      prev_line = None
      ex_count = 0
      for line in lines:
        # Any line that is all upper case is a title or author name
        if not line or line.upper() == line:
          prev_line = None
          continue

        line = re.sub("[^a-z]+", " ", line.strip().lower())
        if prev_line and line:
          yield {
              "inputs": prev_line,
              "targets": line,
          }
          ex_count += 1
        prev_line = line
Example #17
def sample_paragraphs(book_id, n_parag, min_length):
    """Get book as text file and randomly sample a fixed number of paragraphs."""
    # Get the book as a string
    book = load_etext(book_id)
    # Remove metadata
    book = strip_headers(book).strip()
    # Remove the character we'll choose as separator
    book = book.replace("|", " ")
    # Split paragraphs
    parag = book.split("\n\n")
    # Remove single line breaks
    parag = [x.replace("\n", " ") for x in parag]
    # Remove paragraphs below a certain length
    parag = [p for p in parag if len(p) > min_length]
    # Exclude first/last 10 parag from sampling as they may contain remaining metadata
    parag = parag[10:-10]

    if n_parag is not None:
        if n_parag > len(parag):
            raise ValueError(
                "The number of paragraphs to sample is higher than the "
                "total number of paragraphs."
            )
        # Sample paragraph indices (seed/randint here are numpy.random functions)
        seed(42)
        sample_ind = randint(0, len(parag), n_parag)
        parag_sampled = [parag[i] for i in sample_ind]

    else:
        # If n_parag is None, all paragraphs are sampled
        parag_sampled = parag

    return parag_sampled
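A possible call, with illustrative values (2701 is Moby Dick):

paragraphs = sample_paragraphs(2701, n_parag=5, min_length=200)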
Example #18
def regular_view(request, book_num):
    name = Book.get_book_name(book_num)
    bookText = strip_headers(load_etext(book_num)).strip()
    filteredText = removeStopWords(bookText)

    args = {'content': [bookText], 'content2': [filteredText], 'name': name}

    return render(request, "pages/regularText.html", args)
Example #19
def get_raw_book():
    while True:
        try:
            text = load_etext(random.randrange(46000)) #46000 is approximately size of gutenberg catalogue
        except ValueError: #in case of no download method for that text id
            pass
        else:
            return strip_headers(text)
Example #20
def get_joyce_texts():
    joyce_keys = get_etexts('author', 'Joyce, James')
    joyce_titles = []
    joyce_texts = {}
    for key in joyce_keys:
        joyce_titles.append(get_metadata('title', key))
        joyce_texts[key] = strip_headers(load_etext(key)).strip()
    return (joyce_texts)
Example #21
def search_display_options(my_catalog):
    search_result_catalog = book_catalog()

    search_type = input(
        'Please select a search type: Author, Subject, Title [Aa/Ss/Tt]:  ')

    if search_type == 'A' or search_type == 'a':
        search_term = input('Please enter a search term for an Author: ')
    elif search_type == 'T' or search_type == 't':
        search_term = input('Please enter a search term for a Title: ')
    elif search_type == 'S' or search_type == 's':
        search_term = input('Please enter a search term for a Subject: ')
    else:
        print('Invalid search type...')
        return

    # set match flag to false
    match = False
    # fill up a set of all the titles that match the search
    for my_book in my_catalog.get_books():
        if (search_type == 'a' or search_type == 'A') and set(
                my_book.get_book_author().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

        if (search_type == 't' or search_type == 'T') and set(
                my_book.get_book_title().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

        if (search_type == 's' or search_type == 'S') and set(
                my_book.get_book_subject().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

    search_result_catalog.display_titles_by_author()

    if match:
        title_num = input('Please type a title number from the above list: ')

        try:
            my_book = search_result_catalog.get_book(title_num)
            print('Displaying Word Cloud in [Subject: ' +
                  my_book.get_book_subject() + '] for [Title: ' +
                  my_book.get_book_title() + '] by [Author: ' +
                  my_book.get_book_author() + ']')
            return (strip_headers(load_etext(int(title_num))).strip()
                    )  # call that gets book text from gutenberg
        except:
            print('Failed to find a textual download candidate for ' +
                  my_book.get_book_title())
            return (None)
    else:
        print('No matches found for [' + search_term + ']...')
        return (None)
Example #22
def text_from_pg(id_number):
    # https://github.com/c-w/Gutenberg
    from gutenberg.acquire import load_etext

    # from gutenberg.cleanup import strip_headers

    # text = strip_headers(load_etext(id_number)).strip()
    text = load_etext(id_number).strip()
    return text
Example #23
def tab():
    with open("BookRoulette.html", "w") as f:
        x = (random.randint(1, 60059))
        book = strip_headers(load_etext(x)).strip()
        f.write(book)
        filename = 'file:///'+os.getcwd()+'/' + 'BookRoulette.html'
        webbrowser.open_new_tab(filename)
        return render_template('BookRoulette.html', book=book)
Example #24
 def acquire_and_process(name: str, txt_num: int):
     """
     Convenience function that minhashes a Project Gutenberg
     text given the text id number (can be found on the gutenberg.org,
     for instance in the url).
     """
     txt = strip_headers(load_etext(txt_num))
     with open("texts/%s.txt" % name, "w") as f:
         f.write(txt)
     process_file("texts/%s.txt" % name)
Example #25
def download():
    with open("GutProject.doc", "w") as f:
        x = (random.randint(1, 60059))
        text = strip_headers(load_etext(x)).strip()
        f.write(text)
        f.close()
    return send_file('GutProject.doc',
                     mimetype='application/msword',
                     attachment_filename='GutProject.doc',
                     as_attachment=True)
Example #26
def downloadBook():
    """If posting, takes in a book number from getty.html, installs the book into
    the database. Otherwise displays getty.html"""
    if request.method == "POST":
        bookNum = int(request.form.get("bookNum"))
        words = strip_headers(load_etext(bookNum)).strip()
        installText(words)
        return render_template("homepage.html")
    else:
        return render_template("getty.html")
Example #27
 def __init__(self, book_number=2701, first_page=20, last_page=20):
     self.text = strip_headers(load_etext(book_number))
     # print(list_supported_metadatas())  # prints (u'author', u'formaturi', u'language', ...)
     # print(get_metadata('title', 2701))  # prints frozenset([u'Moby Dick; Or, The Whale'])
     # print(get_metadata('author', 2701))  # prints frozenset([u'Melville, Hermann'])
     # print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
     self.pages = []
     self.first_page = first_page
     self.last_page = last_page
     self.print_book()
Example #28
def gutToText(number,name):
    filename = name+"_raw.txt"
    if os.path.isfile(filename)==False:
        book = open(filename,"w")
        text = strip_headers(load_etext(number)).strip()
        words = text
        print "Loaded and writing %s" % (name)
        book.write(words.encode('utf-8'))
        print "Done writing %s" % (name)
        book.close()
Example #29
    def get_text(self, title, author):
        """
        This function will access the title and author of a book from the
        Gutenberg project and save the data as a csv file.
        PROBLEM HERE -- gutenberg goes down a lot, so getting a full text
        did not work. To bypass that, I downloaded some books of mixed languages.
        """
        guten_number = list(get_etexts('title', title))[0]
        text = strip_headers(load_etext(guten_number)).strip()
        return (text)
Example #30
def get_gutenberg_document(url) -> str:
    """Downloads a document (book, etc.) from Project Gutenberg and returns it as a string."""
    # Get Project Gutenberg document ID from url string
    validate_url(url, expected_netloc='gutenberg.org')
    match = re.search(r"(?:files|ebooks|epub)/(\d+)", urlsplit(url).path)
    if not match:
        raise Exception('Not a valid url')
    document_id = int(match.group(1))
    return super_cleaner(strip_headers(load_etext(document_id).strip()),
                         mark_deletions=False)
Example #31
 def acquire_and_process(name: str, txt_num: int):
     """
     Convenience function that minhashes a Project Gutenberg
     text given the text id number (can be found on the gutenberg.org,
     for instance in the url).
     """
     txt = strip_headers(load_etext(txt_num))
     with open("texts/%s.txt" % name, "w") as f:
         f.write(txt)
     process_file("texts/%s.txt" % name)
Example #32
def generateBooks(lastBookID):
    firstBookID = 1
    # look through and grab each book
    while firstBookID <= lastBookID:
        # load and grab the eBook
        try:
            text = strip_headers(load_etext(firstBookID)).strip()
            gatherMetaData(firstBookID, text)
            firstBookID = firstBookID + 1
        except:
            print("error with book", firstBookID)
            firstBookID = firstBookID + 1
Example #33
def trial():
    text = strip_headers(load_etext(2701)).strip()
    print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
    print(get_metadata(
        'title', 2701))  # prints frozenset([u'Moby Dick; Or, The Whale'])
    print(get_metadata('author',
                       2701))  # prints frozenset([u'Melville, Hermann'])

    print(get_etexts(
        'title', 'Moby Dick; Or, The Whale'))  # prints frozenset([2701, ...])
    print(get_etexts('author',
                     'Melville, Herman'))  # prints frozenset([2701, ...])
Example #34
def create_model():
    """Read in Project Gutenberg data, convert each into a markovify Text model object, then combine them into one model. Returns the model. 
	"""

    eap_1 = strip_headers(load_etext(2147)).strip()  #edgar allan poe vol 1
    eap_2 = strip_headers(load_etext(2148)).strip()  #edgar allan poe vol 2
    dickens = strip_headers(
        load_etext(807)).strip()  #charles dickens crime stories
    moonstone = strip_headers(load_etext(155)).strip()  #collins: the moonstone
    lerouge = strip_headers(
        load_etext(3802)).strip()  #gaboriau: the lerouge case
    orcival = strip_headers(
        load_etext(1651)).strip()  #gaboriau: the mystery of orcival
    calais = strip_headers(
        load_etext(16339)).strip()  #griffiths: the passenger from calais
    leavenworth = strip_headers(
        load_etext(4047)).strip()  #green: the leavenworth case
    agent = strip_headers(load_etext(974)).strip()  #conrad: the secret agent
    thirtynine = strip_headers(
        load_etext(558)).strip()  #buchan: the thirty-nine steps

    eap_1_model = markovify.Text(eap_1, state_size=3)
    eap_2_model = markovify.Text(eap_2, state_size=3)
    dickens_model = markovify.Text(dickens, state_size=3)
    moonstone_model = markovify.Text(moonstone, state_size=3)
    lerouge_model = markovify.Text(lerouge, state_size=3)
    orcival_model = markovify.Text(orcival, state_size=3)
    calais_model = markovify.Text(calais, state_size=3)
    leavenworth_model = markovify.Text(leavenworth, state_size=3)
    agent_model = markovify.Text(agent, state_size=3)
    thirtynine_model = markovify.Text(thirtynine, state_size=3)

    #NOTE: will need to play around with the weighting based on the text lengths so that I don't get all sentences from one book
    all_model = markovify.combine([
        eap_1_model, eap_2_model, dickens_model, moonstone_model,
        lerouge_model, orcival_model, calais_model, leavenworth_model,
        agent_model, thirtynine_model
    ])

    return all_model
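For the weighting mentioned in the NOTE, markovify.combine also accepts a parallel list of weights; a sketch with illustrative values (higher weight = more influence on the combined chain):

all_model = markovify.combine(
    [eap_1_model, eap_2_model, dickens_model, moonstone_model,
     lerouge_model, orcival_model, calais_model, leavenworth_model,
     agent_model, thirtynine_model],
    [1, 1, 0.5, 1, 1, 1, 1, 1, 1, 1])  # illustrative weights, not tuned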
Example #35
def main():
    eap_1 = strip_headers(load_etext(2147)).strip()  #edgar allan poe vol 1
    eap_2 = strip_headers(load_etext(2148)).strip()  #edgar allan poe vol 2
    dickens = strip_headers(
        load_etext(807)).strip()  #charles dickens crime stories
    moonstone = strip_headers(load_etext(155)).strip()  #collins: the moonstone
    lerouge = strip_headers(
        load_etext(3802)).strip()  #gaboriau: the lerouge case
    orcival = strip_headers(
        load_etext(1651)).strip()  #gaboriau: the mystery of orcival

    eap_1_model = markovify.Text(eap_1, state_size=3)
    eap_2_model = markovify.Text(eap_2, state_size=3)
    dickens_model = markovify.Text(dickens, state_size=3)
    moonstone_model = markovify.Text(moonstone, state_size=3)
    lerouge_model = markovify.Text(lerouge, state_size=3)
    orcival_model = markovify.Text(orcival, state_size=3)

    #NOTE: will need to play around with the weighting based on the text lengths so that I don't get all sentences from one book
    all_model = markovify.combine([
        eap_1_model, eap_2_model, dickens_model, moonstone_model,
        lerouge_model, orcival_model
    ])

    #to do: loop to create different chapters - probably make them short (~ten sentences?) at first to make sure that they work properly
    print "\n\n\n Creating Chapters"

    chapters = create_chapters(
    )  #this will be a list of all the chapters, they should be complete at this point (all replacement/etc done)
    """
Example #36
def main():
    """
    Main function of the test module
    """

    # setting up the API keys from local keys.py file
    goodreads_key = os.environ['GOODREADS_KEY']
    goodreads_secret = os.environ['GOODREADS_SECRET']

    # creating a client for book search and information retrieval
    gc = client.GoodreadsClient(goodreads_key, goodreads_secret)

    current_path = os.getcwd()

    file = open(os.path.join(current_path, "output", "log.json"), "w")

    gutenberg_titles = []

    # Getting the titles of the first few books on Project Gutenberg (EXTREMELY FAST)
    for i in range(1, 10):
        title = list(get_metadata('title', i))
        if title:
            # prepare the string for the file name
            filename = ''.join(
                e for e in title[0] if e.isalnum() or e == ' ') + ".txt"
            gutenberg_titles.append(filename[:-4])
            text = strip_headers(load_etext(i)).strip()
            with open(os.path.join(current_path, "output", filename),
                      "w") as output_file:
                output_file.write(text)

    titles = dict()
    # Searching for the books on Goodreads, reading their metadata
    for book_title in gutenberg_titles:
        try:
            lst = gc.search_books(book_title, search_field='title')

            if not lst:
                continue
            else:
                book = lst[0]

            titles[book.title] = (
                book_title + ".txt", str(book.popular_shelves),
                str(book.similar_books), str(book.authors),
                dict(dict(book.work)['original_publication_year'])['#text'])
        except (request.GoodreadsRequestException, KeyError, TypeError):
            continue

    json.dump(titles, file, indent=4)
    file.close()
Example #37
def get_book_text(csvfile):
    'gets text for book using project gutenberg catalog'
    book_list = open_csv(csvfile)

    for i, value in enumerate(book_list):
        #print i, value
        a = int(book_list[i][2])  # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
Example #38
def process_file(filename, outdir):
    outpath = outdir + '/%s.txt'
    with open(filename) as f:
        for line in f:
          spl = line.split('|')
          book = spl[0]
          uids = map(int, spl[3].strip(string.lowercase + '\n').split(','))
          try:
            with open(outpath % book, 'w') as out:
              for uid in uids:
                raw_text = load_etext(uid)
                try:
                  text = strip_headers(unidecode(raw_text.encode('latin-1').decode('utf-8')))
                except UnicodeDecodeError:
                  text = strip_headers(raw_text)
                out.write(text.encode('utf-8'))
          except ValueError as e:
            print '%s|%s' % (book, uid), e
            os.remove(outpath % book)
Example #39
 def the_text(self):
     try:
         self.novel = load_etext(self.novel_num)
     except:
         rejects.append(self.novel_num)
         return False

     if re.search('Character set encoding: ASCII', self.novel):
         self.novel = strip_headers(self.novel)
         self.novel = self.novel.replace('\n', ' ')
         self.novel = TextBlob(self.novel)
         self.novel_sentences = self.novel.sentences
         self.m = str(self.novel_num)
         with open('novel_' + self.m + 'list_1.csv', 'wb') as f:
             writer = csv.writer(f)
             for sentence in self.novel_sentences:
                 writer.writerow([sentence])
     else:
         rejects_2.append(self.novel_num)
         return False
Example #40
def check_text():
    with open('raw.json') as inputfile:
        data = json.load(inputfile)
    for record in tqdm(data):
        id = record['metadata']['id']
        title = clean(record['book']['title'])
        text = load_etext(id)
        if id in lookup_dates:
            release_date = lookup_dates[id]
        else:
            for line in text.split("\n"):
                if line.startswith('Release Date:'):
                    release_date = line.replace('Release Date:', '').split('[')[0]
                    break
            else:
                print id, title
        record['book']['author'] = record['author']
        author_name = record['book']['author']['name']
        vals.add(record['book']['author']['birth'])
        if record['book']['author']['birth'] == None:
            record['book']['author']['birth'] = 0
        if record['book']['author']['death'] == None:
            record['book']['author']['death'] = 0
        vals2.add(record['book']['author']['birth'])
        record['book']['author']['name'] = clean(author_name) if author_name else "Unknown"
        del record['author']
        month, day, year = extract_date(release_date)
        release_date = release_date.strip()
        record['book']['publication'] = {
            'full': release_date  if month != 'Jan' else release_date.replace('Jan', 'January'),
            'year': year,
            'month name': month if month != 'Jan' else 'January',
            'month': month_lookup[month],
            'day': day
        }
        record['bibliography'] = record['book']
        del record['book']
        record['metrics'] = record['statistics']
        del record['statistics']
    with open('classics-2.json', 'w') as output:
        json.dump(data, output, indent=2)
Example #41
def fetch_gutenberg(filename=None):
    from gutenberg.acquire import load_etext
    from gutenberg.cleanup import strip_headers
    from gutenbergsettings import popularTitles, saveInterval

    start    = time.time()
    lastsave = start

    with redirect(filename):
        try:
            for title in popularTitles:
                text = strip_headers(load_etext(title)).strip()
                serialize([(title, text)], '../serialized/guten%s' % title)
                sinceLast = time.time() - lastsave
                print('%s since last save' % sinceLast)
                if sinceLast > saveInterval:
                    concatenate('guten')
                    lastsave = time.time()
        except KeyboardInterrupt:
            concatenate('guten')
            sys.exit(0)
Example #42
def extract_subroutine(data, src_dir, century):
    session = model.get_session()
    backoff = 1

    counter = 0
    for metadata in data:
        contents = extract_book_contents(metadata)

        if contents is None:
            backoff *= 1.5
            continue

        title = metadata['title']
        author = metadata['author']
        e_id = metadata['id']

        if type(title) == list:
            title = dasherize(title)

        text_file_path = os.path.join(src_dir, dasherize(title.split(" ")))
        text = strip_headers(load_etext(e_id)).strip()
        f = open(text_file_path, "w")
        f.write(text)
        f.close()
        book = model.Book(
            title=title,
            author=author,
            e_id=e_id,
            century=century,
            text_file_path=text_file_path
        )
        session.add(book)
        session.commit()
        log.info("successfully added " + title)
        counter += 1
        time.sleep(backoff)

    log.info("---- finished run. added %d books ----" % counter)
Example #43
def main():
    filename = "gutenbergscifi.csv"
    json_filename = "gendered_words.json"
    
    if os.path.isfile(filename):
        print "file exists"
        
    else:
        write_csv = extract_text_urls(filename)
        print "file created"

    book_list = open_csv(filename)
    
    print book_list
    
    for i, value in enumerate(book_list):
        #print i, value
        
        a = int(book_list[i][2]) # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
        #print text  
        
        clean_text = remove_punc_html(text)
        ratio = gender_ratio(clean_text)
        
        print author, title, ratio
        uber_key = author
        sub_key = title
        sub_value = ratio
        uber_value = {author: {title:ratio}}

        json_source = read_write_json_object(json_filename="gendered_words.json",
                                             uber_key=uber_key,
                                             uber_value=uber_value,
                                             sub_key=sub_key,
                                             sub_value=sub_value,
                                             READ=False, WRITE=True)
Example #44
def download_book(title, gutenberg_id, data_path, sleep=0):
    print('downloading {:}'.format(title))

    full_text = strip_headers(load_etext(gutenberg_id)).strip()
    summary = downloadSummary(title)

    if full_text is None:
        print('Full text is None. Skipping {:}'.format(title))
        return
    if summary is None:
        print('Summary is None. Skipping {:}'.format(title))
        return

    output_data = {'title': title,
                   'full_text': full_text,
                   'summary': summary}
        
    output_file = os.path.join(data_path,
                               '{:}.json'.format(gutenberg_id))
    with open(output_file, 'w') as f:
        json.dump(output_data, f, ensure_ascii=False)

    time.sleep(sleep)
Example #45
def getBook(bookDetails):
    global timeAtLastFetch
    cachedFilename = "cache/" + fileNameForBook(bookDetails) + ".txt"
    if os.path.isfile(cachedFilename):
        with open(cachedFilename) as bookfile:
            text = bookfile.read()
            return TextBlob(text)

    nowMS = milliseconds()
    timeSinceLastFetch = nowMS - timeAtLastFetch
    if timeSinceLastFetch < gutenbergWaitTimeMS:
        waitTime = gutenbergWaitTimeMS - timeSinceLastFetch
        print "    waiting {}ms for Gutenberg...".format(waitTime)
        time.sleep(waitTime / 1000)

    bookId = bookDetails['id']
    print "Fetching from Gutenberg id {}".format(bookId)
    source = load_etext(bookId)
    print "    cleaning...."
    source = removeUnicodeWords.sub("", source)
    source = removeUnicodeCharacters.sub("", source)
    source = removePossessive.sub("", source)
    source = removeWordsWithApostrophe.sub("", source)
    source = removeHyphens.sub(" ", source)
    source = removeChapterHeaders.sub("", source)
    source = removeRomanNumerals.sub("", source)
    source = removeEllipsis.sub("", source)
    text = strip_headers(source).strip()
    timeAtLastFetch = milliseconds()
    if not os.path.isdir("cache"):
        os.mkdir("cache")
    bookfile = open(cachedFilename, 'w')
    bookfile.write(text)
    bookfile.close()
    print "    fetched and cached " + bookDetails['title']
    return TextBlob(text)
Example #46
def generate_tweets(gutenberg_id, total=24):
    document = []
    text = strip_headers(load_etext(gutenberg_id)).strip()
    lines = text.split('\n')    
    print get_metadata('title', gutenberg_id)
    for line in lines:
        words = re.findall(regex, line)
        document.extend(words)

    trigrams = zip(document, document[1:], document[2:])
    trigram_transitions = defaultdict(list)
    starts = []

    for prev, current, next in trigrams:
        if prev == ".":
            starts.append(current)
        trigram_transitions[(prev, current)].append(next)

    def generate_using_trigrams():
        current = random.choice(starts)
        prev = "."
        result = [current]
        while True:
            next_word_candidates = trigram_transitions[(prev, current)]
            next_word = random.choice(next_word_candidates)
            prev, current = current, next_word
            if current != ".":
                result.append(current)
            else:
                return " ".join(result) + current

    tweets = []
    while len(tweets) < total:
        tweet = generate_using_trigrams()
        if len(tweet) <= 140:
            tweets.append(tweet)
    return tweets
Example #47
# -*- coding:utf-8 -*-

# Libraries
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

texto=load_etext(2000)
texto=strip_headers(texto)


# Replace the fragment "qu" with "k" so that "(q)ue" or "(q)ui" is not counted as a diphthong
# Append "-" to diphthongs ending in "y" so that words where the "y" is a consonant rather than a vowel are not matched
texto = texto.replace("que", "ke")
texto = texto.replace("qui", "ki")
texto = texto.replace("gue", "ke")
texto = texto.replace ("gui", "ki")
texto = texto.replace ("ay", "ay-")
texto = texto.replace (u"áy", u"áy-")
texto = texto.replace ("ey", "ey-")
texto = texto.replace (u"éy", u"éy-")
texto = texto.replace ("oy", "oy-")
texto = texto.replace ("uy", "uy-")

texto = texto.lower()


# Split the text into words
# Diphthong: a combination of an open vowel (/a e o/) with a closed one (/i u/), or vice versa; the closed vowel must not be stressed.
# A space is used to indicate that the "y" must come at the end of the word
palabras=texto.split()
dic_diptongos={ 
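The snippet breaks off at the dictionary literal above. A hypothetical continuation (not the original code) that counts open/closed vowel pairs according to the rule described in the comments might look like this:

abiertas = u'aeoáéó'   # open vowels
cerradas = u'iuü'      # closed (weak) vowels; stressed í/ú are deliberately excluded
dic_diptongos = {}
for palabra in palabras:
    for v1, v2 in zip(palabra, palabra[1:]):
        # Count an open+closed or closed+open vowel pair as a diphthong
        if (v1 in abiertas and v2 in cerradas) or (v1 in cerradas and v2 in abiertas):
            par = v1 + v2
            dic_diptongos[par] = dic_diptongos.get(par, 0) + 1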
Example #48
def split_sentences(text):
    for sentence_separator in [u'. ', u'.\n', u'? ', u'! ', u'?\n', u'!\n', u'; ', u';\n', u'- ', u'--', u'...', u'\n', u'\n\n', u'\n\n\n']:
        text = text.replace(sentence_separator, u'|||')
    return text.split(u'|||')


# Report how many books the corpus contains.
print u'Total de libros en español:', len(codigos_libros.es)


# Now load the books and strip their headers.
dic_oraciones_es = {}
total_palabras_es = 0
for codigo_libro_es in codigos_libros.es:
    texto = load_etext(codigo_libro_es)
    texto = strip_headers(texto)

    # In each book, the sentences are separated and delimited by the ||| marker.
    oraciones_libro = split_sentences(texto)
    for oracion_libro in oraciones_libro:
        palabras = rufino.split_words(oracion_libro)
        numero_palabras_oracion = len(palabras)
        total_palabras_es += numero_palabras_oracion
        if numero_palabras_oracion not in dic_oraciones_es:
            dic_oraciones_es[numero_palabras_oracion] = 1
        else:
            dic_oraciones_es[numero_palabras_oracion] = dic_oraciones_es[numero_palabras_oracion] + 1


print u'Total de oraciones en español:', len(dic_oraciones_es)
Example #49
import nltk
from nltk.text import Text
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

id = input("Input gutenberg id to load: ")
text=strip_headers(load_etext(id)).strip()
raw_input("Enter to print text preview...")
print(text[:1000])

text = text.split()
text=Text(text)
def ask():
	test = raw_input("Which analysis to perform ('list' to see list): ")
	if(test == "list"):
		print("concordance, dispersionplot, wordcount, lexicaldiversity, frequency, collocations")
		ask()
	if(test == "concordance"):
		conc = raw_input("word: ")
		text.concordance(conc)
		ask()
	if(test == "dispersionplot"):
		disp = []
		keepasking = True;
		i=0;
		while(keepasking):
			input = raw_input("word " + str(i) + " (blank to stop): ")
			if(len(input) > 0):
				disp.append(input)
			else:
				keepasking = False;
Example #50
17406, 17430, 17491, 20401, 21651, 23206, 23236, 24536, 24601, 24925, 25317, 25640, 25687, 
25777, 25988, 26284, 26655, 27736, 29497, 29506, 29663, 29799, 29831, 30053, 30122, 30425, 
30535, 30903, 30986, 31013, 31464, 31541, 31613, 31637, 31707, 32235, 32315, 32364, 33690, 
35882, 36253, 36453, 36573, 36940, 37067, 37095, 37139, 37637, 38814, 39444, 39613, 39990, 
41746, 42727,)



guardar_cadena=0
cadena=u''
interjecciones={}
contador = 0


for texto in textos:  # Repeat the loop for each book
    texto = load_etext(texto)  # Load the text
    texto = strip_headers(texto).lower()  # Strip the headers
    texto = unicode(texto)
    for caracter in texto:  # Walk through the text character by character
        if caracter == u'¡':  # If an opening exclamation mark is found,
            guardar_cadena = 1  # set a flag to start saving the string
            cadena = cadena + unicode(caracter)
        if caracter == u'!':  # If a closing exclamation mark is found,
            cadena = cadena + unicode(caracter)  # 1. save that last character (optional)
            if cadena in interjecciones.keys():  # 2. check whether the string is already in the dictionary
                interjecciones[cadena] += 1  # 3. if it is, add one to its counter
            else:  # 4. if it is not,
                interjecciones[cadena] = 1  # add it and start counting from 1
            guardar_cadena = 0  # 5. reset the flag so nothing more is saved
            cadena = ''
        if guardar_cadena == 1:  # 6. check whether the save-string flag is set
"""
Created on Wed Aug 12 18:06:45 2015

@author: Tony
Description: Pull etext numbers from Project Gutenberg for an author

1) First pip install gutenberg 0.4.0 library for Python from the command line

"""
 
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


# get the catalogue numbers of all the texts
# by Wilhelm Grimm in Project Gutenberg
bookList=get_etexts('author', 'Grimm, Wilhelm Carl')
# gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591]

#Once We can associate a number with a title we can pull the text
for number in bookList:
    print(number,get_metadata('title',number))
 
print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n')
# Once we have the text number we can print the text
# example 11027 is the number for Grimm's Fairy Stories 
# can be temperamental, truncating text at the top (console limit?); may need to work around it
etext = strip_headers(load_etext(11027)).strip()
print(etext)
Example #52
    def open_file(self, file_id):
        """
            Opens a file from project gutenberg 
        """

        return load_etext(file_id)
Example #53
print total
i = 0
for (dirpath, dirnames, filenames) in walk(gutenberg_path):
    for filename in filenames:
        f =  "/".join([dirpath, filename])
        if(f.endswith(".rdf")):
            #print f
            i+=1
            bf = BeautifulSoup(open(f))
            subjects =  bf.find_all("dcterms:subject")
            if (subjects is not None and len(subjects) > 0):
                for subject in subjects:
                    val =  subject.find_all("rdf:value")[0].contents[0]
                    for i_subject in i_subjects:
                        if(i_subject in val.lower()):
                            #print f, val

                            id =  int(basename(f)[2:-4])
                            fn = str(id).zfill(10) + "_" +  i_subject + ".txt"
                            print fn
                            try:
                                text = strip_headers(load_etext(id)).strip().encode("utf-8")
                                wf = "./texts/" + fn
                                with open(wf, "w") as text_file:
                                    text_file.write(text)
                                print i, total, float(i)/total
                            except:
                                print "broken", id
            # for network in tree.findtext('dcterms subject'):
            #     print network
Example #54
# Written in Python 3.5
from gutenberg.acquire import load_etext 
from gutenberg.cleanup import strip_headers

librosCodigo = {"Francés":[13735,13808],"Español":[24925,15027],"Portugés":[14904,16384],"Inglés":[10422,1013]}
dic_idiomas={}

for idioma in librosCodigo.keys():
    diccionario_largo_palabras={}

    for indeCo in librosCodigo[idioma]:
        texto= strip_headers(load_etext(indeCo))
        dic_idiomas[idioma]= diccionario_largo_palabras

        for caracter_especial in ['"',"...","¿","?","=","_","[","]","(",")",",",".",":",";","!","¡","«","»","*","~","' "," '","- "," -","--"]:
            texto=texto.replace(caracter_especial," ")
            palabras=texto.split()

        for palabra in palabras:
            largo_palabra = len(palabra)
            if largo_palabra in diccionario_largo_palabras:
                diccionario_largo_palabras[largo_palabra] = diccionario_largo_palabras[largo_palabra]+1
            else:
                diccionario_largo_palabras[largo_palabra]= 1
print (dic_idiomas)
Example #55
"""
Created on Sun Sep 20 19:49:20 2015

@author: weizhi
"""

import nltk
from nltk.corpus import gutenberg
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


from gutenberg.acquire import metadata
text = load_etext(201)
print text[:100]
#assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')

import rdflib
g = rdflib.Graph()



from gutenberg.acquire import metadata


output = metadata._create_metadata_graph(store='Sleepycat')
#downLoad = metadata._download_metadata_archive()

from gutenberg.query.api import get_metadata  # noqa
def downloadText(textID):
    print "Downloading", textID
    text = strip_headers(load_etext(textID)).strip()
    return text
    for item in ners:
        # Loop over the Stanford NER (per/person) results,
        # and apply probablepeople, which raises when it fails (hence the try).
        if "per" in item["tag"].lower():
            result = None
            try:
                result = parse(item.get('string'))
            except:
                log.error("Could not run probablepeople")

            if result:
                result = parse(item["string"])
                pp.append(result)
    ner["pp"] = pp
    return ner


if __name__ == '__main__':
    if len(sys.argv) >= 2 and 'test' in " ".join(sys.argv):
        import doctest
        doctest.testmod(verbose=True)

    if len(sys.argv) >= 2 and 'profile' in " ".join(sys.argv):
        from gutenberg.acquire import load_etext
        from gutenberg.cleanup import strip_headers
        from pycallgraph import PyCallGraph
        from pycallgraph.output import GraphvizOutput

        text = smart_text(strip_headers(load_etext(54807)).strip())
        with PyCallGraph(output=GraphvizOutput()):
            stanford_ner_wrapper(text, 9992, True)
Example #58
            except Exception, e:
                logging.error("%s: %s" % (path, e))
                # raise e

    @classmethod
    def text_from_zip(cls, path, rdf_catalog_path=None):
        """Return a ProjectGutenbergText object from a zip file."""
        archive = zipfile.ZipFile(path)
        inside = archive.filelist
        filenames = [x.filename for x in inside]
        if len(inside) != 1:
            logging.warn("Supposedly plain-text %s has %d files in zip: %s" % (
                    path, len(inside), ", ".join(filenames)))
        possibilities = [x for x in filenames if x.lower().endswith(".txt")]
        data = archive.read(possibilities[0])
        return ProjectGutenbergText(data, path, rdf_catalog_path)

    @property
    def paragraphs(self):
        return self.text.split("\r\n\r\n")
Obj = ProjectGutenbergText(text, name=None, rdf_catalog_path=raw_data)
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

text = strip_headers(load_etext(2701)).strip()
assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')




# -*- coding:utf-8 -*-

from gutenberg.acquire import load_etext 
from gutenberg.cleanup import strip_headers

dic_cont_interjecciones={}

textos=load_etext(1619)
texto=strip_headers(textos).lower()
guardar_cadena=0
cadena=''
interjecciones={}

# There is only one book here, so walk through the cleaned text directly,
# character by character.
for caracter in texto:
    if caracter == u'¡':
        guardar_cadena = 1
    if caracter == u'!':
        cadena = cadena + caracter
        if cadena in interjecciones.keys():
            interjecciones[cadena] += 1
        else:
            interjecciones[cadena] = 1
        guardar_cadena = 0
        cadena = ''
    if guardar_cadena == 1:
        cadena = cadena + caracter

for interjeccion in sorted(interjecciones.keys()):
    print interjeccion, interjecciones[interjeccion]

from numpy import random
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


bookNumber = set(random.randint(10,50024,size=2000))
#f.write(foo.encode('utf8'))

metaInfo = []

for item in bookNumber:
   # print item
    try: 
       # print item
        # loading the raw txt 
        data = load_etext(item).split("\n")
        
        # save the txt data path
        filePath = rdfPath + '/' + str(item) + '/' + str(item) + '.txt'
        f = open(filePath, 'w')
        # data is a list of lines, so join it back before encoding and writing
        f.write(u"\n".join(data).encode('utf8'))
        f.close()
        # get the meta data 
        Dict = obj.metaData(data)
        metaInfo.append((Dict,filePath))
        print len(metaInfo)
    except:
        continue
#%% TODO: data mining on these texts (author, title, release time, etc.); this part still needs work