Code example #1
File: test_acquire.py  Project: rlugojr/Gutenberg-2
 def test_load_etext(self):
     loaders = (lambda etextno: load_etext(etextno, refresh_cache=True),
                lambda etextno: load_etext(etextno, refresh_cache=False))
     testcases = (
         SampleMetaData.for_etextno(2701),  # newstyle identifier
         SampleMetaData.for_etextno(5),  # oldstyle identifier
         SampleMetaData.for_etextno(14287),  # unicode text
         SampleMetaData.for_etextno(23962)  # UTF-8 text
     )
     for testcase, loader in itertools.product(testcases, loaders):
         text = loader(testcase.etextno)
         self.assertTrue(isinstance(text, str))
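For reference, a minimal standalone sketch of the loader these tests exercise (2701 is Moby Dick; the refresh_cache flag mirrors the parameter used above):

from gutenberg.acquire import load_etext

# refresh_cache=True bypasses the local cache and re-downloads the text
text = load_etext(2701, refresh_cache=False)
print(text[:80])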
Code example #2
 def test_load_etext(self):
     loaders = (lambda etextno: load_etext(etextno, refresh_cache=True),
                lambda etextno: load_etext(etextno, refresh_cache=False))
     testcases = (
         SampleMetaData.for_etextno(2701),   # newstyle identifier
         SampleMetaData.for_etextno(5),      # oldstyle identifier
         SampleMetaData.for_etextno(14287),  # unicode text
         SampleMetaData.for_etextno(23962)   # UTF-8 text
     )
     for testcase, loader in itertools.product(testcases, loaders):
         text = loader(testcase.etextno)
         self.assertIsInstance(text, unicode)
Code example #3
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        del data_dir
        del tmp_dir
        del dataset_split

        # pylint: disable=g-import-not-at-top
        from gutenberg import acquire
        from gutenberg import cleanup
        # pylint: enable=g-import-not-at-top

        books = [
            # bookid, skip N lines
            (19221, 223),
            (15553, 522),
        ]

        for (book_id, toskip) in books:
            text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
            lines = text.split("\n")[toskip:]
            prev_line = None
            ex_count = 0
            for line in lines:
                # Any line that is all upper case is a title or author name
                if not line or line.upper() == line:
                    prev_line = None
                    continue

                line = re.sub("[^a-z]+", " ", line.strip().lower())
                if prev_line and line:
                    yield {
                        "inputs": prev_line,
                        "targets": line,
                    }
                    ex_count += 1
                prev_line = line
Code example #4
File: general.py  Project: lrascius/CodeCipherCenter
def generate_paragraph():
    '''
    Generates a random paragraph from the Gutenberg Project

    :return: Text from the Gutenberg Project with spaces and non-alphabetic characters removed\
    and all characters lower case
    :rtype: str
    '''
    #Get the text from the Gutenberg Project, in this case it's Moby Dick
    text = strip_headers(load_etext(2701)).strip()
    #text = "Jack and Jill ran up the hill to get a pail of water. " +
    #       "Jack fell down and broke his crown and Jill came tumbling after."
    sentences = []
    paragraph = ""

    for sentence in text.split("."):
        sentences.append(sentence)

    #Select 2 random sentences
    paragraph = random.choice(sentences) + random.choice(sentences)

    paragraph = re.sub(r'\s+', '', paragraph)
    regex = re.compile('[^a-zA-Z]')
    paragraph = regex.sub('', paragraph).lower()
    return paragraph
Code example #5
def get_featurelists(book):

    # Preparation for topic features: get 100 most common uni-, bi- and trigrams of the given book
    common_ngrams = get_common_ngrams(book)

    # Extract the features of the given book
    # (book_id and IDs are assumed to be defined at module level)
    features_book = (book_id, extract_features(book, common_ngrams))

    # Create a new file and write the features of the given book to it
    path_feat_book = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_book.txt"
    with open(path_feat_book, 'w', encoding="utf-8") as output_book:
        output_book.write(str(features_book))

    # Create a new file to write the features of the dataset books to
    path_feat_books = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_dataset.txt"
    output_dataset = open(path_feat_books, 'w', encoding="utf-8")

    # Extract the features of the dataset books
    features_dataset = []
    for i in IDs:
        features_dataset.append((i,
                                 extract_features(
                                     strip_headers(load_etext(i)).strip(),
                                     common_ngrams)))

        # Write the features to the output file
        output_dataset.write("\n Book " + str(i) + ": ")
        output_dataset.write(str(features_dataset[len(features_dataset) - 1]))
    output_dataset.close()

    return features_book, features_dataset
Code example #6
def get_gutenberg_text(id):
    try:
        text = strip_headers(load_etext(id)).strip()
        return text
    except Exception as ex:
        print(ex)
    return ''
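Several of the examples below catch the library's UnknownDownloadUriException rather than a blanket Exception; a narrower variant of the function above might look like this (the import path is the one commonly used with the c-w/Gutenberg package and is an assumption here):

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg._domain_model.exceptions import UnknownDownloadUriException

def get_gutenberg_text_strict(id):
    # Return '' only when no download URI is known for this etext id
    try:
        return strip_headers(load_etext(id)).strip()
    except UnknownDownloadUriException:
        return ''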
Code example #7
    def load_gutenberg(self, language='en'):
        texts = get_etexts('author', self.author)
        texts = {
            t: list(get_metadata("title", t))[0]
            for t in texts if list(get_metadata("language", t))[0] == language
        }

        new_texts = dict()
        dupes = list()
        for k, d in texts.items():
            d = d.replace("\r\n", " ")
            if d not in dupes:
                dupes.append(d)
                new_texts[k] = d
                try:
                    self.books[d] = strip_headers(
                        load_etext(k)).strip().split("\r\n\r\n")
                except UnknownDownloadUriException:
                    print(
                        f'Book "{d}" does not have a text format and was not loaded.'
                    )
                    del new_texts[k]
                    dupes.remove(d)
                    continue
                self.tokens[d] = [
                    nltk.pos_tag(nltk.word_tokenize(self.books[d][b]))
                    for b in range(len(self.books[d]))
                ]
            else:
                pass

        texts = new_texts

        print(texts)
Code example #8
def load_macbeth():
    """
    Sources Macbeth from Project Gutenberg, returns a cleaned dataframe
    of the play split by act, scene, speaker, and sentence.
    """
    raw_text = load_etext(1533)  # Collect the text
    raw_text = strip_headers(raw_text)  # Remove most metadata

    # Remove in-line stage directions

    raw_text = remove_in_line_stage_directions(raw_text)

    # Split the text into sentences

    sentences = separate_sentences(raw_text)

    # Remove introductory data, keeping only the text

    sentences = sentences[110:]

    # Create a dataframe from the sentences

    macbeth = create_play_data_frame(sentences)

    # Clean the dataframe

    macbeth = clean_macbeth(macbeth)

    # Add a token column

    macbeth["tokens"] = create_token_column(macbeth["sentence"])

    # Return the finished dataframe

    return macbeth
Code example #9
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        del data_dir
        del tmp_dir
        del dataset_split

        books = [
            # bookid, skip N lines
            (19221, 223),
            (15553, 522),
        ]

        for (book_id, toskip) in books:
            text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
            lines = text.split("\n")[toskip:]
            for line in lines:
                # Any line that is all upper case is a title or author name
                if not line or line.upper() == line:
                    continue

                line = re.sub("[^a-z]+", " ", line.strip().lower())
                if line:
                    l = len(line)
                    if l > 100:
                        l = 100
                    yield {
                        "inputs": line,
                        "label": l,
                    }
Code example #10
def main():
    """
    The main method.
    """

    parser = argparse.ArgumentParser(
        description='Word suggestion based on Project Gutenberg books.')
    parser.add_argument('--book-id',
                        dest='book_ids',
                        nargs='+',
                        type=int,
                        required=True,
                        help='the Project Gutenberg book id')
    parser.add_argument('--query',
                        nargs='+',
                        type=str,
                        required=True,
                        help='suggest the next word for a list of strings',
                        action=required_length(1, 5))

    try:
        args = parser.parse_args()
        text_indexer = TextIndexer(len(args.query))

        for book_id in list(dict.fromkeys(args.book_ids)):
            text = strip_headers(load_etext(book_id)).strip()
            text_indexer.add_text(book_id, text)

        print(text_indexer.suggest(*args.query))
    except Exception as exc:  # pylint: disable=W0703
        print(exc)
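The required_length helper used for --query is not shown in the snippet; a common way to implement such an argparse action is sketched below (a hypothetical stand-in, not necessarily the project's version):

import argparse

def required_length(nmin, nmax):
    # Build an argparse action that accepts between nmin and nmax values
    class RequiredLength(argparse.Action):
        def __call__(self, parser, namespace, values, option_string=None):
            if not nmin <= len(values) <= nmax:
                parser.error('argument {} requires between {} and {} values'
                             .format(option_string, nmin, nmax))
            setattr(namespace, self.dest, values)
    return RequiredLength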
Code example #11
def getSomeBooks(howManyBooks, startingAt=1):
    i = howManyBooks
    ii = startingAt
    nothing = 0
    valError = 0
    otherError = 0
    allTheBooks = []
    while i > len(allTheBooks):  # 54096 ceiling
        try:
            theText = strip_headers(
                load_etext(ii)).strip()  #load the full text into theText
            if len(theText) > 292:
                allTheBooks.append([ii, theText])
                print("one more book in the list, book number:", ii,
                      "book total is:", len(allTheBooks))
            else:
                nothing = nothing + 1
                print("nothing here at number:", ii)
        except ValueError:
            valError = valError + 1
            print("valueError at book number:", ii)
        except Exception:
            otherError = otherError + 1
            print("otherError at book number:", ii)
        ii = ii + 1

    print('all done')
    print(len(allTheBooks))
    return allTheBooks
Code example #12
File: gutenbot.py  Project: slippers/gutenbot
def post_corpora(url, auth_token):
    corpora = acquire_corpora()
    text = strip_headers(load_etext(corpora[0])).strip()

    print(corpora, text[:100])

    authentication_token = {'authentication-token': auth_token}

    # data to post
    files = {'file': io.StringIO(text)}
    data = {
        'label': '{} {}'.format(corpora[1], corpora[3]),
        'source': corpora[2]
    }

    # post
    ru = requests.post(url,
                       headers=authentication_token,
                       files=files,
                       data=data)

    print(ru.url, ru.status_code)
    if ru.ok:
        print(ru.json())
    else:
        print(ru.status_code, ru.reason)
Code example #13
def download(cfg):
    print('Downloading Gutenberg data to: ' + cfg.directory)
    # Load language data for all books.
    path = os.path.join('code', 'utils', 'metadata.txt')
    with open(path, encoding='utf-8') as f:
        counter = 0
        for line in f:
            [index, lang, r, author, title] = line.split('\t')

            r = int(r)
            i = int(index)
            if counter < cfg.max_books and r == 1 and lang in cfg.languages:
                # Get the book.
                try:
                    text = strip_headers(load_etext(i)).strip().encode('utf-8')
                except UnknownDownloadUriException:
                    print('Could not download book: ' + str(i))
                    continue

                # Save the file to the correct directory.
                path = os.path.join(cfg.directory, lang)
                if not os.path.exists(path):
                    os.mkdir(path)
                with open(os.path.join(path, str(i) + '.txt'), 'wb') as out:
                    out.write(text)

                counter += 1
                if not counter % 1000:
                    print('Downloaded ' + str(counter) + ' books')
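download() only reads cfg.directory, cfg.max_books, and cfg.languages, so any object with those attributes works; a minimal sketch with illustrative values:

import argparse

cfg = argparse.Namespace(directory='data',
                         max_books=1000,
                         languages={'en', 'de'})
download(cfg)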
Code example #14
def init_books(author_file, json_file):
    """initialize book list with texts and save it to disk"""
    with open(author_file) as f:
        authors = list(f)

    authors = [i.strip() for i in authors]

    books = []
    for author in authors:
        s = get_etexts('author', author)
        for i in s:
            try:
                if list(get_metadata('language', i))[0] == 'en':
                    title, etext = list(get_metadata(
                        'title', i))[0], strip_headers(load_etext(i)).strip()
                    b = Book(i, title, etext)
                    books.append(b)
            except UnknownDownloadUriException:
                # this book does not have a load_etext corresponding to it.
                pass

    # despite the parameter name, the book list is pickled, not JSON-encoded
    with open(json_file, 'wb') as f:
        pickle.dump(books, f)

    print(len(books))
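Since the list is pickled, it can be read back with pickle.load; a short sketch (the path is whatever was passed as json_file):

import pickle

with open('books.pkl', 'rb') as f:
    books = pickle.load(f)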
Code example #15
def poetry_cleaner(poetry_books=BOOKS):
    with open(INPUT_DATA_WRITE_PATH + OUT_PATH, 'w') as ofp:

        lineno = 0

        for (id_nr, toskip, title) in poetry_books:

            startline = lineno
            text = strip_headers(load_etext(id_nr)).strip()
            lines = text.split('\n')[toskip:]

            for line in lines:

                if 0 < len(line) < 50 and line.upper(
                ) != line and not re.match('.*[0-9]+.*', line):
                    cleaned = re.sub('[^a-z\'\-]+', ' ', line.strip().lower())
                    if lineno < 100:
                        ofp.write(cleaned)
                        ofp.write('\n')
                    lineno = lineno + 1

                else:
                    ofp.write('\n')

            print('Wrote lines {} to {} from {}'.format(startline, lineno, title))
Code example #17
def sample_paragraphs(book_id, n_parag, min_length):
    """Get book as text file and randomly sample a fixed number of paragraphs."""
    # Get the book as a string
    book = load_etext(book_id)
    # Remove metadata
    book = strip_headers(book).strip()
    # Remove the character we'll choose as separator
    book = book.replace("|", " ")
    # Split paragraphs
    parag = book.split("\n\n")
    # Remove single line breaks
    parag = [x.replace("\n", " ") for x in parag]
    # Remove paragraphs below a certain length
    parag = [p for p in parag if len(p) > min_length]
    # Exclude first/last 10 parag from sampling as they may contain remaining metadata
    parag = parag[10:-10]

    # Sample paragraphs (seed/randint from numpy.random are assumed imported)
    if n_parag is not None:
        if n_parag > len(parag):
            raise ValueError(
                "The number of paragraphs to sample is higher than the "
                "total number of paragraphs."
            )
        seed(42)
        sample_ind = randint(0, len(parag), n_parag)
        parag_sampled = [parag[i] for i in sample_ind]

    else:
        # If n_parag is None, all paragraphs are sampled
        parag_sampled = parag

    return parag_sampled
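A minimal call of the function above, assuming the numpy names the snippet relies on are imported (the book id and thresholds are illustrative):

from numpy.random import seed, randint

paragraphs = sample_paragraphs(2701, n_parag=50, min_length=200)
print(len(paragraphs))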
Code example #18
def regular_view(request, book_num):
    name = Book.get_book_name(book_num)
    bookText = strip_headers(load_etext(book_num)).strip()
    filteredText = removeStopWords(bookText)

    args = {'content': [bookText], 'content2': [filteredText], 'name': name}

    return render(request, "pages/regularText.html", args)
Code example #19
File: prose.py  Project: oscarbyrne/oulipo
def get_raw_book():
    while True:
        try:
            text = load_etext(random.randrange(46000)) #46000 is approximately the size of the Gutenberg catalogue
        except ValueError: #in case of no download method for that text id
            pass
        else:
            return strip_headers(text)
Code example #20
def get_joyce_texts():
    joyce_keys = get_etexts('author', 'Joyce, James')
    joyce_titles = []
    joyce_texts = {}
    for key in joyce_keys:
        joyce_titles.append(get_metadata('title', key))
        joyce_texts[key] = strip_headers(load_etext(key)).strip()
    return (joyce_texts)
Code example #21
def search_display_options(my_catalog):
    search_result_catalog = book_catalog()

    search_type = input(
        'Please select a search type: Author, Subject, Title [Aa/Ss/Tt]:  ')

    if search_type == 'A' or search_type == 'a':
        search_term = input('Please enter a search term for an Author: ')
    elif search_type == 'T' or search_type == 't':
        search_term = input('Please enter a search term for a Title: ')
    elif search_type == 'S' or search_type == 's':
        search_term = input('Please enter a search term for a Subject: ')
    else:
        print('Invalid search type...')
        return

    # set match flag to false
    match = False
    # fill up a set of all the titles that match the search
    for my_book in my_catalog.get_books():
        if (search_type == 'a' or search_type == 'A') and set(
                my_book.get_book_author().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

        if (search_type == 't' or search_type == 'T') and set(
                my_book.get_book_title().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

        if (search_type == 's' or search_type == 'S') and set(
                my_book.get_book_subject().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

    search_result_catalog.display_titles_by_author()

    if match:
        title_num = input('Please type a title number from the above list: ')

        try:
            my_book = search_result_catalog.get_book(title_num)
            print('Displaying Word Cloud in [Subject: ' +
                  my_book.get_book_subject() + '] for [Title: ' +
                  my_book.get_book_title() + '] by [Author: ' +
                  my_book.get_book_author() + ']')
            return (strip_headers(load_etext(int(title_num))).strip()
                    )  # call that gets book text from gutenberg
        except Exception:
            print('Failed to find a textual download candidate for ' +
                  my_book.get_book_title())
            return (None)
    else:
        print('No matches found for [' + search_term + ']...')
        return (None)
Code example #22
File: cutthroat-finder.py  Project: hugovk/cutthroats
def text_from_pg(id_number):
    # https://github.com/c-w/Gutenberg
    from gutenberg.acquire import load_etext

    # from gutenberg.cleanup import strip_headers

    # text = strip_headers(load_etext(id_number)).strip()
    text = load_etext(id_number).strip()
    return text
Code example #23
File: app.py  Project: jbrandes/BookRoulette
def tab():
    with open("BookRoulette.html", "w") as f:
        x = (random.randint(1, 60059))
        book = strip_headers(load_etext(x)).strip()
        f.write(book)
        filename = 'file:///'+os.getcwd()+'/' + 'BookRoulette.html'
        webbrowser.open_new_tab(filename)
        return render_template('BookRoulette.html', book=book)
Code example #24
File: main.py  Project: Ryan-M3/minhasher
 def acquire_and_process(name: str, txt_num: int):
     """
     Convenience function that minhashes a Project Gutenberg
     text given the text id number (can be found on the gutenberg.org,
     for instance in the url).
     """
     txt = strip_headers(load_etext(txt_num))
     with open("texts/%s.txt" % name, "w") as f:
         f.write(txt)
     process_file("texts/%s.txt" % name)
Code example #25
File: app.py  Project: jbrandes/BookRoulette
def download():
    with open("GutProject.doc", "w") as f:
        x = (random.randint(1, 60059))
        text = strip_headers(load_etext(x)).strip()
        f.write(text)
    return send_file('GutProject.doc',
                     mimetype='application/msword',
                     attachment_filename='GutProject.doc',
                     as_attachment=True)
Code example #26
def downloadBook():
    """If posting, takes in a book number from getty.html, installs the book into
    the database. Otherwise displays getty.html"""
    if request.method == "POST":
        bookNum = int(request.form.get("bookNum"))
        words = strip_headers(load_etext(bookNum)).strip()
        installText(words)
        return render_template("homepage.html")
    else:
        return render_template("getty.html")
Code example #27
 def __init__(self, book_number=2701, first_page=20, last_page=20):
     self.text = strip_headers(load_etext(book_number))
     # print(list_supported_metadatas())  # prints (u'author', u'formaturi', u'language', ...)
     # print(get_metadata('title', 2701))  # prints frozenset([u'Moby Dick; Or, The Whale'])
     # print(get_metadata('author', 2701))  # prints frozenset([u'Melville, Herman'])
     # print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
     self.pages = []
     self.first_page = first_page
     self.last_page = last_page
     self.print_book()
Code example #28
def gutToText(number,name):
    filename = name+"_raw.txt"
    if not os.path.isfile(filename):
        book = open(filename,"w")
        text = strip_headers(load_etext(number)).strip()
        words = text
        print "Loaded and writing %s" % (name)
        book.write(words.encode('utf-8'))
        print "Done writing %s" % (name)
        book.close()
Code example #29
File: yewno.py  Project: skyballin/yewno
    def get_text(self, title, author):
        """
		This function will access the title and author of a book from the
		Gutenberg project and save the data as a csv file
		PROBLEM HERE -- gutenberg goes down a lot, so getting a full text 
		did not work. To bypass that, I downloaded some books of mixed languages.
		"""
        guten_number = list(get_etexts('title', title))[0]  # get_etexts returns a frozenset
        text = strip_headers(load_etext(guten_number)).strip()
        return (text)
Code example #30
def get_gutenberg_document(url) -> str:
    """Downloads a document (book, etc.) from Project Gutenberg and returns it as a string."""
    # Get Project Gutenberg document ID from url string
    validate_url(url, expected_netloc='gutenberg.org')
    match = re.search(r"(?:files|ebooks|epub)/(\d+)", urlsplit(url).path)
    if not match:
        raise Exception('Not a valid url')
    document_id = int(match.group(1))
    return super_cleaner(strip_headers(load_etext(document_id).strip()),
                         mark_deletions=False)
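validate_url is not shown above; a minimal sketch consistent with how it is called (hypothetical, built on urllib.parse):

from urllib.parse import urlsplit

def validate_url(url, expected_netloc=''):
    # Reject non-http(s) URLs and URLs on an unexpected host
    parts = urlsplit(url)
    if parts.scheme not in ('http', 'https'):
        raise Exception('Not a valid url')
    if expected_netloc and expected_netloc not in parts.netloc:
        raise Exception('Not a valid url')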
Code example #32
def generateBooks(lastBookID):
    firstBookID = 1
    # look through and grab each book
    while firstBookID <= lastBookID:
        # load and grab the eBook
        try:
            text = strip_headers(load_etext(firstBookID)).strip()
            gatherMetaData(firstBookID, text)
        except Exception:
            print("error with book", firstBookID)
        firstBookID = firstBookID + 1
Code example #33
def trial():
    text = strip_headers(load_etext(2701)).strip()
    print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
    print(get_metadata(
        'title', 2701))  # prints frozenset([u'Moby Dick; Or, The Whale'])
    print(get_metadata('author',
                       2701))  # prints frozenset([u'Melville, Herman'])

    print(get_etexts(
        'title', 'Moby Dick; Or, The Whale'))  # prints frozenset([2701, ...])
    print(get_etexts('author',
                     'Melville, Herman'))  # prints frozenset([2701, ...])
Code example #34
def create_model():
    """Read in Project Gutenberg data, convert each into a markovify Text model object, then combine them into one model. Returns the model. 
	"""

    eap_1 = strip_headers(load_etext(2147)).strip()  #edgar allan poe vol 1
    eap_2 = strip_headers(load_etext(2148)).strip()  #edgar allan poe vol 2
    dickens = strip_headers(
        load_etext(807)).strip()  #charles dickens crime stories
    moonstone = strip_headers(load_etext(155)).strip()  #collins: the moonstone
    lerouge = strip_headers(
        load_etext(3802)).strip()  #gaboriau: the lerouge case
    orcival = strip_headers(
        load_etext(1651)).strip()  #gaboriau: the mystery of orcival
    calais = strip_headers(
        load_etext(16339)).strip()  #griffiths: the passenger from calais
    leavenworth = strip_headers(
        load_etext(4047)).strip()  #green: the leavenworth case
    agent = strip_headers(load_etext(974)).strip()  #conrad: the secret agent
    thirtynine = strip_headers(
        load_etext(558)).strip()  #buchan: the thirty-nine steps

    eap_1_model = markovify.Text(eap_1, state_size=3)
    eap_2_model = markovify.Text(eap_2, state_size=3)
    dickens_model = markovify.Text(dickens, state_size=3)
    moonstone_model = markovify.Text(moonstone, state_size=3)
    lerouge_model = markovify.Text(lerouge, state_size=3)
    orcival_model = markovify.Text(orcival, state_size=3)
    calais_model = markovify.Text(calais, state_size=3)
    leavenworth_model = markovify.Text(leavenworth, state_size=3)
    agent_model = markovify.Text(agent, state_size=3)
    thirtynine_model = markovify.Text(thirtynine, state_size=3)

    #NOTE: will need to play around with the weighting based on the text lengths so that I don't get all sentences from one book
    all_model = markovify.combine([
        eap_1_model, eap_2_model, dickens_model, moonstone_model,
        lerouge_model, orcival_model, calais_model, leavenworth_model,
        agent_model, thirtynine_model
    ])

    return all_model
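A short usage sketch for the combined model; make_sentence is part of the markovify API and returns None when it cannot build a sentence under its overlap constraints:

model = create_model()
for _ in range(5):
    sentence = model.make_sentence()
    if sentence:
        print(sentence)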
Code example #35
def main():
    eap_1 = strip_headers(load_etext(2147)).strip()  #edgar allan poe vol 1
    eap_2 = strip_headers(load_etext(2148)).strip()  #edgar allan poe vol 2
    dickens = strip_headers(
        load_etext(807)).strip()  #charles dickens crime stories
    moonstone = strip_headers(load_etext(155)).strip()  #collins: the moonstone
    lerouge = strip_headers(
        load_etext(3802)).strip()  #gaboriau: the lerouge case
    orcival = strip_headers(
        load_etext(1651)).strip()  #gaboriau: the mystery of orcival

    eap_1_model = markovify.Text(eap_1, state_size=3)
    eap_2_model = markovify.Text(eap_2, state_size=3)  # was missing; used in combine() below
    dickens_model = markovify.Text(dickens, state_size=3)
    moonstone_model = markovify.Text(moonstone, state_size=3)
    lerouge_model = markovify.Text(lerouge, state_size=3)
    orcival_model = markovify.Text(orcival, state_size=3)

    #NOTE: will need to play around with the weighting based on the text lengths so that I don't get all sentences from one book
    all_model = markovify.combine([
        eap_1_model, eap_2_model, dickens_model, moonstone_model,
        lerouge_model, orcival_model
    ])

    #to do: loop to create different chapters - probably make them short (~ten sentences?) at first to make sure that they work properly
    print "\n\n\n Creating Chapters"

    chapters = create_chapters(
    )  #this will be a list of all the chapters, they should be complete at this point (all replacement/etc done)
    """
Code example #36
def main():
    """
    Main function of the test module
    """

    # setting up the API keys from local keys.py file
    goodreads_key = os.environ['GOODREADS_KEY']
    goodreads_secret = os.environ['GOODREADS_SECRET']

    # creating a client for book search and information retrieval
    gc = client.GoodreadsClient(goodreads_key, goodreads_secret)

    current_path = os.getcwd()

    file = open(os.path.join(current_path, "output", "log.json"), "w")

    gutenberg_titles = []

    # Get the titles of the first few books on Project Gutenberg (EXTREMELY FAST)
    for i in range(1, 10):
        title = list(get_metadata('title', i))
        if title:
            # prepare the string for the file name
            filename = ''.join(
                e for e in title[0] if e.isalnum() or e == ' ') + ".txt"
            gutenberg_titles.append(filename[:-4])
            text = strip_headers(load_etext(i)).strip()
            with open(os.path.join(current_path, "output", filename),
                      "w") as output_file:
                output_file.write(text)

    titles = dict()
    # Searching for the books on Goodreads, reading their metadata
    for book_title in gutenberg_titles:
        try:
            lst = gc.search_books(book_title, search_field='title')

            if not lst:
                continue
            else:
                book = lst[0]

            titles[book.title] = (
                book_title + ".txt", str(book.popular_shelves),
                str(book.similar_books), str(book.authors),
                dict(dict(book.work)['original_publication_year'])['#text'])
        except (request.GoodreadsRequestException, KeyError, TypeError):
            continue

    json.dump(titles, file, indent=4)
    file.close()
Code example #37
def get_book_text(csvfile):
    'gets text for book using project gutenberg catalog'
    book_list = open_csv(csvfile)

    for i, value in enumerate(book_list):
        #print i, value
        a = int(book_list[i][2])  # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
Code example #38
def process_file(filename, outdir):
    outpath = outdir + '/%s.txt'
    with open(filename) as f:
        for line in f:
            spl = line.split('|')
            book = spl[0]
            uids = map(int, spl[3].strip(string.lowercase + '\n').split(','))
            try:
                with open(outpath % book, 'w') as out:
                    for uid in uids:
                        raw_text = load_etext(uid)
                        try:
                            text = strip_headers(unidecode(raw_text.encode('latin-1').decode('utf-8')))
                        except UnicodeDecodeError:
                            text = strip_headers(raw_text)
                        out.write(text.encode('utf-8'))
            except ValueError as e:
                print '%s|%s' % (book, uid), e
                os.remove(outpath % book)
Code example #39
 def the_text(self):
     try:
         self.novel = load_etext(self.novel_num)
     except Exception:
         rejects.append(self.novel_num)
         return False

     if re.search('Character set encoding: ASCII', self.novel):
         self.novel = strip_headers(self.novel)
         self.novel = self.novel.replace('\n', ' ')
         self.novel = TextBlob(self.novel)
         self.novel_sentences = self.novel.sentences
         self.m = str(self.novel_num)
         with open('novel_' + self.m + 'list_1.csv', 'wb') as f:
             writer = csv.writer(f)
             for sentence in self.novel_sentences:
                 writer.writerow([sentence])
     else:
         rejects_2.append(self.novel_num)
         return False
Code example #40
File: process.py  Project: RealTimeWeb/datasets
def check_text():
    with open('raw.json') as inputfile:
        data = json.load(inputfile)
    for record in tqdm(data):
        id = record['metadata']['id']
        title = clean(record['book']['title'])
        text = load_etext(id)
        if id in lookup_dates:
            release_date = lookup_dates[id]
        else:
            for line in text.split("\n"):
                if line.startswith('Release Date:'):
                    release_date = line.replace('Release Date:', '').split('[')[0]
                    break
            else:
                print id, title
        record['book']['author'] = record['author']
        author_name = record['book']['author']['name']
        vals.add(record['book']['author']['birth'])
        if record['book']['author']['birth'] == None:
            record['book']['author']['birth'] = 0
        if record['book']['author']['death'] == None:
            record['book']['author']['death'] = 0
        vals2.add(record['book']['author']['birth'])
        record['book']['author']['name'] = clean(author_name) if author_name else "Unknown"
        del record['author']
        month, day, year = extract_date(release_date)
        release_date = release_date.strip()
        record['book']['publication'] = {
            'full': release_date  if month != 'Jan' else release_date.replace('Jan', 'January'),
            'year': year,
            'month name': month if month != 'Jan' else 'January',
            'month': month_lookup[month],
            'day': day
        }
        record['bibliography'] = record['book']
        del record['book']
        record['metrics'] = record['statistics']
        del record['statistics']
    with open('classics-2.json', 'w') as output:
        json.dump(data, output, indent=2)
Code example #41
File: fetch.py  Project: LSaldyt/Plutus
def fetch_gutenberg(filename=None):
    from gutenberg.acquire import load_etext
    from gutenberg.cleanup import strip_headers
    from gutenbergsettings import popularTitles, saveInterval

    start    = time.time()
    lastsave = start

    with redirect(filename):
        try:
            for title in popularTitles:
                text = strip_headers(load_etext(title)).strip()
                serialize([(title, text)], '../serialized/guten%s' % title)
                sinceLast = time.time() - lastsave
                print('%s since last save' % sinceLast)
                if sinceLast > saveInterval:
                    concatenate('guten')
                    lastsave = time.time()
        except KeyboardInterrupt:
            concatenate('guten')
            sys.exit(0)
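The gutenbergsettings module is not shown; a minimal sketch with illustrative values (despite the name, popularTitles must hold etext ids, since they are passed to load_etext):

popularTitles = [1342, 2701, 84]  # Gutenberg etext ids
saveInterval = 300  # seconds between concatenations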
Code example #42
def extract_subroutine(data, src_dir, century):
    session = model.get_session()
    backoff = 1

    counter = 0
    for metadata in data:
        contents = extract_book_contents(metadata)

        if contents is None:
            backoff *= 1.5
            continue

        title = metadata['title']
        author = metadata['author']
        e_id = metadata['id']

        if type(title) == list:
            title = dasherize(title)

        text_file_path = os.path.join(src_dir, dasherize(title.split(" ")))
        text = strip_headers(load_etext(e_id)).strip()
        f = open(text_file_path, "w")
        f.write(text)
        f.close()
        book = model.Book(
            title=title,
            author=author,
            e_id=e_id,
            century=century,
            text_file_path=text_file_path
        )
        session.add(book)
        session.commit()
        log.info("successfully added " + title)
        counter += 1
        time.sleep(backoff)

    log.info("---- finished run. added %d books ----" % counter)
Code example #43
File: admin_gender.py  Project: tcql/gutenberg_scifi
def main():
    filename = "gutenbergscifi.csv"
    json_filename = "gendered_words.json"
    
    if os.path.isfile(filename):
        print "file exists"
        
    else:
        write_csv = extract_text_urls(filename)
        print "file created"

    book_list = open_csv(filename)
    
    print book_list
    
    for i, value in enumerate(book_list):
        #print i, value
        
        a = int(book_list[i][2]) # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
        #print text  
        
        clean_text = remove_punc_html(text)
        ratio = gender_ratio(clean_text)
        
        print author, title, ratio
        uber_key = author
        sub_key = title
        sub_value = ratio
        uber_value = {author: {title:ratio}}

        json_source = read_write_json_object(
            json_filename="gendered_words.json",
            uber_key=uber_key,
            uber_value=uber_value,
            sub_key=sub_key,
            sub_value=sub_value,
            READ=False,
            WRITE=True)
Code example #44
def download_book(title, gutenberg_id, data_path, sleep=0):
    print('downloading {:}'.format(title))

    full_text = strip_headers(load_etext(gutenberg_id)).strip()
    summary = downloadSummary(title)

    if full_text is None:
        print('Full text is None. Skipping {:}'.format(title))
        return
    if summary is None:
        print('Summary is None. Skipping {:}'.format(title))
        return

    output_data = {'title': title,
                   'full_text': full_text,
                   'summary': summary}
        
    output_file = os.path.join(data_path,
                               '{:}.json'.format(gutenberg_id))
    with open(output_file, 'w') as f:
        json.dump(output_data, f, ensure_ascii=False)

    time.sleep(sleep)
Code example #45
File: getbooks.py  Project: sarah-j-smith/gutenwords
def getBook(bookDetails):
    global timeAtLastFetch
    cachedFilename = "cache/" + fileNameForBook(bookDetails) + ".txt"
    if os.path.isfile(cachedFilename):
        with open(cachedFilename) as bookfile:
            text = bookfile.read()
            return TextBlob(text)

    nowMS = milliseconds()
    timeSinceLastFetch = nowMS - timeAtLastFetch
    if timeSinceLastFetch < gutenbergWaitTimeMS:
        waitTime = gutenbergWaitTimeMS - timeSinceLastFetch
        print "    waiting {}ms for Gutenberg...".format(waitTime)
        time.sleep(waitTime / 1000)

    bookId = bookDetails['id']
    print "Fetching from Gutenberg id {}".format(bookId)
    source = load_etext(bookId)
    print "    cleaning...."
    source = removeUnicodeWords.sub("", source)
    source = removeUnicodeCharacters.sub("", source)
    source = removePossessive.sub("", source)
    source = removeWordsWithApostrophe.sub("", source)
    source = removeHyphens.sub(" ", source)
    source = removeChapterHeaders.sub("", source)
    source = removeRomanNumerals.sub("", source)
    source = removeEllipsis.sub("", source)
    text = strip_headers(source).strip()
    timeAtLastFetch = milliseconds()
    if not os.path.isdir("cache"):
        os.mkdir("cache")
    bookfile = open(cachedFilename, 'w')
    bookfile.write(text)
    bookfile.close()
    print "    fetched and cached " + bookDetails['title']
    return TextBlob(text)
Code example #46
File: reader.py  Project: sfrapoport/litbot
def generate_tweets(gutenberg_id, total=24):
    document = []
    text = strip_headers(load_etext(gutenberg_id)).strip()
    lines = text.split('\n')    
    print get_metadata('title', gutenberg_id)
    for line in lines:
        words = re.findall(regex, line)
        document.extend(words)

    trigrams = zip(document, document[1:], document[2:])
    trigram_transitions = defaultdict(list)
    starts = []

    for prev, current, next in trigrams:
            if prev == ".":
                    starts.append(current)
            trigram_transitions[(prev, current)].append(next)

    def generate_using_trigrams():
            current = random.choice(starts)
            prev = "."
            result = [current]
            while True:
                    next_word_candidates = trigram_transitions[(prev, current)]
                    next_word = random.choice(next_word_candidates)
                    prev, current = current, next_word
                    if current != ".":
                        result.append(current)
                    else:
                        return " ".join(result) + current
    tweets = [];
    while len(tweets) < total:
        tweet = generate_using_trigrams()
        if len(tweet) <= 140:
            tweets.append(tweet)
    return tweets
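A usage sketch for the generator above; regex must exist at module level, e.g. a word-or-period pattern (the pattern and book id here are illustrative):

import re
regex = re.compile(r"[\w']+|[.]")

tweets = generate_tweets(2701)  # defaults to 24 tweets
for tweet in tweets:
    print tweet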
Code example #47
# -*- coding:utf-8 -*-

# Libraries
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

texto=load_etext(2000)
texto=strip_headers(texto)


#Replace the fragment "qu" with "k" so that "(q)ue" or "(q)ui" is not counted as a diphthong
#Replace diphthongs containing "y" by adding "-" so that words where the "y" is a consonant rather than a vowel are not matched.
texto = texto.replace("que", "ke")
texto = texto.replace("qui", "ki")
texto = texto.replace("gue", "ke")
texto = texto.replace ("gui", "ki")
texto = texto.replace ("ay", "ay-")
texto = texto.replace (u"áy", u"áy-")
texto = texto.replace ("ey", "ey-")
texto = texto.replace (u"éy", u"éy-")
texto = texto.replace ("oy", "oy-")
texto = texto.replace ("uy", "uy-")

texto = texto.lower()


# Split the text into words
# Diphthong: a combination of an open vowel (/a e o/) with a closed one (/i u/), or vice versa; the closed vowel must not be stressed.
# A space must be used to indicate that the "y" has to be at the end of the word
palabras=texto.split()
dic_diptongos={ 
Code example #48
def split_sentences(text):
	for sentence_separator in [u'. ',u'.\n',u'? ',u'! ',u'?\n',u'!\n',u'; ',u';\n',u'- ',u'--',u'...',u'\n',u'\n\n',u'\n\n\n']:
		text=text.replace(sentence_separator,u'|||')
	return text.split(u'|||')


# Report how many books the corpus contains.
print u'Total de libros en español:',len(codigos_libros.es)


# Now load the books and strip their headers.
dic_oraciones_es={}
total_palabras_es=0
for codigo_libro_es in codigos_libros.es:
	texto=load_etext(codigo_libro_es)
	texto=strip_headers(texto)
	
# In each book, split the sentences, delimited by the ||| symbol.
	oraciones_libro=split_sentences(texto)
	for oracion_libro in oraciones_libro:
		palabras=rufino.split_words(oracion_libro)
		numero_palabras_oracion=len(palabras)
		total_palabras_es+=numero_palabras_oracion
		if numero_palabras_oracion not in dic_oraciones_es:
			dic_oraciones_es[numero_palabras_oracion]=1
		else:
			dic_oraciones_es[numero_palabras_oracion]=dic_oraciones_es[numero_palabras_oracion]+1


print u'Total de oraciones en español:',len(dic_oraciones_es)
Code example #49
File: test.py  Project: aklreaxmer/LATAWCapstone
import nltk
from nltk.text import Text
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

id = input("Input gutenberg id to load: ")
text=strip_headers(load_etext(id)).strip()
raw_input("Enter to print text preview...")
print(text[:1000])

text = text.split()
text=Text(text)
def ask():
	test = raw_input("Which analysis to perform ('list' to see list): ")
	if(test == "list"):
		print("concordance, dispersionplot, wordcount, lexicaldiversity, frequency, collocations")
		ask()
	if(test == "concordance"):
		conc = raw_input("word: ")
		text.concordance(conc)
		ask()
	if(test == "dispersionplot"):
		disp = []
		keepasking = True;
		i=0;
		while(keepasking):
			word = raw_input("word " + str(i) + " (blank to stop): ")
			if(len(word) > 0):
				disp.append(word)
			else:
				keepasking = False;
Code example #50
File: inter.py  Project: andresfpc/problemasenclase
17406, 17430, 17491, 20401, 21651, 23206, 23236, 24536, 24601, 24925, 25317, 25640, 25687, 
25777, 25988, 26284, 26655, 27736, 29497, 29506, 29663, 29799, 29831, 30053, 30122, 30425, 
30535, 30903, 30986, 31013, 31464, 31541, 31613, 31637, 31707, 32235, 32315, 32364, 33690, 
35882, 36253, 36453, 36573, 36940, 37067, 37095, 37139, 37637, 38814, 39444, 39613, 39990, 
41746, 42727,)



guardar_cadena=0
cadena=u''
interjecciones={}
contador = 0


for texto in textos:  # Repeat the loop for each book
    texto = load_etext(texto)  # Load the text
    texto = strip_headers(texto).lower()  # Strip the headers
    texto = unicode(texto)
    for caracter in texto:  # walk the text character by character
        if caracter == u'¡':  # If an opening exclamation mark is found
            guardar_cadena = 1  # set a flag to start saving the string
            cadena = cadena + unicode(caracter)
        if caracter == u'!':  # If a closing exclamation mark is found
            cadena = cadena + unicode(caracter)  # 1. save that last character (optional)
            if cadena in interjecciones.keys():  # 2. check whether the string is already in the dictionary
                interjecciones[cadena] += 1  # 3. if it is, increment its counter
            else:  # 4. if it is not
                interjecciones[cadena] = 1  # add it, starting from 1
            guardar_cadena = 0  # 5. reset the flag so nothing more is saved
            cadena = ''
        if guardar_cadena == 1:  # 6. check whether the save flag is set
Code example #51
"""
Created on Wed Aug 12 18:06:45 2015

@author: Tony
Description: Pull etext numbers from Project Gutenberg for an author

1) First pip install gutenberg 0.4.0 library for Python from the command line

"""
 
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


# get the catalogue numbers of all the texts
# by Wilhelm Grimm in Project Gutenberg
bookList=get_etexts('author', 'Grimm, Wilhelm Carl')
# gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591]

#Once We can associate a number with a title we can pull the text
for number in bookList:
    print(number,get_metadata('title',number))
 
print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n')
# Once we have the text number we can print the text
# example 11027 is the number for Grimm's Fairy Stories 
# can be temperamental, truncating text at the top (console limit?); may need to work around
etext = strip_headers(load_etext(11027)).strip()
print(etext)
Code example #52
    def open_file(self, file_id):
        """
            Opens a file from project gutenberg 
        """

        return load_etext(file_id)
Code example #53
File: process_gutenberg.py  Project: ssamot/Novels
print total
i = 0
for (dirpath, dirnames, filenames) in walk(gutenberg_path):
    for filename in filenames:
        f =  "/".join([dirpath, filename])
        if(f.endswith(".rdf")):
            #print f
            i+=1
            bf = BeautifulSoup(open(f))
            subjects =  bf.find_all("dcterms:subject")
            if (subjects is not None and len(subjects) > 0):
                for subject in subjects:
                    val =  subject.find_all("rdf:value")[0].contents[0]
                    for i_subject in i_subjects:
                        if(i_subject in val.lower()):
                            #print f, val

                            id =  int(basename(f)[2:-4])
                            fn = str(id).zfill(10) + "_" +  i_subject + ".txt"
                            print fn
                            try:
                                text = strip_headers(load_etext(id)).strip().encode("utf-8")
                                wf = "./texts/" + fn
                                with open(wf, "w") as text_file:
                                    text_file.write(text)
                                print i, total, float(i)/total
                            except:
                                print "broken", id
            # for network in tree.findtext('dcterms subject'):
            #     print network
Code example #54
#Made in Python 3.5
from gutenberg.acquire import load_etext 
from gutenberg.cleanup import strip_headers

librosCodigo = {"Francés":[13735,13808],"Español":[24925,15027],"Portugés":[14904,16384],"Inglés":[10422,1013]}
dic_idiomas={}

for idioma in librosCodigo.keys():
    diccionario_largo_palabras={}

    for indeCo in librosCodigo[idioma]:
        texto= strip_headers(load_etext(indeCo))
        dic_idiomas[idioma]= diccionario_largo_palabras

        for caracter_especial in ['"',"...","¿","?","=","_","[","]","(",")",",",".",":",";","!","¡","«","»","*","~","' "," '","- "," -","--"]:
            texto=texto.replace(caracter_especial," ")
        palabras=texto.split()

        for palabra in palabras:
            largo_palabra = len(palabra)
            if largo_palabra in diccionario_largo_palabras:
                diccionario_largo_palabras[largo_palabra] = diccionario_largo_palabras[largo_palabra]+1
            else:
                diccionario_largo_palabras[largo_palabra]= 1
print (dic_idiomas)
Code example #55
"""
Created on Sun Sep 20 19:49:20 2015

@author: weizhi
"""

import nltk
from nltk.corpus import gutenberg
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


from gutenberg.acquire import metadata
text = load_etext(201)
print text[:100]
#assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')

import rdflib
g = rdflib.Graph()



from gutenberg.acquire import metadata


output = metadata._create_metadata_graph(store='Sleepycat')
#downLoad = metadata._download_metadata_archive()

from gutenberg.query.api import get_metadata  # noqa
Code example #56
def downloadText(textID):
    print "Downloading", textID
    text = strip_headers(load_etext(textID)).strip()
    return text
Code example #57
    for item in ners:
        # Loop over the Stanford NER (per/ person) results and apply
        # probablepeople, which raises when it fails (hence the try).
        if "per" in item["tag"].lower():
            result = None
            try:
                result = parse(item.get('string'))
            except Exception:
                log.error("Could not run probablepeople")

            if result:
                pp.append(result)
    ner["pp"] = pp
    return ner


if __name__ == '__main__':
    if len(sys.argv) >= 2 and 'test' in " ".join(sys.argv):
        import doctest
        doctest.testmod(verbose=True)

    if len(sys.argv) >= 2 and 'profile' in " ".join(sys.argv):
        from gutenberg.acquire import load_etext
        from gutenberg.cleanup import strip_headers
        from pycallgraph import PyCallGraph
        from pycallgraph.output import GraphvizOutput

        text = smart_text(strip_headers(load_etext(54807)).strip())
        with PyCallGraph(output=GraphvizOutput()):
            stanford_ner_wrapper(text, 9992, True)
Code example #58
            except Exception, e:
                logging.error("%s: %s" % (path, e))
                # raise e

    @classmethod
    def text_from_zip(cls, path, rdf_catalog_path=None):
        """Return a ProjectGutenbergText object from a zip file."""
        archive = zipfile.ZipFile(path)
        inside = archive.filelist
        filenames = [x.filename for x in inside]
        if len(inside) != 1:
            logging.warn("Supposedly plain-text %s has %d files in zip: %s" % (
                    path, len(inside), ", ".join(filenames)))
        possibilities = [x for x in filenames if x.lower().endswith(".txt")]
        data = archive.read(possibilities[0])
        return ProjectGutenbergText(data, path, rdf_catalog_path)

    @property
    def paragraphs(self):
        return self.text.split("\r\n\r\n")

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

text = strip_headers(load_etext(2701)).strip()
assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')

# raw_data is assumed to be defined earlier in the original script
Obj = ProjectGutenbergText(text, name=None, rdf_catalog_path=raw_data)
Code example #59
# -*- coding:utf-8 -*-

from gutenberg.acquire import load_etext 
from gutenberg.cleanup import strip_headers

dic_cont_interjecciones={}

textos=load_etext(1619)
texto=strip_headers(textos).lower()
guardar_cadena=0
cadena=''
interjecciones={}

# texto already holds the single cleaned book, so iterate its characters directly
for caracter in texto:
    if caracter == u'¡':
        guardar_cadena = 1
    if caracter == u'!':
        cadena = cadena + caracter
        if cadena in interjecciones.keys():
            interjecciones[cadena] += 1
        else:
            interjecciones[cadena] = 1
        guardar_cadena = 0
        cadena = ''
    elif guardar_cadena == 1:
        cadena = cadena + caracter

for interjeccion in sorted(interjecciones.keys()):
	print interjeccion, interjecciones[interjeccion]

Code example #60
from numpy import random
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


bookNumber = set(random.randint(10,50024,size=2000))
#f.write(foo.encode('utf8'))

metaInfo = []

for item in bookNumber:
   # print item
    try: 
       # print item
        # loading the raw txt 
        data = load_etext(item).split("\n")
        
        # save the txt data path
        filePath = rdfPath + '/' +str(item) + '/' +  str(item) + '.txt'
        f = open(filePath,'w')
        f.write(u"\n".join(data).encode('utf8'))  # data is a list of lines
        f.close()
        # get the meta data 
        Dict = obj.metaData(data)
        metaInfo.append((Dict,filePath))
        print len(metaInfo)
    except:
        continue
#%%do the data mining to these txt, author, title, release time, etc, need time to work on this part