Beispiel #1
0
    def load_gutenberg(self, language='en'):
        """Load all Gutenberg texts by ``self.author`` in *language*.

        Populates ``self.books`` (title -> list of paragraphs) and
        ``self.tokens`` (title -> POS-tagged tokens per paragraph),
        skipping duplicate titles and books without a text format.

        :param language: two-letter language code filter (default 'en')
        """
        texts = get_etexts('author', self.author)
        # Map etext id -> title, keeping only books in the requested language.
        texts = {
            t: list(get_metadata("title", t))[0]
            for t in texts if list(get_metadata("language", t))[0] == language
        }

        new_texts = dict()
        seen_titles = list()
        for etext_id, title in texts.items():
            title = title.replace("\r\n", " ")
            if title in seen_titles:
                # Duplicate edition of a title we already loaded -- skip it.
                continue
            seen_titles.append(title)
            new_texts[etext_id] = title
            try:
                self.books[title] = strip_headers(
                    load_etext(etext_id)).strip().split("\r\n\r\n")
            except UnknownDownloadUriException:
                print(
                    f'Book "{title}" does not have a text format and was not loaded.'
                )
                # Roll back the bookkeeping for the failed download.
                del new_texts[etext_id]
                seen_titles.remove(title)
                continue
            # Tokenize and POS-tag each paragraph (iterate directly instead
            # of the original range(len(...)) indexing).
            self.tokens[title] = [
                nltk.pos_tag(nltk.word_tokenize(paragraph))
                for paragraph in self.books[title]
            ]

        texts = new_texts

        print(texts)
def init_books(author_file, json_file):
    """initialize book list with texts and save it to disk"""
    with open(author_file) as f:
        authors = [line.strip() for line in f]

    books = []
    for author in authors:
        for etext_id in get_etexts('author', author):
            try:
                if list(get_metadata('language', etext_id))[0] == 'en':
                    title = list(get_metadata('title', etext_id))[0]
                    etext = strip_headers(load_etext(etext_id)).strip()
                    books.append(Book(etext_id, title, etext))
            except UnknownDownloadUriException:
                # this book does not have a load_etext corresponding to it.
                pass

    with open(json_file, 'wb') as f:
        pickle.dump(books, f)

    print(len(books))
Beispiel #3
0
def gatherMetaData(bookID, text):
    """Collect Gutenberg metadata for *bookID* and persist it.

    Metadata types (from https://github.com/hugovk/gutenberg-metadata):
    id, author, formaturi (ignored here), language, rights, subject, title,
    plus the full text which the caller supplies.

    :param bookID: Gutenberg etext number
    :param text: full text of the book, stored alongside the metadata
    """
    # Each of these features is expected to have exactly one value;
    # the single-element unpack raises if that assumption is violated.
    title, = get_metadata("title", bookID)
    author, = get_metadata("author", bookID)
    language, = get_metadata("language", bookID)
    rights, = get_metadata("rights", bookID)
    # 'subject' is multi-valued; materialize the frozenset as a list.
    subject_items = list(get_metadata("subject", bookID))

    # The database column expects bytes: serialize the subjects to JSON,
    # then encode (replaces the manual loop + bytes(...) roundtrip).
    subject_bytes = json.dumps(dict(subject=subject_items)).encode("utf8")

    storeInformationIntoDatabase(bookID, text, title, author, language, rights,
                                 subject_bytes)
Beispiel #4
0
 def test_read_deleted_cache(self):
     """Reading from a deleted cache must raise InvalidCacheException."""
     self.cache.populate()
     set_metadata_cache(self.cache)
     self.cache.delete()
     # assertRaises makes the test FAIL if the exception is not raised;
     # the original try/except-pass silently passed either way.
     with self.assertRaises(InvalidCacheException):
         get_metadata('title', 50405)
 def test_read_deleted_cache(self):
     """Reading from a deleted cache must raise InvalidCacheException."""
     self.cache.populate()
     set_metadata_cache(self.cache)
     self.cache.delete()
     # assertRaises makes the test FAIL if the exception is not raised;
     # the original try/except-pass silently passed either way.
     with self.assertRaises(InvalidCacheException):
         get_metadata('title', 50405)
 def test_read_unpopulated_cache(self):
     """Reading from an unpopulated cache must raise InvalidCacheException."""
     set_metadata_cache(self.cache)
     # assertRaises asserts the exception actually fires; the original
     # try/except-pass (plus a no-op `except: raise`) passed even when
     # no exception was raised.
     with self.assertRaises(InvalidCacheException):
         get_metadata('title', 50405)
Beispiel #7
0
 def test_read_unpopulated_cache(self):
     """Reading from an unpopulated cache must raise InvalidCacheException."""
     set_metadata_cache(self.cache)
     # assertRaises asserts the exception actually fires; the original
     # try/except-pass (plus a no-op `except: raise`) passed even when
     # no exception was raised.
     with self.assertRaises(InvalidCacheException):
         get_metadata('title', 50405)
Beispiel #8
0
    def test_refresh(self):
        """Metadata lookups must keep working after a cache refresh."""
        self.cache.populate()
        set_metadata_cache(self.cache)
        title = get_metadata('title', 30929)
        self.assertIn('Het loterijbriefje', title)

        # Refreshing rebuilds the cache; the same query must still succeed.
        self.cache.refresh()
        title = get_metadata('title', 30929)
        self.assertIn('Het loterijbriefje', title)
Beispiel #9
0
    def test_refresh(self):
        """Metadata lookups must keep working after a cache refresh."""
        self.cache.populate()
        set_metadata_cache(self.cache)
        title = get_metadata('title', 30929)
        self.assertIn('Het loterijbriefje', title)

        # Refreshing rebuilds the cache; the same query must still succeed.
        self.cache.refresh()
        title = get_metadata('title', 30929)
        self.assertIn('Het loterijbriefje', title)
def trial():
    """Smoke-test the gutenberg API: load one text, then query metadata in
    both directions (etext number <-> title/author)."""
    text = strip_headers(load_etext(2701)).strip()
    print(text)  # 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'

    # Forward lookups: etext number -> frozenset of metadata values.
    print(get_metadata('title', 2701))   # frozenset([u'Moby Dick; Or, The Whale'])
    print(get_metadata('author', 2701))  # frozenset([u'Melville, Hermann'])

    # Reverse lookups: metadata value -> frozenset of etext numbers.
    print(get_etexts('title', 'Moby Dick; Or, The Whale'))  # frozenset([2701, ...])
    print(get_etexts('author', 'Melville, Herman'))         # frozenset([2701, ...])
def metadata(book_id):
    """

    Args:
        book_id: book id (integer)

    Returns:
        metadata of that book: a dictionary with keys ["title", "authors", "language", "bookshelves"],
                                which bookshelves is an empty set
    """
    def _as_str_set(feature):
        # Each gutenberg feature is a frozenset; normalize values to str.
        return {str(value) for value in gq.get_metadata(feature, book_id)}

    return book_metadata(book_id,
                         _as_str_set('title'),
                         _as_str_set('author'),
                         _as_str_set('language'),
                         set())
Beispiel #12
0
 def test_repopulate(self):
     """A deleted cache can be repopulated and then queried again."""
     self.cache.populate()
     set_metadata_cache(self.cache)
     self.cache.delete()
     # Repopulating after deletion must restore normal lookups.
     self.cache.populate()
     title = get_metadata('title', 30929)
     self.assertIn(u('Het loterijbriefje'), title)
Beispiel #13
0
def get_all_metadata(last_ebook_id):
    """Fetch metadata for every ebook id from 1 to *last_ebook_id* and dump
    the result to gutenberg-metadata.json."""
    features = (
        "author",
        "formaturi",
        "language",
        "rights",
        "subject",
        "title",
    )
    metadata = AutoVivification()

    # Etext ids start at 1.
    for ebook_id in range(1, last_ebook_id + 1):
        if ebook_id % 100 == 0:
            # Lightweight same-line progress indicator.
            sys.stdout.write(str(ebook_id) + "\r")
        for feature_name in features:
            metadata[ebook_id][feature_name] = get_metadata(feature_name, ebook_id)
    sys.stdout.write("\r\n")

    with open("gutenberg-metadata.json", "w") as fp:
        json.dump(
            metadata,
            fp,
            cls=SetEncoder,
            indent=0,
            separators=(",", ":"),
            sort_keys=True,
        )
Beispiel #14
0
 def test_repopulate(self):
     """A deleted cache can be repopulated and then queried again."""
     self.cache.populate()
     set_metadata_cache(self.cache)
     self.cache.delete()
     # Repopulating after deletion must restore normal lookups.
     self.cache.populate()
     title = get_metadata('title', 30929)
     self.assertIn(u('Het loterijbriefje'), title)
Beispiel #15
0
def acquire_corpora():
    """Pick a random Gutenberg book that has a title and return
    [id, title, uri, author]."""
    while True:
        candidate = randint(100, 10000)
        if not get_metadata('title', candidate):
            # No title metadata -- try another random id.
            continue
        # uri = get_uri(book, 'images')
        uri = 'http://www.gutenberg.org/ebooks/{}'.format(candidate)
        return [candidate, get_title(candidate), uri, get_author(candidate)]
Beispiel #16
0
def get_joyce_texts():
    """Download and strip every Project Gutenberg text by James Joyce.

    Returns:
        dict mapping etext id -> cleaned full text.
    """
    # The original also built a joyce_titles list (one get_metadata call per
    # book) that was never used; that dead work is removed here.
    joyce_texts = {}
    for key in get_etexts('author', 'Joyce, James'):
        joyce_texts[key] = strip_headers(load_etext(key)).strip()
    return joyce_texts
Beispiel #17
0
def get_metadata(idx):
    """Return every supported metadata feature for etext *idx*, plus the
    text length under the 'length' key.  Also prints the result."""
    ret = {
        key: list(q.get_metadata(key, idx))
        for key in q.list_supported_metadatas()
    }
    ret['length'] = len(get_content(idx))
    print(ret)
    return ret
Beispiel #18
0
def get_all_titles():
    """Scan etext ids 1..64999 and catalog every English book found."""
    my_catalog = book_catalog()

    def _joined(feature, idx):
        # Flatten the metadata frozenset into one string with newlines
        # normalized to spaces.
        return ''.join(get_metadata(feature, idx)).replace(
            "\n", " ").replace("\r", " ")

    for i in range(1, 65000):
        title = ''.join(get_metadata('title', i))
        if not title:
            continue
        if list(get_metadata('language', i))[0] != 'en':
            continue
        my_book = book(i, _joined('author', i), _joined('title', i),
                       _joined('subject', i))
        my_catalog.add_book(my_book)
    return my_catalog
Beispiel #19
0
 def _run_get_metadata_for_feature(self, feature):
     """Check get_metadata(*feature*, ...) against every sample-data fixture."""
     for testcase in self.sample_data():
         expected = getattr(testcase, feature)
         actual = get_metadata(feature, testcase.etextno)
         # assertEqual on sets reports the actual difference on failure;
         # assertTrue(a == b) only reported the canned message.
         self.assertEqual(
             set(actual), set(expected),
             u('non-matching {feature} for book {etextno}: '
               'expected={expected} actual={actual}').format(
                   feature=feature,
                   etextno=testcase.etextno,
                   actual=actual,
                   expected=expected))
Beispiel #20
0
 def _run_get_metadata_for_feature(self, feature):
     """Check get_metadata(*feature*, ...) against every sample-data fixture."""
     for testcase in self.sample_data():
         expected = getattr(testcase, feature)
         actual = get_metadata(feature, testcase.etextno)
         # assertEqual on sets reports the actual difference on failure;
         # assertTrue(a == b) only reported the canned message.
         self.assertEqual(
             set(actual), set(expected),
             u('non-matching {feature} for book {etextno}: '
               'expected={expected} actual={actual}').format(
                 feature=feature,
                 etextno=testcase.etextno,
                 actual=actual,
                 expected=expected))
Beispiel #21
0
def search(query: str, include: Optional[str] = None) -> List[dict]:
    """Find etexts matching every (field, value) term of *query*.

    :param query: conjunctive query string understood by parse_search
    :param include: optional comma-style field list understood by parse_include;
        each listed field's metadata is attached to every result
    :return: list of dicts, each with 'text_id' plus any requested fields
    :raises StopIteration: if the parsed query contains no terms (unchanged
        from the original behavior)
    """
    fields = parse_include(include) if include else []
    conjunction = parse_search(query)

    # Generator of per-term result sets (no redundant iter() wrapper).
    parts = (get_etexts(field, value) for field, value in conjunction)
    results = set(next(parts))
    # Intersect the remaining term results with a plain loop instead of a
    # side-effecting list comprehension.
    for part in parts:
        results.intersection_update(part)

    return [
        dict([('text_id', text_id)] + [(field, get_metadata(field, text_id))
                                       for field in fields])
        for text_id in results
    ]
def main():
    """Save the first few Gutenberg books to disk and write matching
    Goodreads metadata to output/log.json."""

    # setting up the API keys from the environment
    goodreads_key = os.environ['GOODREADS_KEY']
    goodreads_secret = os.environ['GOODREADS_SECRET']

    # creating a client for book search and information retrieval
    gc = client.GoodreadsClient(goodreads_key, goodreads_secret)

    current_path = os.getcwd()

    gutenberg_titles = []

    # Getting the title of the first books on Project Gutenberg (fast)
    for i in range(1, 10):
        title = list(get_metadata('title', i))
        if title:
            # prepare the string for the file name (alphanumerics + spaces)
            filename = ''.join(
                e for e in title[0] if e.isalnum() or e == ' ') + ".txt"
            gutenberg_titles.append(filename[:-4])
            text = strip_headers(load_etext(i)).strip()
            with open(os.path.join(current_path, "output", filename),
                      "w") as output_file:
                output_file.write(text)

    titles = dict()
    # Searching for the books on Goodreads, reading their metadata
    for book_title in gutenberg_titles:
        try:
            lst = gc.search_books(book_title, search_field='title')

            if not lst:
                continue
            book = lst[0]

            titles[book.title] = (
                book_title + ".txt", str(book.popular_shelves),
                str(book.similar_books), str(book.authors),
                dict(dict(book.work)['original_publication_year'])['#text'])
        except (request.GoodreadsRequestException, KeyError, TypeError):
            continue

    # Open the log only when writing, via a context manager, so the handle
    # is always closed (the original opened it up front and leaked it on
    # any exception before file.close()).
    with open(os.path.join(current_path, "output", "log.json"), "w") as log_file:
        json.dump(titles, log_file, indent=4)
Beispiel #23
0
def get_title_gutenberg(gutenberg_id):
    """
    Gets title for novel with this gutenberg id

    >>> from gender_novels import corpus_gen
    >>> get_title_gutenberg(33)
    'The Scarlet Letter'

    """
    # Take the first title value, then cut it off at the first truncator.
    raw_title = list(get_metadata('title', gutenberg_id))[0]
    for separator in TRUNCATORS:
        raw_title = raw_title.split(separator, 1)[0]
    return raw_title
Beispiel #24
0
def generate_tweets(gutenberg_id, total=24):
    """Generate *total* tweet-sized (<=140 char) sentences from a Gutenberg
    book using a trigram Markov chain.

    :param gutenberg_id: Gutenberg etext number to build the chain from
    :param total: number of tweets to generate (default 24)
    :return: list of generated sentence strings
    """
    document = []
    text = strip_headers(load_etext(gutenberg_id)).strip()
    lines = text.split('\n')
    # Python 3 print call (original used the Python 2 print statement,
    # a syntax error on Python 3).
    print(get_metadata('title', gutenberg_id))
    for line in lines:
        words = re.findall(regex, line)
        document.extend(words)

    # Record sentence starts and (prev, current) -> next transitions.
    trigrams = zip(document, document[1:], document[2:])
    trigram_transitions = defaultdict(list)
    starts = []

    for prev, current, nxt in trigrams:  # nxt: avoid shadowing builtin next
        if prev == ".":
            starts.append(current)
        trigram_transitions[(prev, current)].append(nxt)

    def generate_using_trigrams():
        # Walk the chain from a random sentence start until a period.
        current = random.choice(starts)
        prev = "."
        result = [current]
        while True:
            next_word_candidates = trigram_transitions[(prev, current)]
            next_word = random.choice(next_word_candidates)
            prev, current = current, next_word
            if current != ".":
                result.append(current)
            else:
                return " ".join(result) + current

    tweets = []
    while len(tweets) < total:
        tweet = generate_using_trigrams()
        if len(tweet) <= 140:
            tweets.append(tweet)
    return tweets
Beispiel #25
0
def generate_tweets(gutenberg_id, total=24):
    """Generate *total* tweet-sized (<=140 char) sentences from a Gutenberg
    book using a trigram Markov chain.

    :param gutenberg_id: Gutenberg etext number to build the chain from
    :param total: number of tweets to generate (default 24)
    :return: list of generated sentence strings
    """
    document = []
    text = strip_headers(load_etext(gutenberg_id)).strip()
    lines = text.split('\n')
    # Python 3 print call (original used the Python 2 print statement, a
    # syntax error on Python 3); inconsistent 8-space indentation and a
    # stray trailing semicolon are normalized as well.
    print(get_metadata('title', gutenberg_id))
    for line in lines:
        words = re.findall(regex, line)
        document.extend(words)

    # Record sentence starts and (prev, current) -> next transitions.
    trigrams = zip(document, document[1:], document[2:])
    trigram_transitions = defaultdict(list)
    starts = []

    for prev, current, nxt in trigrams:  # nxt: avoid shadowing builtin next
        if prev == ".":
            starts.append(current)
        trigram_transitions[(prev, current)].append(nxt)

    def generate_using_trigrams():
        # Walk the chain from a random sentence start until a period.
        current = random.choice(starts)
        prev = "."
        result = [current]
        while True:
            next_word_candidates = trigram_transitions[(prev, current)]
            next_word = random.choice(next_word_candidates)
            prev, current = current, next_word
            if current != ".":
                result.append(current)
            else:
                return " ".join(result) + current

    tweets = []
    while len(tweets) < total:
        tweet = generate_using_trigrams()
        if len(tweet) <= 140:
            tweets.append(tweet)
    return tweets
Beispiel #26
0
def get_subject_gutenberg(gutenberg_id):
    """
    Tries to get subjects

    >>> from gender_novels import corpus_gen
    >>> get_subject_gutenberg(5200)
    ['Metamorphosis -- Fiction', 'PT', 'Psychological fiction']

    :param: author: str
    :param: title: str
    :param: id: int
    :return: list
    """
    # TODO: run doctest on computer with populated cache
    # sorted() accepts the frozenset directly; the intermediate list() of
    # the original was redundant.
    subjects = get_metadata('subject', gutenberg_id)
    return sorted(subjects)
Beispiel #27
0
def get_author_gutenberg(gutenberg_id):
    """
    Gets author or authors for novel with this gutenberg id

    >>> from gender_novels import corpus_gen
    >>> get_author_gutenberg(33)
    ['Hawthorne, Nathaniel']
    >>> get_author_gutenberg(3178)
    ['Twain, Mark', 'Warner, Charles Dudley']

    :param gutenberg_id: int
    :return: list
    """
    # TODO: should we format author names like this?
    authors = get_metadata('author', gutenberg_id)
    return list(authors)
Beispiel #28
0
def language_invalidates_entry(gutenberg_id):
    """
    Returns False if book with gutenberg id is in English, True otherwise

    >>> from gender_novels.corpus_gen import language_invalidates_entry
    >>> language_invalidates_entry(46) # A Christmas Carol
    False
    >>> language_invalidates_entry(27217) # a Chinese-language text
    True

    :param gutenberg_id: int
    :return: boolean
    """
    languages = get_metadata('language', gutenberg_id)
    # Books with no language metadata are invalidated too (the original
    # raised IndexError on an empty frozenset).
    if not languages:
        return True
    return list(languages)[0] != 'en'
Beispiel #29
0
def rights_invalidate_entry(gutenberg_id):
    """
    Returns False if book with gutenberg id is in public domain in US, True otherwise

    >>> from gender_novels.corpus_gen import rights_invalidate_entry
    >>> rights_invalidate_entry(5200) # Metamorphosis by Franz Kafka
    True
    >>> rights_invalidate_entry(8066) # The Bible, King James version, Book 66: Revelation
    False

    :param gutenberg_id: int
    :return: boolean
    """
    rights = get_metadata('rights', gutenberg_id)
    # Valid (False) exactly when the US public-domain marker is present.
    return 'Public domain in the USA.' not in rights
Beispiel #30
0
def process(question, candidates=None, top_n=3, n_docs=3):
    """Answer *question* with DrQA and return the ranked predictions as a
    pretty-printed table string.

    :param question: question text passed to DrQA
    :param candidates: optional candidate answers forwarded to DrQA
    :param top_n: number of answers to return per document
    :param n_docs: number of documents to retrieve
    :return: str, the rendered prediction table
    """
    torch.cuda.empty_cache()
    predictions = DrQA.process(
        question, candidates, top_n, n_docs, return_context=True
    )
    table = prettytable.PrettyTable(
        ['Rank', 'Answer', 'Doc-ID', 'Doc-Title', 'Doc-Author', 'Doc-Link', 'Answer Score', 'Doc Score']
    )

    def _first_meta(feature, doc_id):
        # First metadata value for the doc, or a placeholder when absent.
        values = list(get_metadata(feature, doc_id))
        return values[0] if values else 'Not Available'

    for i, p in enumerate(predictions, 1):
        # Bug fix: the original assigned the found title to a misspelled
        # variable `tittle` but assigned the placeholder to `title`, so a
        # document without a title raised NameError (or reused the previous
        # document's title).
        title = _first_meta('title', p['doc_id'])
        author = _first_meta('author', p['doc_id'])
        url = _first_meta('formaturi', p['doc_id'])

        table.add_row([i, p['span'], p['doc_id'], title, author, url,
                       '%.5g' % p['span_score'], '%.5g' % p['doc_score']])
    print('Top Predictions:')
    print(table)
    strtable = table.get_string()
    return strtable
Beispiel #31
0
def random_gutenberg_document(language_filter='en') -> str:
    """Downloads a random document (book, etc.) from Project Gutenberg and
    returns it as a string.

    Keyword arguments:
        language_filter (str) -- restrict the random document to a particular
            language (default: English); falsy accepts any language
    """
    doc_language = None
    document = ''
    # Keep sampling until the language filter (if any) matches AND the
    # document actually has text.  The original condition combined these
    # with `and`, so it could return an empty document as soon as the
    # language matched.
    while (language_filter and doc_language != language_filter) or not document:
        document_id = random.randint(
            1, 60134)  # Pick book at random (max id is currently 60134)
        lang_metadata = get_metadata('language', document_id)
        doc_language = next(
            iter(lang_metadata)) if len(lang_metadata) else False
        document = super_cleaner(strip_headers(
            load_etext(document_id).strip()),
                                 mark_deletions=False)
    return document
Beispiel #32
0
def getNBooks(nBooks, lang, loc):
    """Download *nBooks* random Gutenberg texts in language *lang* into
    directory prefix *loc*, one '<id>.txt' file per book."""
    from gutenberg.acquire import load_etext
    from gutenberg.query import get_metadata
    from gutenberg.cleanup import strip_headers
    saved = 0
    while saved < nBooks:
        # rr: random id picker supplied by the enclosing module -- presumably
        # random.randrange-like; TODO confirm.
        book_id = rr(0, 10000)
        try:
            languages = get_metadata("language", book_id)
            if lang in languages:
                text = strip_headers(load_etext(book_id)).strip()
                # Context manager guarantees the file is flushed and closed
                # (the original open/flush/close leaked on a write error).
                with open(loc + str(book_id) + '.txt', 'w') as out:
                    out.write(text)
                print(saved + 1, book_id)
                saved += 1
        except Exception:
            # Some ids have no text/metadata; skip them.  Narrowed from a
            # bare `except`, which also swallowed KeyboardInterrupt.
            pass
def main():
    """Save the first few Gutenberg books to disk and append Goodreads
    publication info to output/log.txt."""

    # setting up the API keys from the environment
    goodreads_key = os.environ['GOODREADS_KEY']
    goodreads_secret = os.environ['GOODREADS_SECRET']

    # creating a client for book search and information retrieval
    gc = client.GoodreadsClient(goodreads_key, goodreads_secret)

    current_path = os.getcwd()

    # Context manager ensures the log file is closed even on errors
    # (the original opened it and never closed it).
    with open(os.path.join(current_path, "output", "log.txt"), "a") as log_file:
        # Getting the title of the first books on Project Gutenberg (fast)
        for i in range(1, 10):
            title = list(get_metadata('title', i))
            if title:
                # prepare the string for the file name
                filename = ''.join(e for e in title[0] if e.isalnum()) + ".txt"
                text = strip_headers(load_etext(i)).strip()
                with open(os.path.join(current_path, "output", filename),
                          "w") as output_file:
                    output_file.write(text)
                log_file.write(f"{title[0]} plaintext saved to '{title[0]}.txt'\n")

        # Getting titles and publishing years from Goodreads.
        # Pretty slow because Goodreads allows 1 request per second.
        for i in range(1, 20):
            try:
                book = gc.book(i)
                log_file.write(
                    f"{book.title} - published in {dict(dict(book.work)['original_publication_year'])['#text']}\n"
                )
            except (request.GoodreadsRequestException, KeyError):
                continue
Beispiel #34
0
def isvalid(id_num):
    """

    Check if a gutenberg book is an english textbook.

    Args:
        id_num: id of gutenberg book.

    Returns:
        a boolean which shows if id_num is id of an english text book

    """
    try:
        if 'en' not in metadata(id_num)['language']:
            return False
        # A plain-text download exists when some format URI ends in
        # '<digits>.txt'.
        form = ' '.join(get_metadata('formaturi', id_num))
        return bool(re.search(r'\d+\.txt', form))
    except Exception:
        # Missing or broken metadata makes the book unusable.  Narrowed
        # from a bare `except`, which also caught KeyboardInterrupt.
        return False
Beispiel #35
0
def download():
    """Download the first ~3000 Gutenberg texts into files/<title>.txt,
    skipping entries that are HTML rather than plain text."""
    for i in range(1, 3000):
        try:
            query = strip_headers(load_etext(i)).strip()
        except Exception:
            continue
        if 'DOCTYPE HTML PUBLIC' in query[0:100]:
            print("Not Downloaded")
            continue
        # Bug fix: use the current book's id (the original hard-coded 2701,
        # so every file was named after Moby Dick's title).
        a = ''.join(get_metadata('title', i))
        with open('files/' + a + '.txt', 'a') as f:
            try:
                f.write(query)
                print('Downloaded ' + a + '.\n')
            except UnicodeEncodeError:
                # Remove the partial file; the with-block closes the handle.
                f.close()
                os.remove('files/' + a + '.txt')
                continue
Beispiel #36
0
# Draw a reproducible random sample of Gutenberg subject labels.

from gutenberg.query import get_metadata
import random

# Fixed seed so the same 100 document ids are sampled on every run.
random.seed(a=19031003)
random_docs = random.sample(range(57700), 100)

# Union of every subject tag across the sampled documents.
labels = {
    subject
    for doc in random_docs
    for subject in get_metadata('subject', doc)
}

with open('genres.txt', 'w') as file:
    file.write(str(labels))
Beispiel #37
0
if __name__ == '__main__':
    # Directory of per-book result files named '<etext id>_<...>.txt'.
    mypath = "/home/ssamot/projects/github/gutenberg/processed/results/"

    # (full path, stem without extension, file name) per .txt file.
    onlyfiles = [ (join(mypath,f), f[:-4], f) for f in listdir(mypath) if isfile(join(mypath,f)) and f.endswith(".txt")]
    # Previously computed means; their count is the resume point.
    means = np.loadtxt("./data/ytotals.csv")
    means = list(means)
    totals = []
    print means

    # Skip files already processed in an earlier run.
    starting_point = len(means)
    print starting_point
    for i, file in enumerate(onlyfiles):
        if(i> starting_point ):
            print i, starting_point
            # The etext id is the numeric '<id>_' prefix of the file name.
            fictid = int( file[-1].split("_")[0])
            title = list(get_metadata('title', fictid))
            author = list(get_metadata('author', fictid))
            print fictid, title, author
            # Fall back to an empty author string when metadata is missing.
            if(author == []):
                author.append("")
            #print get_metadata("author", fictid)
            try:
                # NOTE(review): get_mean is defined elsewhere; presumably an
                # external rating lookup keyed on "title author" -- confirm.
                mean, total =  get_mean(title[0] + " " + author[0])
                #time.sleep(2.0)
                means.append(mean)
                totals.append(total)
            except KeyError:
                # Book not found in the ratings source; mark with -1.
                means.append(-1)
            #print means
            #print  np.array(means)
"""
Created on Wed Aug 12 18:06:45 2015

@author: Tony
Description: Pull etext numbers from Project Gutenberg for an author

1) First pip install gutenberg 0.4.0 library for Python from the command line

"""
 
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


# get the catalogue numbers of all the texts
# by Wilhelm Grimm in Project Gutenberg
bookList=get_etexts('author', 'Grimm, Wilhelm Carl')
# gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591]

#Once We can associate a number with a title we can pull the text
for number in bookList:
    print(number,get_metadata('title',number))
 
print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n')
# Once we have the text number we can print the text
# example 11027 is the number for Grimm's Fairy Stories 
# can be tempermental truncating text at top (console limit?) may need to trick around  
etext = strip_headers(load_etext(11027)).strip()
print(etext)
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 20 13:05:59 2015

@author: weizhi
"""

from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

# NOTE(review): get_metadata/get_etexts return frozensets, so these print
# e.g. frozenset({'Moby Dick; Or, The Whale'}), not bare strings/tuples.
print(get_metadata('title', 2701))  # prints frozenset({'Moby Dick; Or, The Whale'})
print(get_metadata('author', 2701)) # prints frozenset({'Melville, Hermann'})

print(get_etexts('title', 'Moby Dick; Or, The Whale'))  # prints frozenset({2701, ...})
print(get_etexts('author', 'Melville, Hermann'))        # prints frozenset({2701, ...})