Beispiel #1
0
def get_metadata(idx):
    """Collect every supported metadata value for text ``idx`` into a dict.

    Each key reported by ``q.list_supported_metadatas()`` maps to a list of
    the values returned by ``q.get_metadata(key, idx)``.  An extra
    ``'length'`` entry holds ``len(get_content(idx))``.  The resulting dict
    is printed and then returned.
    """
    metadata = {
        name: list(q.get_metadata(name, idx))
        for name in q.list_supported_metadatas()
    }
    metadata['length'] = len(get_content(idx))
    print(metadata)
    return metadata
# Lower-case substrings that mark a subject line as a genre worth downloading.
_WANTED_GENRES = (
    'science fiction',
    'horror',
    'adventure',
    'humor',
    'western',
    'mystery fiction',
    'gothic fiction',
)


def _category_slug(category):
    """Normalise one subject component into the dash-separated file-name form."""
    return (category.strip().lower().replace("-", " ").replace(".", " ")
            .replace(",", " ").strip().replace("  ", " ").replace(" ", "-")
            .replace("(", "").replace(")", "").replace("'", ""))


def _is_wanted_genre(genre):
    """Return True when the subject line mentions one of ``_WANTED_GENRES``."""
    lowered = genre.lower()
    return any(keyword in lowered for keyword in _WANTED_GENRES)


def main(start=19363, stop=100000):
    """Download English books of selected genres into the ``books`` directory.

    For every text id in ``range(start, stop)`` the title, language and
    subject metadata are looked up.  A book is saved when it has a title, one
    of its languages is ``'en'`` and at least one of its subjects matches a
    wanted genre.  The file is written as ``books/<id><categories>.txt``,
    where ``<categories>`` is built from ``_``-prefixed slugs of every
    ``--``-separated subject component.

    Args:
        start: First text id to examine (inclusive).
        stop: Text id at which to stop (exclusive).

    Errors raised while handling a single id are printed and that id is
    skipped, so one failing book does not abort the whole run.

    NOTE(review): ``get_metadata`` is called here as ``(feature, text_id)``;
    confirm the name resolves to the two-argument gutenberg query function.
    """
    if not os.path.isdir('books'):
        os.makedirs('books')
    print(list_supported_metadatas())
    for i in range(start, stop):
        try:
            # Fetch the title once and reuse it for the log line and the check.
            titles = get_metadata('title', i)
            print(i, titles)
            if not titles:
                continue

            # Accept the book if ANY listed language is English; a book with
            # no language metadata is skipped instead of inheriting the value
            # seen for a previous id.
            languages = get_metadata('language', i)
            if not any(lang.lower() == 'en' for lang in languages):
                continue

            subjects = list(get_metadata('subject', i))
            # One matching subject is enough; later non-matching subjects
            # must not cancel an earlier match.
            if not any(_is_wanted_genre(genre) for genre in subjects):
                continue

            categories = "".join(
                "_" + _category_slug(category)
                for genre in subjects
                for category in genre.split("--")
            )
            text = str(strip_headers(load_etext(i)).strip())
            title = str(i) + categories
            print('writing to file %s' % title)
            # The context manager guarantees the file is flushed and closed.
            with open("books/%s.txt" % title, "wb") as f:
                f.write(text.encode('utf8'))
        except Exception as e:
            # Deliberate best effort: report the problem and move on.
            print(e)
Beispiel #3
0
from re import compile as re_compile
from typing import Optional
from urllib.parse import unquote

from gutenberg.query import list_supported_metadatas

from gutenberg_http.errors import MisformedQuery
from gutenberg_http.errors import NoQuery
from gutenberg_http.errors import NoQueryValue
from gutenberg_http.errors import UnknownFields
from gutenberg_http.errors import UnknownQueryOperator

# Every metadata field name reported by list_supported_metadatas().
ALL_FIELDS = frozenset(list_supported_metadatas())

# Operator and combinator keywords known to this module.
ALL_OPERATORS = frozenset(('eq',))
ALL_COMBINATORS = frozenset(('and',))

# Alternation of all combinators inside a capturing group, so that splitting
# keeps the matched combinator as its own element of the result.
_combinator_pattern = re_compile('(%s)' % '|'.join(ALL_COMBINATORS))
split_combinators = _combinator_pattern.split


def parse_include(query: Optional[str]):
    """Parse a comma-separated, URL-quoted list of field names.

    Args:
        query: The raw field list, or ``None``/empty to request everything.

    Returns:
        ``ALL_FIELDS`` when ``query`` is empty, otherwise the set of field
        names found in ``query``.

    Raises:
        UnknownFields: If any requested name is not in ``ALL_FIELDS``.
    """
    if not query:
        return ALL_FIELDS

    requested_fields = set(unquote(query).split(','))
    unknown_fields = requested_fields - ALL_FIELDS
    if unknown_fields:
        raise UnknownFields(unknown_fields, ALL_FIELDS)

    # Hand the validated selection back; without this a valid query
    # would yield None while an empty query yields ALL_FIELDS.
    return requested_fields
Beispiel #4
0
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pandas as pd
import re
# ====== Connection ====== #
# Connection to ElasticSearch
# The client targets a node on localhost:9220 and is given timeout=600.
es = Elasticsearch(['http://localhost:9220'], timeout=600)
# Simple index creation with no particular mapping
# es.indices.create(index='books',body={})
from gutenberg.query import list_supported_metadatas

# Show which metadata features the gutenberg library reports as supported.
print(list_supported_metadatas()
      )  # prints (u'author', u'formaturi', u'language', ...)

# Column layout of the DataFrame created below.
cols = ['id', 'text', 'author', 'title', 'language', 'subject', 'textsummary']
# cols = ['id', 'text', 'author', 'title', 'language']

# cols = ['id', 'text']

# Empty frame with the columns listed in ``cols``.
dat = pd.DataFrame(columns=cols)
# Initial value only; the for-loop that follows rebinds ``i`` immediately.
i = 0
for i in range(500, 1500):
    try:
        text = strip_headers(load_etext(i + 1)).strip()
        author = str(get_metadata('author', i +
                                  1)).partition("[u'")[2].partition("'])")[0]
        title = str(get_metadata('title', i +
Beispiel #5
0
import argparse
from gutenberg.acquire import get_metadata_cache
from gutenberg.query import list_supported_metadatas, get_metadata
from entities import Book, Collection
from entities.book import TAG_BLOCK
from arweave_extensions import WalletWithTxAnchor
from graphql.query import find_collection_ids, find_books_from_collection
from typing import List, Tuple
import warnings, logging
from unidecode import unidecode

# Limit the "arweave.arweave_lib" logger to WARNING and above.
# Logger.setLevel() returns None, so the logger is bound first and configured
# in a second statement; chaining the calls would leave ``logger`` as None.
logger = logging.getLogger("arweave.arweave_lib")
logger.setLevel(logging.WARNING)

# Metadata feature names as reported by list_supported_metadatas().
METADATA_ARGS = list_supported_metadatas()
# Tag name constant; presumably the tag that carries a book's Gutenberg text
# number -- verify against where it is used.
TAG_TEXTNO = "textno"


def populate_cache():
    """
    Populate the local Gutenberg metadata cache.

    Obtains the cache from ``get_metadata_cache()`` and calls ``populate()``
    on it. This step is mandatory before publishing.
    """
    get_metadata_cache().populate()


def _is_public_domain(id: str) -> bool:
    """Report whether the rights metadata of a text mentions 'Public domain'.

    Args:
        id: Text number passed on to ``get_metadata('rights', id)``.

    Returns:
        ``True`` when the comma-joined rights values contain the substring
        ``'Public domain'``; otherwise ``False``, after emitting a warning
        that names the text and its rights.
    """
    rights = ",".join(get_metadata('rights', id))
    if 'Public domain' in rights:
        return True
    # %s rather than %d: the parameter is annotated as str, and %d raises
    # TypeError for strings. Integer ids still format identically.
    warnings.warn("Textno '%s' has rights '%s'" % (id, rights))
    return False
Beispiel #6
0
 def test_has_supported_metadatas(self):
     """list_supported_metadatas() must report at least one entry."""
     supported_count = len(list_supported_metadatas())
     self.assertGreater(supported_count, 0)