def get_metadata(idx):
    """Gather all supported Gutenberg metadata fields for text number *idx*.

    Returns a dict mapping each supported metadata key to a list of its
    values, plus a 'length' entry holding ``len(get_content(idx))``.
    The assembled dict is printed before being returned.
    """
    # NOTE(review): relies on module-level `q` (gutenberg.query?) and
    # `get_content` defined elsewhere in the file — confirm against imports.
    metadata = {
        field: list(q.get_metadata(field, idx))
        for field in q.list_supported_metadatas()
    }
    metadata['length'] = len(get_content(idx))
    print(metadata)
    return metadata
def main():
    """Scrape Project Gutenberg texts 19363..99999 and save genre-matching
    English books under ``books/<id>_<categories>.txt``.

    A book is kept when the *last* subject examined matches one of the
    wanted genres (science fiction, horror, adventure, humor, western,
    mystery fiction, gothic fiction).  Errors on individual texts are
    printed and skipped so the crawl keeps going.
    """
    # exist_ok avoids the check-then-create race of isdir()+makedirs().
    os.makedirs('books', exist_ok=True)
    print(list_supported_metadatas())

    wanted_genres = ('science fiction', 'horror', 'adventure', 'humor',
                     'western', 'mystery fiction', 'gothic fiction')

    for i in range(19363, 100000):
        try:
            # Fetch the title metadata once instead of twice per text.
            titles = get_metadata('title', i)
            print(i, titles)
            if len(titles) > 0:
                want = False
                # NOTE(review): keeps only the last language listed —
                # preserved from the original logic.
                for lang in get_metadata('language', i):
                    language = lang
                if language.lower() == 'en':
                    categories = ""
                    for genre in get_metadata('subject', i):
                        # Build a slug fragment from each "--"-separated part.
                        for category in genre.split("--"):
                            categories += "_" + category.strip().lower().replace("-", " ").replace(".", " ")\
                                .replace(",", " ").strip().replace("  ", " ").replace(" ", "-")\
                                .replace("(", "").replace(")", "").replace("'", "")
                        # Equivalent to the original if/elif chain that set
                        # want=True on a match and want=False otherwise, so
                        # only the final genre's verdict survives the loop.
                        want = any(re.search(keyword, genre.lower())
                                   for keyword in wanted_genres)
                    if want:
                        text = str(strip_headers(load_etext(i)).strip())
                        title = str(i) + categories
                        print('writing to file %s' % title)
                        # with-block guarantees the file is closed (the
                        # original leaked the handle on every write).
                        with open("books/%s.txt" % title, "wb") as f:
                            f.write(text.encode('utf8'))
        except Exception as e:
            # Best-effort crawl: report and move on to the next text.
            print(e)
from re import compile as re_compile
from typing import Optional
from urllib.parse import unquote

from gutenberg.query import list_supported_metadatas

from gutenberg_http.errors import MisformedQuery
from gutenberg_http.errors import NoQuery
from gutenberg_http.errors import NoQueryValue
from gutenberg_http.errors import UnknownFields
from gutenberg_http.errors import UnknownQueryOperator

# All metadata fields the backend can serve.
ALL_FIELDS = frozenset(list_supported_metadatas())
ALL_OPERATORS = frozenset({'eq'})
ALL_COMBINATORS = frozenset({'and'})

# Splits a query string on any supported combinator, keeping the separators.
split_combinators = re_compile('(%s)' % '|'.join(ALL_COMBINATORS)).split


def parse_include(query: Optional[str]):
    """Parse the ``include`` query parameter into a set of metadata fields.

    An empty/missing query means "all fields".  Otherwise the URL-decoded,
    comma-separated field list is validated against ALL_FIELDS.

    :raises UnknownFields: if any requested field is not supported.
    :returns: the set of requested fields (or ALL_FIELDS when unspecified).
    """
    if not query:
        return ALL_FIELDS
    query = unquote(query)
    requested_fields = set(query.split(','))
    unknown_fields = requested_fields - ALL_FIELDS
    if unknown_fields:
        raise UnknownFields(unknown_fields, ALL_FIELDS)
    # BUG FIX: the original fell off the end here and returned None for
    # every valid non-empty query; return the validated field set instead.
    return requested_fields
from gutenberg.acquire import load_etext from gutenberg.cleanup import strip_headers from gutenberg.query import get_etexts from gutenberg.query import get_metadata from elasticsearch import Elasticsearch from elasticsearch.helpers import bulk import pandas as pd import re # ====== Connection ====== # # Connection to ElasticSearch es = Elasticsearch(['http://localhost:9220'], timeout=600) # Simple index creation with no particular mapping # es.indices.create(index='books',body={}) from gutenberg.query import list_supported_metadatas print(list_supported_metadatas() ) # prints (u'author', u'formaturi', u'language', ...) cols = ['id', 'text', 'author', 'title', 'language', 'subject', 'textsummary'] # cols = ['id', 'text', 'author', 'title', 'language'] # cols = ['id', 'text'] dat = pd.DataFrame(columns=cols) i = 0 for i in range(500, 1500): try: text = strip_headers(load_etext(i + 1)).strip() author = str(get_metadata('author', i + 1)).partition("[u'")[2].partition("'])")[0] title = str(get_metadata('title', i +
import argparse
import logging
import warnings
from typing import List, Tuple

from unidecode import unidecode

from gutenberg.acquire import get_metadata_cache
from gutenberg.query import list_supported_metadatas, get_metadata

from arweave_extensions import WalletWithTxAnchor
from entities import Book, Collection
from entities.book import TAG_BLOCK
from graphql.query import find_collection_ids, find_books_from_collection

# BUG FIX: setLevel() returns None, so the original
# ``logger = logging.getLogger(...).setLevel(...)`` bound None to `logger`.
logger = logging.getLogger("arweave.arweave_lib")
logger.setLevel(logging.WARNING)

# Metadata fields exposed as CLI arguments.
METADATA_ARGS = list_supported_metadatas()
TAG_TEXTNO = "textno"


def populate_cache():
    """
    Loads Gutenberg metadata information into your machine.
    Mandatory step before publishing.
    """
    cache = get_metadata_cache()
    cache.populate()


def _is_public_domain(id: str) -> bool:
    """Return True when the Gutenberg text *id* is marked public domain.

    Emits a warning (and returns False) for texts with any other rights
    string, so callers can skip non-publishable texts.
    """
    rights = ",".join(get_metadata('rights', id))
    if 'Public domain' not in rights:
        # BUG FIX: `id` is a str, so the original '%d' placeholder raised
        # TypeError whenever the warning fired; use '%s' instead.
        warnings.warn("Textno '%s' has rights '%s'" % (id, rights))
        return False
    return True
def test_has_supported_metadatas(self):
    """The metadata backend must advertise at least one supported field."""
    supported = list_supported_metadatas()
    self.assertGreater(len(supported), 0)