Ejemplo n.º 1
0
 def test_rtf_metadata(self):
     stream = BytesIO(br'{\rtf1\ansi\ansicpg1252}')
     m = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
     m.tags = 'tag1 見tag2'.split()
     m.comments = '<p>some ⊹comments</p>'
     m.publisher = 'publiSher'
     set_metadata(stream, m)
     stream.seek(0)
     o = get_metadata(stream)
     for attr in 'title authors publisher comments tags'.split():
         self.assertEqual(getattr(m, attr), getattr(o, attr))
Ejemplo n.º 2
0
 def test_input_comment_multi(self):
     stream_meta = get_metadata(self.get_stream('comment_multi'))
     canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams'])
     canon_meta.publisher = 'Publisher C'
     canon_meta.languages = ['French', 'Japanese']
     canon_meta.pubdate = parse_date('2015-01-01')
     canon_meta.timestamp = parse_date('2014-01-01')
     canon_meta.series = 'Comment Series'
     canon_meta.series_index = float(3)
     canon_meta.rating = float(0)
     canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;amp;'
     canon_meta.tags = ['tag d', 'tag e', 'tag f']
     canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
     self.compare_metadata(stream_meta, canon_meta)
Ejemplo n.º 3
0
 def test_input_meta_multi(self):
     stream_meta = get_metadata(self.get_stream('meta_multi'))
     canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson'])
     canon_meta.publisher = 'Publisher A'
     canon_meta.languages = ['English', 'Spanish']
     canon_meta.pubdate = parse_date('2019-01-01')
     canon_meta.timestamp = parse_date('2018-01-01')
     canon_meta.series = 'Meta Series'
     canon_meta.series_index = float(1)
     canon_meta.rating = float(8)
     canon_meta.comments = 'meta &quot;comments&quot; ♥ HTML &amp;amp;'
     canon_meta.tags = ['tag a', 'tag b', 'tag c']
     canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
     self.compare_metadata(stream_meta, canon_meta)
Ejemplo n.º 4
0
 def test_input_meta_single(self):
     stream_meta = get_metadata(self.get_stream('meta_single'))
     canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington'])
     canon_meta.publisher = 'Publisher A'
     canon_meta.languages = ['English']
     canon_meta.pubdate = parse_date('2019-01-01')
     canon_meta.timestamp = parse_date('2018-01-01')
     canon_meta.series = 'Meta Series'
     canon_meta.series_index = float(1)
     # canon_meta.rating = float(0)
     # canon_meta.comments = ''
     canon_meta.tags = ['tag a', 'tag b']
     canon_meta.set_identifiers({'isbn': '1234567890'})
     self.compare_metadata(stream_meta, canon_meta)
Ejemplo n.º 5
0
def generate_test_db(
        library_path,  # {{{
        num_of_records=20000,
        num_of_authors=6000,
        num_of_tags=10000,
        tag_length=7,
        author_length=7,
        title_length=10,
        max_authors=10,
        max_tags=10):
    import random, string, os, sys, time
    from ebook_converter.constants_old import preferred_encoding

    if not os.path.exists(library_path):
        os.makedirs(library_path)

    letters = string.letters.decode(preferred_encoding)

    def randstr(length):
        return ''.join(random.choice(letters) for i in range(length))

    all_tags = [randstr(tag_length) for j in range(num_of_tags)]
    print('Generated', num_of_tags, 'tags')
    all_authors = [randstr(author_length) for j in range(num_of_authors)]
    print('Generated', num_of_authors, 'authors')
    all_titles = [randstr(title_length) for j in range(num_of_records)]
    print('Generated', num_of_records, 'titles')

    testdb = db(library_path)

    print('Creating', num_of_records, 'records...')

    start = time.time()

    for i, title in enumerate(all_titles):
        print(i + 1, end=' ')
        sys.stdout.flush()
        authors = random.randint(1, max_authors)
        authors = [random.choice(all_authors) for i in range(authors)]
        tags = random.randint(0, max_tags)
        tags = [random.choice(all_tags) for i in range(tags)]
        from ebook_converter.ebooks.metadata.book.base import Metadata
        mi = Metadata(title, authors)
        mi.tags = tags
        testdb.import_book(mi, [])

    t = time.time() - start
    print('\nGenerated', num_of_records, 'records in:', t, 'seconds')
    print('Time per record:', t / num_of_records)
Ejemplo n.º 6
0
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isinstance(src, bytes):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or 'Unknown'

    # Author
    authors = authors_to_string(get_all('authors')) or 'Unknown'

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k,v) in meta_tag_ids.items():
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
Ejemplo n.º 7
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata('Unknown')
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, '
                             'probably generated by Nitro PDF')
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = (parse_date(first_sequence('//dc:date', root)
                              or first_simple('//xmp:CreateDate', root),
                              assume_utc=False))
    except Exception:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers['doi'] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi