Beispiel #1
0
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object
    '''
    name = getattr(stream, 'name', '').rpartition('.')[0]
    if name:
        name = os.path.basename(name)
    mi = MetaInformation(name or 'Unknown', ['Unknown'])
    stream.seek(0)

    mdata = ''
    for x in range(0, 4):
        line = stream.readline().decode('utf-8', 'replace')
        if not line:
            break
        else:
            mdata += line

    mdata = mdata[:1024]

    mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata)
    if mo is not None:
        mi.title = mo.group('title')
        mi.authors = mo.group('author').split(',')

    return mi
Beispiel #2
0
def _metadata_from_formats(formats, force_read_metadata=False, pattern=None):
    mi = MetaInformation(None, None)
    formats.sort(key=lambda x: METADATA_PRIORITIES[path_to_ext(x)])
    extensions = list(map(path_to_ext, formats))
    if 'opf' in extensions:
        opf = formats[extensions.index('opf')]
        mi2 = opf_metadata(opf)
        if mi2 is not None and mi2.title:
            return mi2

    for path, ext in zip(formats, extensions):
        with open(path, 'rb') as stream:
            try:
                newmi = get_metadata(stream, stream_type=ext,
                                     use_libprs_metadata=True,
                                     force_read_metadata=force_read_metadata,
                                     pattern=pattern)
                mi.smart_update(newmi)
            except Exception:
                continue
            if getattr(mi, 'application_id', None) is not None:
                return mi

    if not mi.title:
        mi.title = 'Unknown'
    if not mi.authors:
        mi.authors = ['Unknown']

    return mi
Beispiel #3
0
def get_metadata(stream):
    """
    Return basic meta-data about the LRF file in C{stream} as a
    L{MetaInformation} object.
    @param stream: A file like object or an instance of L{LRFMetaFile}
    """
    lrf = stream if isinstance(stream, LRFMetaFile) else LRFMetaFile(stream)
    authors = string_to_authors(lrf.author)
    mi = MetaInformation(lrf.title.strip(), authors)
    mi.author = lrf.author.strip()
    mi.comments = lrf.free_text.strip()
    mi.category = lrf.category.strip() + ', ' + lrf.classification.strip()
    tags = [x.strip() for x in mi.category.split(',') if x.strip()]
    if tags:
        mi.tags = tags
    if mi.category.strip() == ',':
        mi.category = None
    mi.publisher = lrf.publisher.strip()
    mi.cover_data = lrf.get_cover()
    try:
        mi.title_sort = lrf.title_reading.strip()
        if not mi.title_sort:
            mi.title_sort = None
    except Exception:
        pass

    try:
        mi.author_sort = lrf.author_reading.strip()
        if not mi.author_sort:
            mi.author_sort = None
    except Exception:
        pass

    if not mi.title or 'unknown' in mi.title.lower():
        mi.title = None
    if not mi.authors:
        mi.authors = None
    if not mi.author or 'unknown' in mi.author.lower():
        mi.author = None
    if not mi.category or 'unknown' in mi.category.lower():
        mi.category = None
    if not mi.publisher or 'unknown' in mi.publisher.lower() or \
            'some publisher' in mi.publisher.lower():
        mi.publisher = None

    return mi
Beispiel #4
0
def get_metadata(stream):
    """
    Return metadata as a L{MetaInfo} object
    """
    stream.seek(0)
    if stream.read(5) != br'{\rtf':
        return MetaInformation('Unknown')
    block = get_document_info(stream)[0]
    if not block:
        return MetaInformation('Unknown')

    stream.seek(0)
    cpg = detect_codepage(stream)
    stream.seek(0)

    title_match = title_pat.search(block)
    if title_match is not None:
        title = decode(title_match.group(1).strip(), cpg)
    else:
        title = 'Unknown'
    author_match = author_pat.search(block)
    if author_match is not None:
        author = decode(author_match.group(1).strip(), cpg)
    else:
        author = None
    mi = MetaInformation(title)
    if author:
        mi.authors = [x.strip() for x in author.split(',')]

    comment_match = comment_pat.search(block)
    if comment_match is not None:
        comment = decode(comment_match.group(1).strip(), cpg)
        mi.comments = comment
    tags_match = tags_pat.search(block)
    if tags_match is not None:
        tags = decode(tags_match.group(1).strip(), cpg)
        mi.tags = list(filter(None, (x.strip() for x in tags.split(','))))
    publisher_match = publisher_pat.search(block)
    if publisher_match is not None:
        publisher = decode(publisher_match.group(1).strip(), cpg)
        mi.publisher = publisher

    return mi
Beispiel #5
0
def metadata_from_filename(name, pat=None, fallback_pat=None):
    if isinstance(name, bytes):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from ebook_converter.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except:
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi
Beispiel #6
0
def get_metadata(stream, extract_cover=True):
    whitespace = re.compile(r'\s+')

    def normalize(s):
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(
                    tostring(ans[0],
                             method='text',
                             encoding='unicode',
                             with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
        data = {}
        for tag in root.xpath('//ns0:user-defined',
                              namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val
        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi