Example #1
def parse_meta_tags(src):
    rmap = {}
    for field, names in iteritems(META_NAMES):
        for name in names:
            rmap[name.lower()] = field
    all_names = '|'.join(rmap)
    ans = {}
    npat = r'''name\s*=\s*['"]{0,1}(?P<name>%s)['"]{0,1}''' % all_names
    cpat = r'content\s*=\s*%s' % attr_pat
    for pat in (
        r'<meta\s+%s\s+%s' % (npat, cpat),
        r'<meta\s+%s\s+%s' % (cpat, npat),
    ):
        for match in re.finditer(pat, src, flags=re.IGNORECASE):
            x = match.group('name').lower()
            try:
                field = rmap[x]
            except KeyError:
                try:
                    field = rmap[x.replace(':', '.')]
                except KeyError:
                    continue

            if field not in ans:
                ans[field] = replace_entities(match.group('content'))
            if len(ans) == len(META_NAMES):
                return ans
    return ans
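A minimal sketch of the module-level names this function relies on, so the example can be run standalone. The real definitions live in calibre; everything below (the META_NAMES values, attr_pat, and the replace_entities/iteritems stand-ins) is an assumption for illustration only.

import re
from html import unescape as replace_entities  # stand-in for calibre's helper


def iteritems(d):  # polyglot shim used by the example
    return iter(d.items())


META_NAMES = {
    'title': ('dc.title', 'dcterms.title', 'title'),
    'authors': ('author', 'dc.creator', 'dcterms.creator'),
}
# attr_pat must expose a named group called 'content'
attr_pat = r'''['"](?P<content>[^'"]+)['"]'''

src = '<meta name="author" content="Jane Doe"><meta name="DC.title" content="A Book">'
# parse_meta_tags(src) -> {'authors': 'Jane Doe', 'title': 'A Book'}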
Example #2
def all_links(html):
    ''' Return set of all links in the file '''
    ans = set()
    for match in re.finditer(
            r'''<\s*[Aa]\s+.*?[hH][Rr][Ee][Ff]\s*=\s*(['"])(.+?)\1''', html, re.MULTILINE|re.DOTALL):
        ans.add(replace_entities(match.group(2)))
    return ans
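A quick usage sketch (replace_entities swapped for html.unescape, which is an assumption about its behaviour):

from html import unescape as replace_entities

html_doc = '<p><a href="a.html">one</a> <A HREF=\'b.html#sec\'>two</A></p>'
# all_links(html_doc) -> {'a.html', 'b.html#sec'}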
Example #3
def parse_meta_tags(src):
    rmap = {}
    for field, names in META_NAMES.items():
        for name in names:
            rmap[name.lower()] = field
    all_names = "|".join(rmap)
    ans = {}
    npat = r"""name\s*=\s*['"]{0,1}(?P<name>%s)['"]{0,1}""" % all_names
    cpat = "content\s*=\s*%s" % attr_pat
    for pat in ("<meta\s+%s\s+%s" % (npat, cpat), "<meta\s+%s\s+%s" % (cpat, npat)):
        for match in re.finditer(pat, src, flags=re.IGNORECASE):
            x = match.group("name").lower()
            try:
                field = rmap[x]
            except KeyError:
                try:
                    field = rmap[x.replace(":", ".")]
                except KeyError:
                    continue

            if field not in ans:
                ans[field] = replace_entities(match.group("content"))
            if len(ans) == len(META_NAMES):
                return ans
    return ans
Example #4
    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None

        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if 100 <= idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503: # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = content.decode(codec)
                except Exception:
                    pass
            # else:
            #     print('unknown record', idx, repr(content))
        if title:
            self.mi.title = replace_entities(title)
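The while-loop above walks EXTH records, each laid out as a big-endian record type and total record length (the length includes the 8-byte header itself), followed by the payload. A standalone sketch of one record:

import struct

payload = b'EBOK'  # e.g. the body of a 501 (cdetype) record
record = struct.pack('>LL', 501, 8 + len(payload)) + payload
idx, size = struct.unpack('>LL', record[:8])
content = record[8:size]  # -> b'EBOK', mirroring raw[pos + 8:pos + size] above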
Example #5
def parse_comment_tags(src):
    all_names = '|'.join(itervalues(COMMENT_NAMES))
    rmap = {v:k for k, v in iteritems(COMMENT_NAMES)}
    ans = {}
    for match in re.finditer(r'''<!--\s*(?P<name>%s)\s*=\s*%s''' % (all_names, attr_pat), src):
        field = rmap[match.group('name')]
        if field not in ans:
            ans[field] = replace_entities(match.group('content'))
        if len(ans) == len(COMMENT_NAMES):
            break
    return ans
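A runnable sketch with stand-ins for the module-level names (the COMMENT_NAMES values and attr_pat below are assumptions, not calibre's real definitions):

import re
from html import unescape as replace_entities  # stand-in


def itervalues(d):
    return iter(d.values())


def iteritems(d):
    return iter(d.items())


COMMENT_NAMES = {'title': 'TITLE', 'authors': 'AUTHOR'}  # field -> comment tag
attr_pat = r'''['"](?P<content>[^'"]+)['"]'''

src = '<!-- TITLE="A Book" -->\n<!-- AUTHOR="Jane Doe" -->'
# parse_comment_tags(src) -> {'title': 'A Book', 'authors': 'Jane Doe'}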
Example #6
def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
    text = replace_entities(text)
    root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
    ewords = defaultdict(list)
    ewords[None] = 0
    read_words_from_html(root, ewords, file_name, locale)
    words[None] += ewords.pop(None)
    for k, locs in iteritems(ewords):
        for loc in locs:
            loc.location_node, loc.node_item = node, (False, attr)
        words[k].extend(locs)
Example #7
def parse_comment_tags(src):
    all_names = "|".join(COMMENT_NAMES.itervalues())
    rmap = {v: k for k, v in COMMENT_NAMES.iteritems()}
    ans = {}
    for match in re.finditer(r"""<!--\s*(?P<name>%s)\s*=\s*%s""" % (all_names, attr_pat), src):
        field = rmap[match.group("name")]
        if field not in ans:
            ans[field] = replace_entities(match.group("content"))
        if len(ans) == len(COMMENT_NAMES):
            break
    return ans
Example #8
def handle_comment(data, comment_tags):
    if not hasattr(handle_comment, 'pat'):
        handle_comment.pat = re.compile(r'''(?P<name>\S+)\s*=\s*%s''' %
                                        attr_pat)
    for match in handle_comment.pat.finditer(data):
        x = match.group('name')
        field = None
        try:
            field = rmap_comment[x]
        except KeyError:
            pass
        if field:
            comment_tags[field].append(replace_entities(
                match.group('content')))
Example #9
    def find_links(self, src):
        for match in self.LINK_PAT.finditer(src):
            url = None
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            url = replace_entities(url)
            try:
                link = self.resolve(url)
            except ValueError:
                # Unparseable URL, ignore
                continue
            if link not in self.links:
                self.links.append(link)
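The LINK_PAT attribute is defined elsewhere in the containing class; a plausible shape for it, given the url1/url2/url3 groups the loop checks (an assumption, not the original pattern):

import re

LINK_PAT = re.compile(
    r'''href\s*=\s*(?:"(?P<url1>[^"]+)"|'(?P<url2>[^']+)'|(?P<url3>[^\s>]+))''',
    re.IGNORECASE)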
Example #10
def process_node(node, html_parent):
    ntype = node.get('type')
    if ntype == 'tag':
        c = html_parent.makeelement(node['name'])
        c.attrib.update(node.get('attribs', {}))
        html_parent.append(c)
        for nc in node.get('children', ()):
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data')
        if text:
            text = replace_entities(text)
            if len(html_parent):
                t = html_parent[-1]
                t.tail = (t.tail or '') + text
            else:
                html_parent.text = (html_parent.text or '') + text
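A usage sketch: the node schema (type/name/attribs/children/data) is inferred from the lookups above, and replace_entities is swapped for html.unescape as an assumption.

from lxml import etree
from html import unescape as replace_entities

body = etree.fromstring('<body/>')
node = {
    'type': 'tag', 'name': 'p', 'attribs': {'class': 'intro'},
    'children': [{'type': 'text', 'data': 'Hello &amp; welcome'}],
}
process_node(node, body)
# etree.tostring(body) -> b'<body><p class="intro">Hello &amp; welcome</p></body>'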
Example #11
def build_toc(index_entries):
    ans = TOC(base_path=os.getcwd())
    levels = {x['hlvl'] for x in index_entries}
    num_map = {-1: ans}
    level_map = {l:[x for x in index_entries if x['hlvl'] == l] for l in
            levels}
    for lvl in sorted(levels):
        for item in level_map[lvl]:
            parent = num_map[item['parent']]
            child = parent.add_item(item['href'], item['idtag'],
                    replace_entities(item['text'], encoding=None))
            num_map[item['num']] = child

    # Set play orders in depth first order
    for i, item in enumerate(ans.flat()):
        item.play_order = i

    return ans
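The index_entries schema this expects, inferred from the lookups above (num/parent link the tree, -1 is the root, hlvl orders the passes):

index_entries = [
    {'num': 0, 'parent': -1, 'hlvl': 0, 'href': 'ch1.html', 'idtag': 'c1', 'text': 'Chapter 1'},
    {'num': 1, 'parent': 0, 'hlvl': 1, 'href': 'ch1.html#s1', 'idtag': 's1', 'text': 'Section 1.1'},
]
# build_toc(index_entries) nests 'Section 1.1' under 'Chapter 1' and then
# assigns play_order depth-first.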
Example #12
def handle_entities(text, func):
    return func(replace_entities(text))
Example #13
def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale):
    text = replace_entities(text)
    root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
    count_chars_in_html(root, counter, file_name, locale)
Example #14
def handle_entities(text, func):
    return prepare_string_for_xml(func(replace_entities(text)))
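This variant round-trips the text: decode entities, apply func, then re-escape for XML. Assuming calibre's helpers behave like html.unescape plus standard XML escaping, e.g.:

# handle_entities('a &amp; b', str.upper) -> 'A &amp; B'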
Example #15
def read_simple_property(elem):
    # A simple property
    if elem.text:
        return replace_entities(elem.text)
    return replace_entities(elem.get(expand('rdf:resource'), ''))
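The expand() helper turns a prefixed name into lxml's Clark notation; a minimal sketch under an assumed namespace map:

RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
NSMAP = {'rdf': RDF_NS}


def expand(name):
    prefix, _, local = name.partition(':')
    return '{%s}%s' % (NSMAP[prefix], local)

# expand('rdf:resource') -> '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'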
Example #16
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))

    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except Exception:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except Exception:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except Exception:
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
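The trailing series-index convention handled above, as a standalone check:

import re

series = 'Baroque Cycle [2.0]'
match = re.search(r'\[([.0-9]+)\]$', series)
series_index = float(match.group(1))  # 2.0
series = series.replace(match.group(), '').strip()  # 'Baroque Cycle'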
Example #17
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
                '-nodrm', a(pdfsrc), a(index)]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                    stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        with lopen(logf.name, 'rb') as logfile:
            out = logfile.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                raw = replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
                    '-nodrm', '-q', '-stdout', a(pdfsrc)]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        try:
            os.remove(pdfsrc)
        except Exception:
            pass
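Usage sketch (paths are hypothetical):

# pdftohtml('/tmp/book_out', '/tmp/book.pdf', no_images=False)
# -> writes /tmp/book_out/index.html plus any extracted images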
Example #18
    def get_book_details(self, log, metadata, timeout, cachedPage):  # {{{
        from lxml import etree, html
        from calibre.ebooks.chardet import xml_to_unicode

        if not cachedPage:
            url = self.get_book_url(metadata.get_identifiers())[2]
            # log.debug(u'book_details_url', url)

            raw = self.browser.open_novisit(url, timeout=timeout).read()
            fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
        else:
            fulldoc = cachedPage
            log.debug(u'book_details -> using cached page')

        fullString = etree.tostring(fulldoc)
        doc = fulldoc.xpath(u'//div[@class="bDetailPage"][1]')[0]

        # series Серия/Серии
        series_elem = doc.xpath(u'//div[contains(text(), "Сери")]')
        if series_elem:
            series_text_elem = series_elem[0].getnext()
            metadata.series = series_text_elem.xpath(u'.//a/text()')[0]
            log.debug(u'**Seria: ', metadata.series)

        isbn = None
        isbn_elem = doc.xpath(u'//div[contains(text(), "ISBN")]')
        if isbn_elem:
            isbn = isbn_elem[0].getnext().xpath(u'normalize-space(./text())')
            metadata.identifiers['isbn'] = isbn

        # get authors/editors if no authors are available
        authors_joined = ','.join(metadata.authors)

        if authors_joined == '' or authors_joined == "Unknown":
            authors_from_detail = []
            editor_elem = doc.xpath(u'//div[contains(text(), "Редактор")]')
            if editor_elem:
                editor = editor_elem[0].getnext().xpath(u'.//a/text()')[0]
                authors_from_detail.append(editor + u' (ред.)')
            authors_elem = doc.xpath(u'//div[contains(text(), "Автор")]')
            if authors_elem:
                authors = authors_elem[0].getnext().xpath(
                    u'.//a/text()')  # list
                authors_from_detail.extend(authors)
            if len(authors_from_detail) > 0:
                metadata.authors = authors_from_detail

        cover = doc.xpath('.//img[contains(@class, "fullImage")]/@src')[0]
        metadata.ozon_cover_url = _translateToBigCoverUrl(cover)

        publishers = None
        publishers_elem = doc.xpath(u'//div[contains(text(), "Издатель")]')
        if publishers_elem:
            publishers_elem = publishers_elem[0].getnext()
            publishers = publishers_elem.xpath(u'.//a/text()')[0]

        if publishers:
            metadata.publisher = publishers

        displ_lang = None
        langs = None
        langs_elem = doc.xpath(u'//div[contains(text(), "зык")]')
        if langs_elem:
            langs_elem = langs_elem[0].getnext()
            if langs_elem is not None:
                langs = langs_elem.xpath(u'text()')[0].strip()
        if langs:
            lng_splt = langs.split(u',')
            if lng_splt:
                displ_lang = lng_splt[0].strip()
                # log.debug(u'displ_lang1: ', displ_lang)
        metadata.language = _translageLanguageToCode(displ_lang)
        # log.debug(u'Language: ', metadata.language)

        # can be set before from xml search response
        if not metadata.pubdate:
            pubdate_elem = doc.xpath(u'//div[contains(text(), "Год выпуска")]')
            if pubdate_elem:
                pubYear = pubdate_elem[0].getnext().xpath(u'text()')[0].strip()
                if pubYear:
                    matcher = re.search(r'\d{4}', pubYear)
                    if matcher:
                        metadata.pubdate = toPubdate(log, matcher.group(0))
        # log.debug(u'Pubdate: ', metadata.pubdate)

        # comments, from Javascript data
        beginning = fullString.find(b'FirstBlock')
        end = fullString.find(b'}', beginning)
        comments = fullString[beginning + 75:end - 1].decode('unicode-escape')
        metadata.comments = replace_entities(comments, 'utf-8')
Example #19
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [
            exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm',
            a(pdfsrc),
            a(index)
        ]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        with lopen(logf.name, 'rb') as logfile:
            out = logfile.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' %
                                  (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head',
                    '<!-- created by calibre\'s pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                raw = re.sub(r'<a\s+name=(\d+)',
                             r'<a id="\1"',
                             raw,
                             flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"',
                             r'<a href="#p\1"',
                             raw,
                             flags=re.I)
                raw = replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            cmd = [
                exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8',
                '-noframes', '-p', '-nomerge', '-nodrm', '-q', '-stdout',
                a(pdfsrc)
            ]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        try:
            os.remove(pdfsrc)
        except Exception:
            pass
Example #20
    def get_book_details(self, log, metadata, timeout, cachedPage):  # {{{
        from lxml import etree, html
        from calibre.ebooks.chardet import xml_to_unicode

        if not cachedPage:
            url = self.get_book_url(metadata.get_identifiers())[2]
            # log.debug(u'book_details_url', url)

            raw = self.browser.open_novisit(url, timeout=timeout).read()
            fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
        else:
            fulldoc = cachedPage
            log.debug(u'book_details -> using cached page')

        fullString = etree.tostring(fulldoc)
        doc = fulldoc.xpath(u'//div[@class="bDetailPage"][1]')[0]

        # series Серия/Серии
        series_elem = doc.xpath(u'//div[contains(text(), "Сери")]')
        if series_elem:
            series_text_elem = series_elem[0].getnext()
            metadata.series = series_text_elem.xpath(u'.//a/text()')[0]
            log.debug(u'**Seria: ', metadata.series)

        isbn = None
        isbn_elem = doc.xpath(u'//div[contains(text(), "ISBN")]')
        if isbn_elem:
            isbn = isbn_elem[0].getnext().xpath(u'normalize-space(./text())')
            metadata.identifiers['isbn'] = isbn

        # get authors/editors if no authors are available
        authors_joined = ','.join(metadata.authors)

        if authors_joined == '' or authors_joined == "Unknown":
            authors_from_detail = []
            editor_elem = doc.xpath(u'//div[contains(text(), "Редактор")]')
            if editor_elem:
                editor = editor_elem[0].getnext().xpath(u'.//a/text()')[0]
                authors_from_detail.append(editor + u' (ред.)')
            authors_elem = doc.xpath(u'//div[contains(text(), "Автор")]')
            if authors_elem:
                authors = authors_elem[0].getnext().xpath(u'.//a/text()')  # list
                authors_from_detail.extend(authors)
            if len(authors_from_detail) > 0:
                metadata.authors = authors_from_detail

        cover = doc.xpath('.//img[contains(@class, "fullImage")]/@src')[0]
        metadata.ozon_cover_url = _translateToBigCoverUrl(cover)

        publishers = None
        publishers_elem = doc.xpath(u'//div[contains(text(), "Издатель")]')
        if publishers_elem:
            publishers_elem = publishers_elem[0].getnext()
            publishers = publishers_elem.xpath(u'.//a/text()')[0]

        if publishers:
            metadata.publisher = publishers

        displ_lang = None
        langs = None
        langs_elem = doc.xpath(u'//div[contains(text(), "зык")]')
        if langs_elem:
            langs_elem = langs_elem[0].getnext()
            langs = langs_elem.xpath(u'text()')[0].strip() if langs_elem is not None else None
        if langs:
            lng_splt = langs.split(u',')
            if lng_splt:
                displ_lang = lng_splt[0].strip()
                # log.debug(u'displ_lang1: ', displ_lang)
        metadata.language = _translageLanguageToCode(displ_lang)
        # log.debug(u'Language: ', metadata.language)

        # can be set before from xml search response
        if not metadata.pubdate:
            pubdate_elem = doc.xpath(u'//div[contains(text(), "Год выпуска")]')
            if pubdate_elem:
                pubYear = pubdate_elem[0].getnext().xpath(u'text()')[0].strip()
                if pubYear:
                    matcher = re.search(r'\d{4}', pubYear)
                    if matcher:
                        metadata.pubdate = toPubdate(log, matcher.group(0))
        # log.debug(u'Pubdate: ', metadata.pubdate)

        # comments, from Javascript data
        beginning = fullString.find(b'FirstBlock')
        end = fullString.find(b'}', beginning)
        comments = fullString[beginning + 75:end - 1].decode('unicode-escape')
        metadata.comments = replace_entities(comments, 'utf-8')