def _get_name(self, root, lang_uid, uid):
    """Return the display name of a text, taken from its main heading.

    The name is the text content of the ``h1`` found inside the element
    matching ``.hgroup``, with any leading run of non-alphabetic
    characters removed.

    ``lang_uid`` and ``uid`` are only used to identify the text in the
    warning message.

    Returns an empty string, after logging a warning, when the heading
    cannot be located or read.
    """
    try:
        hgroup = root.select_one('.hgroup')
        h1 = hgroup.select_one('h1')
        # \P{alpha} is "anything that is not alphabetic"; anchoring at ^
        # strips leading numbering/punctuation from the heading.
        return regex.sub(r'^\P{alpha}*', '', h1.text_content())
    except Exception:
        # Deliberately best-effort: a missing heading must not abort the
        # caller, so any failure is reported and mapped to ''.
        logger.warning('Could not determine name for {}/{}'.format(lang_uid, uid))
        return ''
def _get_author(self, root, lang_uid, uid):
    """Return the author declared in a parsed document.

    Three locations are tried in order, and the first one present wins:

    1. the ``author`` attribute of an element matching ``meta[author]``
    2. the ``data-author`` attribute of ``meta[data-author]``
    3. the ``text`` of the element matching ``#metaarea > .author``
       (NOTE(review): this value is returned as-is and may be ``None``
       if the element carries no text of its own -- confirm).

    ``lang_uid`` and ``uid`` are only used to identify the text in the
    warning message.

    Returns an empty string, after logging a warning, when no location
    matches or when a lookup raises.
    """
    lookups = (
        ('meta[author]', lambda elem: elem.attrib['author']),
        ('meta[data-author]', lambda elem: elem.attrib['data-author']),
        ('#metaarea > .author', lambda elem: elem.text),
    )
    try:
        for selector, extract in lookups:
            found = root.select_one(selector)
            # Compare against None instead of truth-testing: in lxml an
            # element without children is falsy, and <meta> elements
            # never have children.
            if found is not None:
                return extract(found)
    except Exception:
        # Deliberately best-effort: a document that cannot be queried is
        # reported the same way as one that names no author.
        pass
    logger.warning('Could not determine author for {}/{}'.format(lang_uid, uid))
    return ''
def build(self, lang_dir, force=False):
    """Collect text info and codepoint data for every HTML file in a language directory.

    All ``*.html`` files below ``lang_dir`` are gathered, ordered with
    ``sc.util.numericsortkey`` on their stem, and rearranged so that files
    whose stem is ``metadata`` come first. Each file accepted by
    ``self._should_process_file(htmlfile, force)`` is parsed and then:

    * its characters are added to the normal / bold / italic codepoint sets;
    * a ``metadata`` file is registered with ``self.add_metadata`` and
      nothing else is done for it;
    * any other file gets a ``TextInfo`` registered under its uid, plus
      one entry per embedded uid, plus one entry per number when the uid
      ends in a ``<start>-<end>`` range.

    The codepoint sets are stored in ``self._codepoints[lang_uid]`` once
    every file has been handled.

    Parameters:
        lang_dir: directory of one language; its ``stem`` is used as the
            language uid (presumably a ``pathlib.Path`` -- it must
            provide ``stem`` and ``glob``).
        force: passed through to ``self._should_process_file``.

    Raises:
        ValueError: a ``metadata`` file does not define an author.
        Exception: anything raised while handling a file is re-raised
            after the offending file name has been printed.
    """
    # The pagenumbinator should be scoped because it uses
    # a large chunk of memory which should be gc'd.
    # But it shouldn't be created at all if we don't need it.
    # So we use a getter, and delete it when we are done.
    self._ppn = None
    try:
        codepoints = set()
        bold_codepoints = set()
        italic_codepoints = set()

        lang_uid = lang_dir.stem
        all_files = sorted(lang_dir.glob('**/*.html'),
                           key=lambda f: sc.util.numericsortkey(f.stem))
        # Metadata files go first so that their author is already
        # registered when the texts they cover are processed.
        files = ([f for f in all_files if f.stem == 'metadata']
                 + [f for f in all_files if f.stem != 'metadata'])

        for index, htmlfile in enumerate(files):
            try:
                if not self._should_process_file(htmlfile, force):
                    continue
                logger.info('Adding file: {!s}'.format(htmlfile))
                uid = htmlfile.stem
                root = html.parse(str(htmlfile)).getroot()

                # Set codepoint data. A bold or italic element contributes
                # all of its text and is not descended into; for any other
                # element the children are examined instead.
                pending = [root]
                while pending:
                    element = pending.pop()
                    if self.is_bold(lang_uid, element):
                        bold_codepoints.update(element.text_content())
                    elif self.is_italic(lang_uid, element):
                        italic_codepoints.update(element.text_content())
                    else:
                        pending.extend(element)
                codepoints.update(root.text_content())

                # Set the previous and next uids, using explicit data
                # if available, otherwise making a safe guess.
                # The safe guess relies on comparing uids, and will not
                # capture relationships such as the order of patimokha
                # rules.
                prev_uid = root.get('data-prev')
                next_uid = root.get('data-next')
                if not (prev_uid or next_uid):
                    if index > 0:
                        prev_uid = files[index - 1].stem
                        if not self.uids_are_related(uid, prev_uid):
                            prev_uid = None
                    if index + 1 < len(files):
                        next_uid = files[index + 1].stem
                        if not self.uids_are_related(uid, next_uid):
                            next_uid = None

                path = htmlfile.relative_to(sc.text_dir)
                # _get_author reports "not found" as '' (and can also
                # yield None). Normalise both to None, otherwise the
                # 'is None' checks and fallbacks below never trigger.
                author = self._get_author(root, lang_uid, uid) or None

                if uid == 'metadata':
                    if author is None:
                        raise ValueError('Metadata file {} does not define author'.format(path))
                    self.add_metadata(path, author, root)
                    continue

                # Fallback 1: the author recorded by a metadata file.
                if author is None:
                    metadata = self.get_metadata(path)
                    if metadata:
                        author = metadata['author']

                # Fallback 2: the start of the #metaarea text, up to the
                # first full stop within 80 characters, or failing that
                # up to a whitespace boundary within 80 characters.
                if author is None:
                    metaarea = root.select_one('#metaarea')
                    # Not truth-tested: an lxml element without child
                    # elements is falsy even when it contains text.
                    if metaarea is not None:
                        metadata_text = metaarea.text_content()
                        m = regex.match(r'.{,80}\.', metadata_text)
                        if not m:
                            m = regex.match(r'.{,80}(?=\s)', metadata_text)
                        if m:
                            author = m[0]

                if author is None:
                    logger.warning('Could not determine author for {}/{}'.format(lang_uid, uid))
                    author = ''

                name = self._get_name(root, lang_uid, uid)
                volpage = self._get_volpage(root, lang_uid, uid)
                embedded = self._get_embedded_uids(root, lang_uid, uid)

                fstat = htmlfile.stat()
                cdate = self.datestr(fstat.st_ctime)
                mdate = self.datestr(fstat.st_mtime)

                textinfo = TextInfo(uid=uid, lang=lang_uid, path=path,
                                    name=name, author=author,
                                    volpage=volpage, prev_uid=prev_uid,
                                    next_uid=next_uid, cdate=cdate,
                                    mdate=mdate, file_uid=uid)
                self.add_text_info(lang_uid, uid, textinfo)

                # Embedded entries live in this file, so they take over
                # its path, author and file uid.
                for child in embedded:
                    child.path = path
                    child.author = author
                    child.file_uid = uid
                    self.add_text_info(lang_uid, child.uid, child)

                # A uid ending in "<start>-<end>" covers a range: register
                # one shared entry under every "<prefix><n>" in the range
                # that does not exist yet.
                m = regex.match(r'(.*?)(\d+)-(\d+)$', uid)
                if m:
                    range_textinfo = TextInfo(uid=uid+'#', lang=lang_uid,
                                              path=path, name=name,
                                              author=author,
                                              volpage=volpage,
                                              file_uid=uid)
                    start = int(m[2])
                    end = int(m[3]) + 1
                    # A separate name keeps the file index intact.
                    for number in range(start, end):
                        iuid = m[1] + str(number)
                        if self.exists(iuid, lang_uid):
                            continue
                        self.add_text_info(lang_uid, iuid, range_textinfo)

            except Exception:
                # Name the offending file, then let the error propagate.
                print('An exception occured: {!s}'.format(htmlfile))
                raise

        self._codepoints[lang_uid] = {
            'normal': codepoints,
            'bold': bold_codepoints,
            'italic': italic_codepoints
        }
    finally:
        # Release the pagenumbinator even when a file failed.
        del self._ppn