Example #1
0
 def _get_name(self, root, lang_uid, uid):
     """Return the display name of a text, taken from its heading.

     The name is the text content of the first ``h1`` inside the first
     ``.hgroup`` element of *root*, with any leading non-alphabetic
     characters (numbering, punctuation, whitespace) stripped.

     Args:
         root: Parsed document root; must provide ``select_one``.
         lang_uid: Language uid, used only in the warning message.
         uid: Text uid, used only in the warning message.

     Returns:
         The cleaned heading text, or ``''`` when it cannot be determined
         (a warning is logged in that case).
     """
     try:
         hgroup = root.select_one('.hgroup')
         h1 = hgroup.select_one('h1')
         # \P{alpha} matches any non-alphabetic character, so this drops
         # everything before the first letter of the heading.
         return regex.sub(r'^\P{alpha}*', '', h1.text_content())
     except Exception:
         # Deliberately best-effort: a missing or malformed heading must
         # not abort processing; report it and fall back to no name.
         logger.warning('Could not determine name for {}/{}'.format(lang_uid, uid))
         return ''
Example #2
0
 def _get_name(self, root, lang_uid, uid):
     """Return the display name of a text, taken from its heading.

     The name is the text content of the first ``h1`` inside the first
     ``.hgroup`` element of *root*, with any leading non-alphabetic
     characters (numbering, punctuation, whitespace) stripped.

     Args:
         root: Parsed document root; must provide ``select_one``.
         lang_uid: Language uid, used only in the warning message.
         uid: Text uid, used only in the warning message.

     Returns:
         The cleaned heading text, or ``''`` when it cannot be determined
         (a warning is logged in that case).
     """
     try:
         hgroup = root.select_one('.hgroup')
         h1 = hgroup.select_one('h1')
         # \P{alpha} matches any non-alphabetic character, so this drops
         # everything before the first letter of the heading.
         return regex.sub(r'^\P{alpha}*', '', h1.text_content())
     except Exception:
         # Deliberately best-effort: a missing or malformed heading must
         # not abort processing; report it and fall back to no name.
         logger.warning('Could not determine name for {}/{}'.format(lang_uid, uid))
         return ''
Example #3
0
 def _get_author(self, root, lang_uid, uid):
     try:
         e = root.select_one('meta[author]')
         if e:
             return e.attrib['author']
         
         e = root.select_one('meta[data-author]')
         if e:
             return e.attrib['data-author']
             
         e = root.select_one('#metaarea > .author')
         if e:
             return e.text
         raise ValueError('No author found')
     except Exception as e:
         logger.warn('Could not determine author for {}/{}'.format(lang_uid, uid))
         return ''
Example #4
0
    def build(self, lang_dir, force=False):
        """Index every HTML file below *lang_dir* for one language.

        For each file that ``_should_process_file`` accepts, this collects
        the characters used (overall, bold and italic), works out the
        previous/next uids, author, name and volpage, and registers a
        ``TextInfo`` for the file, for each embedded uid, and for each
        individual number of a ranged uid (``...N-M``) that does not
        already exist.  Files whose stem is ``metadata`` are handled
        first and only feed ``add_metadata``.

        Args:
            lang_dir: Directory of one language; its ``stem`` is used as
                the language uid and it is globbed for ``**/*.html``.
            force: Passed through to ``_should_process_file``.

        Raises:
            ValueError: If a metadata file yields an author of ``None``.
            Exception: Any error raised while processing a file is
                re-raised after the offending file has been printed.
        """
        # The pagenumbinator should be scoped because it uses
        # a large chunk of memory which should be gc'd.
        # But it shouldn't be created at all if we don't need it.
        # So we use a getter, and delete it when we are done.
        self._ppn = None
        try:
            codepoints = set()
            bold_codepoints = set()
            italic_codepoints = set()

            lang_uid = lang_dir.stem
            all_files = sorted(lang_dir.glob('**/*.html'), key=lambda f: sc.util.numericsortkey(f.stem))
            # Metadata files go first so their data is registered before
            # the texts that may need to look it up.
            files = [f for f in all_files if f.stem == 'metadata'] + [f for f in all_files if f.stem != 'metadata']
            for i, htmlfile in enumerate(files):
                try:
                    if not self._should_process_file(htmlfile, force):
                        continue
                    logger.info('Adding file: {!s}'.format(htmlfile))
                    uid = htmlfile.stem
                    root = html.parse(str(htmlfile)).getroot()

                    # Set codepoint data.  Walk the tree depth-first and
                    # stop descending once an element is bold or italic:
                    # all text below it is attributed to that style.
                    pending = [root]
                    while pending:
                        element = pending.pop()
                        if self.is_bold(lang_uid, element):
                            bold_codepoints.update(element.text_content())
                        elif self.is_italic(lang_uid, element):
                            italic_codepoints.update(element.text_content())
                        else:
                            pending.extend(element)
                    codepoints.update(root.text_content())

                    # Set the previous and next uids, using explicit data
                    # if available, otherwise making a safe guess.
                    # The safe guess relies on comparing uids, and will not
                    # capture relationships such as the order of patimokha
                    # rules.
                    prev_uid = root.get('data-prev')
                    next_uid = root.get('data-next')
                    if not (prev_uid or next_uid):
                        if i > 0:
                            prev_uid = files[i - 1].stem
                            if not self.uids_are_related(uid, prev_uid):
                                prev_uid = None
                        if i + 1 < len(files):
                            next_uid = files[i + 1].stem
                            if not self.uids_are_related(uid, next_uid):
                                next_uid = None

                    path = htmlfile.relative_to(sc.text_dir)
                    author = self._get_author(root, lang_uid, uid)

                    # NOTE(review): the fallbacks below only run when
                    # ``_get_author`` returns None.  If it reports failure
                    # as '' instead, none of them will trigger -- confirm
                    # which sentinel ``_get_author`` is meant to use.
                    if uid == 'metadata':
                        if author is None:
                            raise ValueError('Metadata file {} does not define author'.format(path))
                        self.add_metadata(path, author, root)
                        continue

                    if author is None:
                        metadata = self.get_metadata(path)
                        if metadata:
                            author = metadata['author']

                    if author is None:
                        # Explicit None test: element truthiness may depend
                        # on the number of child elements (lxml's default),
                        # which says nothing about whether it was found.
                        metaarea = root.select_one('#metaarea')
                        if metaarea is not None:
                            metadata_text = metaarea.text_content()
                            # Use the first sentence (at most 80 characters
                            # before its full stop), else the longest
                            # leading run of up to 80 characters that ends
                            # just before whitespace.
                            m = regex.match(r'.{,80}\.', metadata_text)
                            if not m:
                                m = regex.match(r'.{,80}(?=\s)', metadata_text)
                            if m:
                                author = m[0]

                    if author is None:
                        logger.warning('Could not determine author for {}/{}'.format(lang_uid, uid))
                        author = ''

                    name = self._get_name(root, lang_uid, uid)
                    volpage = self._get_volpage(root, lang_uid, uid)
                    embedded = self._get_embedded_uids(root, lang_uid, uid)

                    fstat = htmlfile.stat()
                    cdate = self.datestr(fstat.st_ctime)
                    mdate = self.datestr(fstat.st_mtime)

                    textinfo = TextInfo(uid=uid, lang=lang_uid, path=path,
                                        name=name, author=author,
                                        volpage=volpage, prev_uid=prev_uid,
                                        next_uid=next_uid,
                                        cdate=cdate,
                                        mdate=mdate,
                                        file_uid=uid)
                    self.add_text_info(lang_uid, uid, textinfo)

                    # Texts embedded in this file share its path and author.
                    for child in embedded:
                        child.path = path
                        child.author = author
                        child.file_uid = uid
                        self.add_text_info(lang_uid, child.uid, child)

                    # A uid ending in "<start>-<end>" covers a range of
                    # texts: register one shared entry under every
                    # individual uid in the range that is not yet known.
                    m = regex.match(r'(.*?)(\d+)-(\d+)$', uid)
                    if m:
                        range_textinfo = TextInfo(uid=uid+'#',
                                                  lang=lang_uid,
                                                  path=path,
                                                  name=name,
                                                  author=author,
                                                  volpage=volpage,
                                                  file_uid=uid)
                        start = int(m[2])
                        end = int(m[3]) + 1
                        # A distinct name keeps the file index ``i`` of
                        # the enclosing loop intact.
                        for number in range(start, end):
                            iuid = m[1] + str(number)
                            if self.exists(iuid, lang_uid):
                                continue

                            self.add_text_info(lang_uid, iuid, range_textinfo)

                except Exception:
                    # Identify the offending file, then let the error
                    # propagate unchanged.
                    print('An exception occured: {!s}'.format(htmlfile))
                    raise

            self._codepoints[lang_uid] = {
                'normal': codepoints,
                'bold': bold_codepoints,
                'italic': italic_codepoints
            }
        finally:
            # Release the pagenumbinator even when a file fails, so its
            # memory is not kept alive by a half-finished build.
            del self._ppn
Example #5
0
    def build(self, lang_dir, force=False):
        """Index every HTML file below *lang_dir* for one language.

        For each file that ``_should_process_file`` accepts, this collects
        the characters used (overall, bold and italic), works out the
        previous/next uids, author, name and volpage, and registers a
        ``TextInfo`` for the file, for each embedded uid, and for each
        individual number of a ranged uid (``...N-M``) that does not
        already exist.  Files whose stem is ``metadata`` are handled
        first and only feed ``add_metadata``.

        Args:
            lang_dir: Directory of one language; its ``stem`` is used as
                the language uid and it is globbed for ``**/*.html``.
            force: Passed through to ``_should_process_file``.

        Raises:
            ValueError: If a metadata file yields an author of ``None``.
            Exception: Any error raised while processing a file is
                re-raised after the offending file has been printed.
        """
        # The pagenumbinator should be scoped because it uses
        # a large chunk of memory which should be gc'd.
        # But it shouldn't be created at all if we don't need it.
        # So we use a getter, and delete it when we are done.
        self._ppn = None
        try:
            codepoints = set()
            bold_codepoints = set()
            italic_codepoints = set()

            lang_uid = lang_dir.stem
            all_files = sorted(lang_dir.glob('**/*.html'), key=lambda f: sc.util.numericsortkey(f.stem))
            # Metadata files go first so their data is registered before
            # the texts that may need to look it up.
            files = [f for f in all_files if f.stem == 'metadata'] + [f for f in all_files if f.stem != 'metadata']
            for i, htmlfile in enumerate(files):
                try:
                    if not self._should_process_file(htmlfile, force):
                        continue
                    logger.info('Adding file: {!s}'.format(htmlfile))
                    uid = htmlfile.stem
                    root = html.parse(str(htmlfile)).getroot()

                    # Set codepoint data.  Walk the tree depth-first and
                    # stop descending once an element is bold or italic:
                    # all text below it is attributed to that style.
                    pending = [root]
                    while pending:
                        element = pending.pop()
                        if self.is_bold(lang_uid, element):
                            bold_codepoints.update(element.text_content())
                        elif self.is_italic(lang_uid, element):
                            italic_codepoints.update(element.text_content())
                        else:
                            pending.extend(element)
                    codepoints.update(root.text_content())

                    # Set the previous and next uids, using explicit data
                    # if available, otherwise making a safe guess.
                    # The safe guess relies on comparing uids, and will not
                    # capture relationships such as the order of patimokha
                    # rules.
                    prev_uid = root.get('data-prev')
                    next_uid = root.get('data-next')
                    if not (prev_uid or next_uid):
                        if i > 0:
                            prev_uid = files[i - 1].stem
                            if not self.uids_are_related(uid, prev_uid):
                                prev_uid = None
                        if i + 1 < len(files):
                            next_uid = files[i + 1].stem
                            if not self.uids_are_related(uid, next_uid):
                                next_uid = None

                    path = htmlfile.relative_to(sc.text_dir)
                    author = self._get_author(root, lang_uid, uid)

                    # NOTE(review): the fallbacks below only run when
                    # ``_get_author`` returns None.  If it reports failure
                    # as '' instead, none of them will trigger -- confirm
                    # which sentinel ``_get_author`` is meant to use.
                    if uid == 'metadata':
                        if author is None:
                            raise ValueError('Metadata file {} does not define author'.format(path))
                        self.add_metadata(path, author, root)
                        continue

                    if author is None:
                        metadata = self.get_metadata(path)
                        if metadata:
                            author = metadata['author']

                    if author is None:
                        # Explicit None test: element truthiness may depend
                        # on the number of child elements (lxml's default),
                        # which says nothing about whether it was found.
                        metaarea = root.select_one('#metaarea')
                        if metaarea is not None:
                            metadata_text = metaarea.text_content()
                            # Use the first sentence (at most 80 characters
                            # before its full stop), else the longest
                            # leading run of up to 80 characters that ends
                            # just before whitespace.
                            m = regex.match(r'.{,80}\.', metadata_text)
                            if not m:
                                m = regex.match(r'.{,80}(?=\s)', metadata_text)
                            if m:
                                author = m[0]

                    if author is None:
                        logger.warning('Could not determine author for {}/{}'.format(lang_uid, uid))
                        author = ''

                    name = self._get_name(root, lang_uid, uid)
                    volpage = self._get_volpage(root, lang_uid, uid)
                    embedded = self._get_embedded_uids(root, lang_uid, uid)

                    fstat = htmlfile.stat()
                    cdate = self.datestr(fstat.st_ctime)
                    mdate = self.datestr(fstat.st_mtime)

                    textinfo = TextInfo(uid=uid, lang=lang_uid, path=path,
                                        name=name, author=author,
                                        volpage=volpage, prev_uid=prev_uid,
                                        next_uid=next_uid,
                                        cdate=cdate,
                                        mdate=mdate,
                                        file_uid=uid)
                    self.add_text_info(lang_uid, uid, textinfo)

                    # Texts embedded in this file share its path and author.
                    for child in embedded:
                        child.path = path
                        child.author = author
                        child.file_uid = uid
                        self.add_text_info(lang_uid, child.uid, child)

                    # A uid ending in "<start>-<end>" covers a range of
                    # texts: register one shared entry under every
                    # individual uid in the range that is not yet known.
                    m = regex.match(r'(.*?)(\d+)-(\d+)$', uid)
                    if m:
                        range_textinfo = TextInfo(uid=uid+'#',
                                                  lang=lang_uid,
                                                  path=path,
                                                  name=name,
                                                  author=author,
                                                  volpage=volpage,
                                                  file_uid=uid)
                        start = int(m[2])
                        end = int(m[3]) + 1
                        # A distinct name keeps the file index ``i`` of
                        # the enclosing loop intact.
                        for number in range(start, end):
                            iuid = m[1] + str(number)
                            if self.exists(iuid, lang_uid):
                                continue

                            self.add_text_info(lang_uid, iuid, range_textinfo)

                except Exception:
                    # Identify the offending file, then let the error
                    # propagate unchanged.
                    print('An exception occured: {!s}'.format(htmlfile))
                    raise

            self._codepoints[lang_uid] = {
                'normal': codepoints,
                'bold': bold_codepoints,
                'italic': italic_codepoints
            }
        finally:
            # Release the pagenumbinator even when a file fails, so its
            # memory is not kept alive by a half-finished build.
            del self._ppn