Example 1
import collections

from lxml import html


def analyze_path(path):
    """Survey which classes appear on which tags across a tree of HTML files."""
    by_tag = collections.defaultdict(collections.Counter)
    by_class = collections.defaultdict(collections.Counter)
    pnum_classes = {}
    for infile in path.glob('**/*.html'):
        doc = html.parse(str(infile), encoding='utf8')
        for e in doc.getroot().cssselect('[class]'):
            for class_ in e.attrib['class'].split():
                by_tag[e.tag][class_] += 1
                by_class[class_][e.tag] += 1
                # An element with an id but no text looks like a page-number
                # anchor; tally it under a 'pnum' pseudo-tag for its class.
                if 'id' in e.attrib and not e.text_content():
                    by_class[class_]['pnum'] += 1

    # For each class, record the tag it most often appears on. A class that
    # mostly marks empty id-bearing elements is recorded as a page-number class.
    defaults = {}
    for class_, counter in by_class.items():
        pnum_count = counter['pnum']
        if pnum_count:
            del counter['pnum']
        tag, count = counter.most_common(1)[0]
        defaults[class_] = tag
        if pnum_count > count / 2:
            pnum_classes[class_] = pnum_count

    return {
        'defaults': defaults,
        'by_tag': {tag: dict(val)
                   for tag, val in by_tag.items()},
        'by_class': {class_: dict(val)
                     for class_, val in by_class.items()},
        'pnum_classes': pnum_classes
    }
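
A minimal sketch of how analyze_path might be driven; the directory name and the JSON dump are illustrative additions, not part of the original code:

import json
import pathlib

stats = analyze_path(pathlib.Path('texts'))
print(json.dumps(stats['defaults'], indent=2, sort_keys=True))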
Example 2
import regex
from collections import Counter

from lxml import html

# `sc` (project paths), `cache` and `asciify` come from the surrounding project.


def get_all_pali_words():
    if "pali_words" not in cache:
        words = Counter()

        for file in (sc.text_dir / 'pi' / 'su' / 'mn').glob('**/*.html'):
            doc = html.parse(str(file))
            root = doc.getroot()
            for e in root.cssselect('#metaarea'):
                e.drop_tree()  # exclude the metadata block from the counts
            text = root.text_content()
            # Strip soft hyphens and closing quote marks before tokenizing.
            text = regex.sub(r'[\xad”’]', '', text)
            words_from_text = regex.findall(r'\p{alpha}+', text)
            words.update(words_from_text)
            # Also count each word with its trailing anusvara (ṃ) stripped.
            words.update(word.rstrip('ṃ') for word in words_from_text
                         if word.endswith('ṃ'))
        
        # Group diacritic variants of the same word under their ASCII form,
        # keeping the most common variant first.
        result = {}
        for word, count in words.most_common():
            asc_word = asciify(word)
            if asc_word not in result:
                result[asc_word] = ((word, count),)
            else:
                result[asc_word] = result[asc_word] + ((word, count),)

        cache["pali_words"] = result

    return cache["pali_words"]
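
The returned mapping groups the diacritic variants of a word under its ASCII form, most frequent first. A hypothetical lookup (the key and counts are invented for illustration):

words = get_all_pali_words()
for variant, count in words.get('bhikkhu', ()):
    print(variant, count)  # e.g. bhikkhū 1520, then bhikkhu 980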
Example 3
    def process(self, srcfo):
        skipped = []
        found = []
        pinyin_match = regex.compile(r'Pinyin: (.*)').match
        # The source file is not a true XML tree so we need to jump
        # through some hoops. Yay, hoops!
        with tempfile.NamedTemporaryFile('w+') as truetree:
            truetree.write('<root>')
            truetree.writelines(srcfo)
            truetree.write('</root>')
            truetree.flush()
            truetree.seek(0)
            root = html.parse(truetree.name).getroot()
        self.root = root
        entries = []
        for entry in root.iter('entry'):
            head = pinyin = meaning = None

            try:
                head = entry.text.strip()
                for e in entry:
                    m = pinyin_match(e.text_content())
                    if m:
                        pinyin = m[1]
                        break

                meaning = entry.select_one('b').tail.lstrip(': ')

                if not head or not pinyin or not meaning:
                    logging.warning(
                        'Incomplete buddhdic entry: {!s}'.format(entry))

                # issuperset over a string: every character of the headword
                # must already be present in self.existing.
                if self.existing.issuperset(head):
                    entries.append('"{}": {}'.format(head, [pinyin, meaning]))
                    self.seen.update(head)
                    found.append((head, meaning))
                else:
                    skipped.append((head, meaning))
            except Exception:
                print(head, pinyin, meaning)
                print(str(entry))
        if skipped:
            logging.info(
                '{} entries do and {} entries do not appear in SuttaCentral texts'
                .format(len(found), len(skipped)))
            if self.args.verbose:
                logging.info('Entries which do not appear: ')
                logging.info(', '.join('{}: {}'.format(head, meaning)
                                       for head, meaning in skipped))
        return 'sc.lzh2enData = {\n' + ',\n'.join(entries) + '\n}'
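
The temporary-file detour wraps the source lines in a single <root> element so lxml sees one well-formed tree. A self-contained sketch of the same wrap-in-a-root trick, using an in-memory buffer and made-up markup:

import io

from lxml import html

fragment = io.StringIO('<root><entry>a</entry><entry>b</entry></root>')
root = html.parse(fragment).getroot()
print([e.text for e in root.iter('entry')])  # ['a', 'b']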
Example 4
    def build(self, lang_dir, force=False):
        # The pagenumbinator should be scoped because it uses
        # a large chunk of memory which should be gc'd.
        # But it shouldn't be created at all if we don't need it.
        # So we use a getter, and delete it when we are done.
        self._ppn = None
        
        codepoints = set()
        bold_codepoints = set()
        italic_codepoints = set()        

        lang_uid = lang_dir.stem
        all_files = sorted(lang_dir.glob('**/*.html'),
                           key=lambda f: sc.util.numericsortkey(f.stem))
        # Process the metadata file (if any) first; texts may inherit its author.
        files = ([f for f in all_files if f.stem == 'metadata']
                 + [f for f in all_files if f.stem != 'metadata'])
        for i, htmlfile in enumerate(files):
         try:
            if not self._should_process_file(htmlfile, force):
                continue
            logger.info('Adding file: {!s}'.format(htmlfile))
            uid = htmlfile.stem
            root = html.parse(str(htmlfile)).getroot()
            
            # Set codepoint data: text inside bold/italic elements goes into
            # the bold/italic codepoint sets; other elements are recursed into.
            _stack = [root]
            while _stack:
                e = _stack.pop()
                if self.is_bold(lang_uid, e):
                    bold_codepoints.update(e.text_content())
                elif self.is_italic(lang_uid, e):
                    italic_codepoints.update(e.text_content())
                else:
                    _stack.extend(e)
            codepoints.update(root.text_content())
            
            # Set the previous and next uids, using explicit data
            # if available, otherwise making a safe guess.
            # The safe guess relies on comparing uids, and will not
            # capture relationships such as the order of patimokha
            # rules.
            prev_uid = root.get('data-prev')
            next_uid = root.get('data-next')
            if not (prev_uid or next_uid):
                if i > 0:
                    prev_uid = files[i - 1].stem
                    if not self.uids_are_related(uid, prev_uid):
                        prev_uid = None
                if i + 1 < len(files):
                    next_uid = files[i + 1].stem
                    if not self.uids_are_related(uid, next_uid):
                        next_uid = None
            
            path = htmlfile.relative_to(sc.text_dir)
            author = self._get_author(root, lang_uid, uid)
            
            if uid == 'metadata':
                if author is None:
                    raise ValueError('Metadata file {} does not define author'.format(path))
                self.add_metadata(path, author, root)
                continue
            
            if author is None:
                metadata = self.get_metadata(path)
                if metadata:
                    author = metadata['author']
            
            if author is None:
                metadata = root.select_one('#metaarea')
                if metadata:
                    metadata_text = metadata.text_content()
                    m = regex.match(r'.{,80}\.', metadata_text)
                    if not m:
                        m = regex.match(r'.{,80}(?=\s)', metadata_text)
                    if m:
                        author = m[0]
                        
            if author is None:
                logger.warning('Could not determine author for {}/{}'.format(lang_uid, uid))
                author = ''
            
            name = self._get_name(root, lang_uid, uid)
            volpage = self._get_volpage(root, lang_uid, uid)
            embedded = self._get_embedded_uids(root, lang_uid, uid)
            
            fstat = htmlfile.stat()
            cdate = self.datestr(fstat.st_ctime)
            mdate = self.datestr(fstat.st_mtime)

            textinfo = TextInfo(uid=uid, lang=lang_uid, path=path, 
                                name=name, author=author,
                                volpage=volpage, prev_uid=prev_uid,
                                next_uid=next_uid,
                                cdate=cdate,
                                mdate=mdate,
                                file_uid=uid)
            self.add_text_info(lang_uid, uid, textinfo)

            for child in embedded:
                child.path = path
                child.author = author
                child.file_uid = uid
                self.add_text_info(lang_uid, child.uid, child)

            # A uid like 'sn1.1-10' denotes a range; register every member uid.
            m = regex.match(r'(.*?)(\d+)-(\d+)$', uid)
            if m:
                range_textinfo = TextInfo(uid=uid + '#',
                                          lang=lang_uid,
                                          path=path,
                                          name=name,
                                          author=author,
                                          volpage=volpage,
                                          file_uid=uid)
                start = int(m[2])
                end = int(m[3]) + 1
                for num in range(start, end):
                    iuid = m[1] + str(num)
                    if self.exists(iuid, lang_uid):
                        continue

                    self.add_text_info(lang_uid, iuid, range_textinfo)
        
         except Exception:
             print('An exception occurred: {!s}'.format(htmlfile))
             raise
        
        self._codepoints[lang_uid] = {
            'normal': codepoints,
            'bold': bold_codepoints,
            'italic': italic_codepoints
        }
        
        del self._ppn
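
The range handling near the end is the subtle part: a uid such as 'sn1.1-10' is fanned out so every member uid resolves to the same text. A standalone sketch of that expansion (the sample uid is invented):

import regex

m = regex.match(r'(.*?)(\d+)-(\d+)$', 'sn1.1-10')
if m:
    uids = [m[1] + str(n) for n in range(int(m[2]), int(m[3]) + 1)]
    print(uids)  # ['sn1.1', 'sn1.2', ..., 'sn1.10']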
Example 5
        replacement = line[2]
        if not replacement:
            continue
        if args.hyphenate:
            replacement = hyphenate(replacement, args.hyphenate)
        mapping[original] = replacement


def replace_word_from_mapping(m):
    # m is a single-word regex match; swap it for its mapped replacement,
    # unless dry-run mode is on or the word has no mapping.
    word = m[0]
    if not args.no_act and word in mapping:
        return mapping[word]
    return word

def process_text(text):
    if not text:
        return text
    return word_rex.sub(replace_word_from_mapping, text)

if args.source.is_dir():
    files = sorted(args.source.glob('**/*.html'), key=numericsortkey)
else:
    files = [args.source]
for file in files:
    doc = html.parse(str(file))
    root = doc.getroot()
    process_node(root, process_text)
    if not args.no_act:
        # Rewrite the file in place (skipped in dry-run / no-act mode).
        doc.write(str(file), method='html', encoding='utf8')
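
A minimal sketch of the substitution mechanics in isolation; the mapping and the word pattern are invented stand-ins for what the real script builds from its inputs:

import regex

mapping = {'colour': 'color'}
word_rex = regex.compile(r'\p{alpha}+')

def replace_word(m):
    return mapping.get(m[0], m[0])

print(word_rex.sub(replace_word, 'colour theory'))  # color theory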
Example 6
    def build(self, force=False):
        # The pagenumbinator should be scoped because it uses
        # a large chunk of memory which should be gc'd.
        # But it shouldn't be created at all if we don't need it.
        # So we use a getter, and delete it when we are done.
        self._ppn = None
        file_i = 0
        file_of_total_i = 0
        percent = 0
        file_count = sum(1 for _ in sc.text_dir.glob('**/*.html'))
        for lang_dir in sc.text_dir.glob('*'):
            lang_uid = lang_dir.stem
            files = sorted(lang_dir.glob('**/*.html'), key=lambda f: sc.util.numericsortkey(f.stem))
            for i, htmlfile in enumerate(files):
             try:
                if not self._should_process_file(htmlfile, force):
                    continue
                logger.info('Adding file: {!s}'.format(htmlfile))
                uid = htmlfile.stem
                root = html.parse(str(htmlfile)).getroot()

                # Set the previous and next uids, using explicit data
                # if available, otherwise making a safe guess.
                # The safe guess relies on comparing uids, and will not
                # capture relationships such as the order of patimokha
                # rules.
                prev_uid = root.get('data-prev')
                next_uid = root.get('data-next')
                if not (prev_uid or next_uid):
                    if i > 0:
                        prev_uid = files[i - 1].stem
                        if not self.uids_are_related(uid, prev_uid):
                            prev_uid = None
                    if i + 1 < len(files):
                        next_uid = files[i + 1].stem
                        if not self.uids_are_related(uid, next_uid):
                            next_uid = None
                
                path = htmlfile.relative_to(sc.text_dir)
                author = self._get_author(root, lang_uid, uid)
                name = self._get_name(root, lang_uid, uid)
                volpage = self._get_volpage(root, lang_uid, uid)
                embedded = self._get_embedded_uids(root, lang_uid, uid)
                
                fstat = htmlfile.stat()
                cdate = self.datestr(fstat.st_ctime)
                mdate = self.datestr(fstat.st_mtime)

                textinfo = TextInfo(uid=uid, lang=lang_uid, path=path, 
                                    name=name, author=author,
                                    volpage=volpage, prev_uid=prev_uid,
                                    next_uid=next_uid,
                                    cdate=cdate,
                                    mdate=mdate)
                self.add_text_info(lang_uid, uid, textinfo)

                for child in embedded:
                    child.path = path
                    child.author = author
                    self.add_text_info(lang_uid, child.uid, child)

                # A uid like 'sn1.1-10' denotes a range; register every member uid.
                m = regex.match(r'(.*?)(\d+)-(\d+)$', uid)
                if m:
                    range_textinfo = TextInfo(uid=uid + '#', lang=lang_uid, path=path,
                                              name=name, author=author, volpage=volpage)
                    start = int(m[2])
                    end = int(m[3]) + 1
                    for num in range(start, end):
                        iuid = m[1] + str(num)
                        if self.exists(iuid, lang_uid):
                            continue

                        self.add_text_info(lang_uid, iuid, range_textinfo)
                file_i += 1
                if (file_i % self.FILES_N) == 0:
                    self._on_n_files()
                file_of_total_i += 1
                # Report progress whenever the rounded percentage advances.
                new_percent = int(0.5 + 100 * file_of_total_i / file_count)
                if new_percent > percent:
                    percent = new_percent
                    self.build_process(percent)
             except Exception:
                 print('An exception occurred: {!s}'.format(htmlfile))
                 raise
        if (file_i % self.FILES_N) != 0:
            self._on_n_files()
        
        del self._ppn
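
The progress reporting rounds to the nearest whole percent and only fires when that number changes, so build_process is called at most 100 times regardless of the file count. A tiny sketch of the idiom (the total is invented):

file_count = 7
percent = 0
for done in range(1, file_count + 1):
    new_percent = int(0.5 + 100 * done / file_count)
    if new_percent > percent:
        percent = new_percent
        print('{}%'.format(percent))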