Example #1
0
    def _paragraphs(self, soup):
        """
        Yield ``Paragraph`` objects parsed from the <p>/<table> elements of soup.

        First pass: detect a "value table" — either a real <table> whose text
        matches a feature-value listing, or a run of consecutive paragraphs of
        the form "1. <label> <count>", "2. ..." — and replace its first element
        with a 'value-table' placeholder <p> (the numbered siblings are removed).

        Second pass: walk all <p>/<table>/<ol>/<ul> elements (skipping those
        nested in list items or table cells), promote leading numbered headings
        to h3/h4, flag the references section, and group consecutive non-empty
        elements into ``Paragraph`` instances, splitting on ``self.BR`` marker
        paragraphs.

        NOTE: regex patterns are raw strings; the pattern bytes are identical
        to the previous non-raw literals, this only silences the invalid-escape
        warnings raised by modern Python for sequences like ``\-`` and ``\s``.
        """
        lines = []
        refs = False

        for e in soup.find_all(['p', 'table']):
            t = text(e)

            if e.name == 'table':
                # A table whose text looks like a value listing is replaced
                # by a placeholder to be rendered later.
                if re.match(r'[\-\s]+excl\s+', t) \
                        or re.match(r'[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

            if e.name == 'p':
                # A run of "<n>. <label> <count>" paragraphs is a value table
                # rendered as plain paragraphs.
                if re.match(r'1\.\s+(.+?)\s+[0-9]+$', t):
                    ex = []
                    for p in next_siblings(e):
                        tt = text(p)
                        if p.name != 'p' or not re.match(
                                r'[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                            break
                        ex.append(p)
                    if ex:
                        for ee in ex:
                            ee.extract()
                        e.replace_with(new_tag(soup, 'p', 'value-table'))
                        break

        for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
            if e.parent.name in ['li', 'td']:
                continue

            br = t == self.BR
            if t in ['References', 'Reference']:
                refs = True
                t = ''
            elif not lines and re.match(r'[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$',
                                        t):
                # Leading "N. Word Word" paragraph: a section heading.
                e.name = 'h3'
            elif not lines and re.match(r'[0-9]+\.[0-9]+\.\s+[A-Z]', t):
                # Leading "N.M. X" paragraph: a sub-section heading.
                e.name = 'h4'
            elif t.endswith('and the APiCS Consortium'):
                # Boilerplate attribution line: drop entirely.
                continue

            if br and not refs:
                # A line-break marker closes the current paragraph group.
                if lines:
                    yield Paragraph(lines)
                    lines = []
            if t and t != self.BR:
                lines.append((e, t, e.name))

        if lines:
            yield Paragraph(lines, refs=refs)
Example #2
0
    def _paragraphs(self, soup):
        """
        Yield ``Paragraph`` objects parsed from the <p>/<table> elements of soup.

        First pass: detect a "value table" — either a real <table> whose text
        matches a feature-value listing, or a run of consecutive paragraphs of
        the form "1. <label> <count>", "2. ..." — and replace its first element
        with a 'value-table' placeholder <p> (the numbered siblings are removed).

        Second pass: walk all <p>/<table>/<ol>/<ul> elements (skipping those
        nested in list items or table cells), promote leading numbered headings
        to h3/h4, flag the references section, and group consecutive non-empty
        elements into ``Paragraph`` instances, splitting on ``self.BR`` marker
        paragraphs.

        NOTE: regex patterns are raw strings; the pattern bytes are identical
        to the previous non-raw literals, this only silences the invalid-escape
        warnings raised by modern Python for sequences like ``\-`` and ``\s``.
        """
        lines = []
        refs = False

        for e in soup.find_all(['p', 'table']):
            t = text(e)

            if e.name == 'table':
                # A table whose text looks like a value listing is replaced
                # by a placeholder to be rendered later.
                if re.match(r'[\-\s]+excl\s+', t) \
                        or re.match(r'[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

            if e.name == 'p':
                # A run of "<n>. <label> <count>" paragraphs is a value table
                # rendered as plain paragraphs.
                if re.match(r'1\.\s+(.+?)\s+[0-9]+$', t):
                    ex = []
                    for p in next_siblings(e):
                        tt = text(p)
                        if p.name != 'p' or not re.match(r'[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                            break
                        ex.append(p)
                    if ex:
                        for ee in ex:
                            ee.extract()
                        e.replace_with(new_tag(soup, 'p', 'value-table'))
                        break

        for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
            if e.parent.name in ['li', 'td']:
                continue

            br = t == self.BR
            if t in ['References', 'Reference']:
                refs = True
                t = ''
            elif not lines and re.match(r'[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', t):
                # Leading "N. Word Word" paragraph: a section heading.
                e.name = 'h3'
            elif not lines and re.match(r'[0-9]+\.[0-9]+\.\s+[A-Z]', t):
                # Leading "N.M. X" paragraph: a sub-section heading.
                e.name = 'h4'
            elif t.endswith('and the APiCS Consortium'):
                # Boilerplate attribution line: drop entirely.
                continue

            if br and not refs:
                # A line-break marker closes the current paragraph group.
                if lines:
                    yield Paragraph(lines)
                    lines = []
            if t and t != self.BR:
                lines.append((e, t, e.name))

        if lines:
            yield Paragraph(lines, refs=refs)
Example #3
0
    def __call__(self, outdir):
        """
        Run the full parser workflow on ``self.fname`` and write results.

        The workflow consists of
        - preprocess (string-level fixes before parsing),
        - refactor (DOM-level restructuring, collecting metadata),
        - postprocess (string-level fixes on the serialized body),

        and writes three files to ``outdir``: an HTML file, a CSS file
        (styles extracted from the document head) and a JSON file holding
        the collected metadata (outline, references, authors).

        :param outdir: directory path object; must support ``joinpath``.
        """
        # cssutils logs noisily about vendor-specific CSS; silence it.
        cssutils_logger = logging.getLogger('CSSUTILS')
        cssutils_logger.setLevel(logging.ERROR)
        # NOTE(review): `.namebase` and the encoded print suggest Python 2
        # with a path library — confirm before porting.
        print(self.fname.namebase.encode('utf8'))

        with open(self.fname, encoding='utf8') as fp:
            c = fp.read()
        soup = BeautifulSoup(self.preprocess(self._preprocess(c)))

        # extract css from the head section of the HTML doc:
        css = cssutils.parseString('\n')
        for style in soup.find('head').find_all('style'):
            for rule in self.cssrules(style):
                css.add(rule)

        md = dict(outline=[], refs=[], authors=[])
        soup = self.refactor(soup, md)

        # enhance section headings: give each h3 an id, record it in the
        # outline, and append "go to top" / permalink anchors.
        for section, t in tag_and_text(soup.find_all('h3')):
            t = t.split('[Note')[0]  # strip trailing note markers from the title
            id_ = 'section-%s' % slug(t)
            md['outline'].append((t, id_))
            section.attrs['id'] = id_
            for s, attrs in [
                (u'\u21eb', {'href': '#top', 'title': 'go to top of the page', 'style': 'vertical-align: bottom'}),
                # NOTE(review): 'ΒΆ' looks like a mis-encoded pilcrow '¶' — verify.
                ('ΒΆ', {'class': 'headerlink', 'href': '#' + id_, 'title': 'Permalink to this section'}),
            ]:
                append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))

        # `unicode` is Python 2 only.
        body = self.insert_links(unicode(soup.find('body')), md)

        # write output files:
        with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
            fp.write(self.wrap(self.postprocess(body)))

        with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
            fp.write(self.csstext(css))

        md['authors'] = list(self.yield_valid_authors(md['authors']))
        jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
Example #4
0
    def refactor(self, soup, md):
        """
        Restructure the parsed HTML document in place and collect metadata.

        Passes, in order:
        - clean style/lang attributes on all body elements,
        - drop empty <p>/<h1>/<h2> elements,
        - wrap 'Zitat' paragraphs in <blockquote>, detect headings by text,
        - demote h1/h2 to h2/h3,
        - unwrap Word 'OLE_LINK' anchors,
        - for dotted ids, pull title/authors from the leading elements,
        - extract the references section into ``md['refs']``,
        - wrap tables in <div class="table">.

        :param soup: BeautifulSoup document, modified in place.
        :param md: metadata dict with 'refs'/'authors' lists; may gain \
        'title', 'authors' and 'refs_comments' entries.
        :return: the modified soup.
        """
        # clean attributes:
        def update_style(current):
            # Filter one style attribute value: drop Word-specific ('mso-')
            # and Junicode font-family rules, map known indent rules to a
            # top margin.
            style = []
            for rule in (current or '').split(';'):
                rule = rule.strip()
                # tab-stops:14.2pt  text-indent:36.0pt
                if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                    rule = 'margin-top:0.4em'
                if normalize_whitespace(rule, repl='') in [
                        'font-family:Junicode',
                        'font-family:JunicodeRegular',
                ]:
                    continue
                if rule and not rule.startswith('mso-'):
                    style.append(rule)
            return ';'.join(style)

        for e in descendants(soup.find('body')):
            update_attr(e, 'style', update_style)
            update_attr(e, 'lang', None)  # drop lang attributes entirely

        # Remove paragraphs/headings whose text is empty.
        for e, t in tag_and_text(descendants(soup.find('body'),
                                             include=['p', 'h1', 'h2']),
                                 non_empty=False):
            if not t:
                e.extract()

        for p in soup.find_all('p'):
            # 'Zitat' (German: quotation) paragraphs become blockquotes.
            if p.attrs.get('class') == ['Zitat']:
                p.wrap(soup.new_tag('blockquote'))
                continue

            if not p.parent.name == 'td':
                # need to detect headings by text, too!
                t = text(p)
                match = self.heading_pattern.match(t.lower())
                if match:
                    p.name = 'h2' if match.group('sub') else 'h1'

        # re-classify section headings:
        # demote h1->h2 and h2->h3 (ascending order avoids double-demotion
        # only because h2 is processed after h1... NOTE(review): h1s just
        # renamed to h2 above WILL be demoted again to h3 — confirm intended.
        for i in range(1, 3):
            for p in soup.find_all('h%s' % i):
                p.name = 'h%s' % (i + 1, )

        # Unwrap Word's OLE_LINK bookmark anchors.
        for p in soup.find_all('a'):
            if p.attrs.get('name', '').startswith('OLE_LINK'):
                p.unwrap()

        top_level_elements = children(soup.find('div'))[:4]
        # Dotted ids denote chapter documents with a title/author preamble.
        if '.' in self.id:
            try:
                assert [e.name for e in top_level_elements
                        ] == ['p', 'p', 'table', 'h3']
            except:  # NOTE(review): bare except + py2 print statements below.
                print top_level_elements[0]
                print top_level_elements[1]
                print top_level_elements[3]
                raise

            md['title'] = text(top_level_elements[0])
            md['authors'] = [
                s for s in re.split(',|&| and ', text(top_level_elements[1]))
            ]
            remove(*top_level_elements[:3])

        # Extract the references section following the 'References' heading.
        refs = soup.find(
            lambda e: e.name == 'h3' and text(e).startswith('References'))
        if refs:
            ex = []  # elements to remove once the section is consumed
            category = None
            for e, t in tag_and_text(next_siblings(refs)):
                if e.name == 'p':
                    if t in REFERENCE_CATEGORIES:
                        category = t
                    elif len(t.split()) < 3:
                        # Too short to be a reference or a category: abort.
                        raise ValueError(t)
                    else:
                        if 'comment' in e.attrs.get('class', []):
                            if 'refs_comments' not in md:
                                md['refs_comments'] = [t]
                            else:
                                md['refs_comments'].append(t)
                        else:
                            if not YEAR.search(t):
                                print t
                            md['refs'].append(
                                self.get_ref(e, category=category))
                    ex.append(e)
                elif e.name in ['h3', 'h4']:
                    # Sub-headings inside the references act as categories.
                    category = t
                    ex.append(e)
            [e.extract() for e in ex + [refs]]

        for t in soup.find_all('table'):
            t.wrap(soup.new_tag('div', **{'class': 'table'}))

        return soup
Example #5
0
    def test_tag_and_text(self):
        from souplib import tag_and_text

        # Each (tag, text) pair yielded for the anchor's descendants must
        # carry a non-empty text component.
        pairs = tag_and_text(self.soup.a.descendants)
        for tag, txt in pairs:
            assert txt
Example #6
0
    def refactor(self, soup, md):
        """
        Restructure the parsed HTML document in place and collect metadata.

        Passes, in order:
        - clean style/lang attributes on all body elements,
        - drop empty <p>/<h1>/<h2> elements,
        - wrap 'Zitat' paragraphs in <blockquote>, detect headings by text,
        - demote h1/h2 to h2/h3,
        - unwrap Word 'OLE_LINK' anchors,
        - for dotted ids, pull title/authors from the leading elements,
        - extract the references section into ``md['refs']``,
        - wrap tables in <div class="table">.

        :param soup: BeautifulSoup document, modified in place.
        :param md: metadata dict with 'refs'/'authors' lists; may gain \
        'title', 'authors' and 'refs_comments' entries.
        :return: the modified soup.
        """
        # clean attributes:
        def update_style(current):
            # Filter one style attribute value: drop Word-specific ('mso-')
            # and Junicode font-family rules, map known indent rules to a
            # top margin.
            style = []
            for rule in (current or '').split(';'):
                rule = rule.strip()
                # tab-stops:14.2pt  text-indent:36.0pt
                if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                    rule = 'margin-top:0.4em'
                if normalize_whitespace(rule, repl='') in [
                    'font-family:Junicode',
                    'font-family:JunicodeRegular',
                ]:
                    continue
                if rule and not rule.startswith('mso-'):
                    style.append(rule)
            return ';'.join(style)

        for e in descendants(soup.find('body')):
            update_attr(e, 'style', update_style)
            update_attr(e, 'lang', None)  # drop lang attributes entirely

        # Remove paragraphs/headings whose text is empty.
        for e, t in tag_and_text(
                descendants(soup.find('body'), include=['p', 'h1', 'h2']),
                non_empty=False):
            if not t:
                e.extract()

        for p in soup.find_all('p'):
            # 'Zitat' (German: quotation) paragraphs become blockquotes.
            if p.attrs.get('class') == ['Zitat']:
                p.wrap(soup.new_tag('blockquote'))
                continue

            if not p.parent.name == 'td':
                # need to detect headings by text, too!
                t = text(p)
                match = self.heading_pattern.match(t.lower())
                if match:
                    p.name = 'h2' if match.group('sub') else 'h1'

        # re-classify section headings:
        # demote h1->h2 and h2->h3. NOTE(review): h1s just renamed to h2
        # above WILL be demoted again to h3 — confirm intended.
        for i in range(1, 3):
            for p in soup.find_all('h%s' % i):
                p.name = 'h%s' % (i + 1,)

        # Unwrap Word's OLE_LINK bookmark anchors.
        for p in soup.find_all('a'):
            if p.attrs.get('name', '').startswith('OLE_LINK'):
                p.unwrap()

        top_level_elements = children(soup.find('div'))[:4]
        # Dotted ids denote chapter documents with a title/author preamble.
        if '.' in self.id:
            try:
                assert [e.name for e in top_level_elements] == ['p', 'p', 'table', 'h3']
            except:  # NOTE(review): bare except + py2 print statements below.
                print top_level_elements[0]
                print top_level_elements[1]
                print top_level_elements[3]
                raise

            md['title'] = text(top_level_elements[0])
            md['authors'] = [s for s in re.split(',|&| and ', text(top_level_elements[1]))]
            remove(*top_level_elements[:3])

        # Extract the references section following the 'References' heading.
        refs = soup.find(lambda e: e.name == 'h3' and text(e).startswith('References'))
        if refs:
            ex = []  # elements to remove once the section is consumed
            category = None
            for e, t in tag_and_text(next_siblings(refs)):
                if e.name == 'p':
                    if t in REFERENCE_CATEGORIES:
                        category = t
                    elif len(t.split()) < 3:
                        # Too short to be a reference or a category: abort.
                        raise ValueError(t)
                    else:
                        if 'comment' in e.attrs.get('class', []):
                            if 'refs_comments' not in md:
                                md['refs_comments'] = [t]
                            else:
                                md['refs_comments'].append(t)
                        else:
                            if not YEAR.search(t):
                                print t
                            md['refs'].append(self.get_ref(e, category=category))
                    ex.append(e)
                elif e.name in ['h3', 'h4']:
                    # Sub-headings inside the references act as categories.
                    category = t
                    ex.append(e)
            [e.extract() for e in ex + [refs]]

        for t in soup.find_all('table'):
            t.wrap(soup.new_tag('div', **{'class': 'table'}))

        return soup