Beispiel #1
0
    def test_descendants(self):
        """Check `descendants` on the fixture's `a` element.

        The expected result is the nested `b` element followed by its
        nested `c` element.
        """
        from souplib import descendants

        # `assertEquals` is a deprecated alias (removed in Python 3.12);
        # `assertEqual` is available on every supported version.
        self.assertEqual(
            descendants(self.soup.a), [self.soup.a.b, self.soup.a.b.c])
Beispiel #2
0
    def refactor(self, soup, md):
        """Clean up the parsed document `soup` and collect metadata in `md`.

        The tree is modified in place: `style` and `lang` attributes are
        passed through `update_attr`, `p`/`h1`/`h2` elements without text are
        extracted, paragraphs are re-tagged as headings or wrapped in a
        `blockquote`, `OLE_LINK` anchors are unwrapped, the references
        section is moved into `md` and every table is wrapped in a
        `div class="table"`.

        :param soup: parsed document offering `find`, `find_all`, `new_tag`.
        :param md: metadata mapping, updated in place. `title` and `authors` \
        are only set when `self.id` contains a dot; `refs` and \
        `refs_comments` are filled from the references section.
        :return: the modified `soup`.
        :raises AssertionError: if `self.id` contains a dot and the first \
        four top-level elements are not `p`, `p`, `table`, `h3`.
        :raises ValueError: if a paragraph of the references section is not \
        a known category and has fewer than three words.
        """
        def update_style(current):
            """Return the style attribute value `current` with rules cleaned."""
            style = []
            for rule in (current or '').split(';'):
                rule = rule.strip()
                # tab-stops:14.2pt  text-indent:36.0pt
                if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                    rule = 'margin-top:0.4em'
                if normalize_whitespace(rule, repl='') in [
                        'font-family:Junicode',
                        'font-family:JunicodeRegular',
                ]:
                    continue
                # Empty rules and `mso-` prefixed rules are dropped.
                if rule and not rule.startswith('mso-'):
                    style.append(rule)
            return ';'.join(style)

        # clean attributes:
        for e in descendants(soup.find('body')):
            update_attr(e, 'style', update_style)
            update_attr(e, 'lang', None)

        # Drop paragraphs and headings that have no text.
        for e, t in tag_and_text(descendants(soup.find('body'),
                                             include=['p', 'h1', 'h2']),
                                 non_empty=False):
            if not t:
                e.extract()

        for p in soup.find_all('p'):
            if p.attrs.get('class') == ['Zitat']:
                p.wrap(soup.new_tag('blockquote'))
                continue

            if p.parent.name != 'td':
                # need to detect headings by text, too!
                match = self.heading_pattern.match(text(p).lower())
                if match:
                    p.name = 'h2' if match.group('sub') else 'h1'

        # re-classify section headings:
        # NOTE(review): the levels are processed in ascending order, so an
        # element renamed from h1 to h2 appears to be picked up again by the
        # h2 pass and end up as h3. The code below only looks for h3/h4, so
        # this is left unchanged -- confirm that it is intended.
        for i in range(1, 3):
            for p in soup.find_all('h%s' % i):
                p.name = 'h%s' % (i + 1, )

        for p in soup.find_all('a'):
            if p.attrs.get('name', '').startswith('OLE_LINK'):
                p.unwrap()

        top_level_elements = children(soup.find('div'))[:4]
        if '.' in self.id:
            names = [e.name for e in top_level_elements]
            # Explicit check instead of `assert` so it is not stripped when
            # running with -O.
            if names != ['p', 'p', 'table', 'h3']:
                # Print the same elements as before (index 2 is skipped), but
                # only those that exist, so that a short list does not raise
                # an IndexError that hides the real problem.
                for index in (0, 1, 3):
                    if index < len(top_level_elements):
                        print(top_level_elements[index])
                raise AssertionError(
                    'unexpected top-level elements: %r' % (names, ))

            md['title'] = text(top_level_elements[0])
            md['authors'] = re.split(
                ',|&| and ', text(top_level_elements[1]))
            remove(*top_level_elements[:3])

        refs = soup.find(
            lambda e: e.name == 'h3' and text(e).startswith('References'))
        if refs:
            ex = []  # elements to take out of the tree afterwards
            category = None
            for e, t in tag_and_text(next_siblings(refs)):
                if e.name == 'p':
                    if t in REFERENCE_CATEGORIES:
                        category = t
                    elif len(t.split()) < 3:
                        raise ValueError(t)
                    elif 'comment' in e.attrs.get('class', []):
                        if 'refs_comments' not in md:
                            md['refs_comments'] = []
                        md['refs_comments'].append(t)
                    else:
                        if not YEAR.search(t):
                            print(t)
                        # Initialise the list like `refs_comments` instead of
                        # relying on the caller to provide it.
                        if 'refs' not in md:
                            md['refs'] = []
                        md['refs'].append(
                            self.get_ref(e, category=category))
                    ex.append(e)
                elif e.name in ['h3', 'h4']:
                    category = t
                    ex.append(e)
            for e in ex + [refs]:
                e.extract()

        for t in soup.find_all('table'):
            t.wrap(soup.new_tag('div', **{'class': 'table'}))

        return soup
Beispiel #3
0
    def refactor(self, soup, md):
        """Clean up the parsed document `soup` and collect metadata in `md`.

        The tree is modified in place: `style` and `lang` attributes are
        passed through `update_attr`, `p`/`h1`/`h2` elements without text are
        extracted, paragraphs are re-tagged as headings or wrapped in a
        `blockquote`, `OLE_LINK` anchors are unwrapped, the references
        section is moved into `md` and every table is wrapped in a
        `div class="table"`.

        :param soup: parsed document offering `find`, `find_all`, `new_tag`.
        :param md: metadata mapping, updated in place. `title` and `authors` \
        are only set when `self.id` contains a dot; `refs` and \
        `refs_comments` are filled from the references section.
        :return: the modified `soup`.
        :raises AssertionError: if `self.id` contains a dot and the first \
        four top-level elements are not `p`, `p`, `table`, `h3`.
        :raises ValueError: if a paragraph of the references section is not \
        a known category and has fewer than three words.
        """
        def update_style(current):
            """Return the style attribute value `current` with rules cleaned."""
            style = []
            for rule in (current or '').split(';'):
                rule = rule.strip()
                # tab-stops:14.2pt  text-indent:36.0pt
                if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                    rule = 'margin-top:0.4em'
                if normalize_whitespace(rule, repl='') in [
                    'font-family:Junicode',
                    'font-family:JunicodeRegular',
                ]:
                    continue
                # Empty rules and `mso-` prefixed rules are dropped.
                if rule and not rule.startswith('mso-'):
                    style.append(rule)
            return ';'.join(style)

        # clean attributes:
        for e in descendants(soup.find('body')):
            update_attr(e, 'style', update_style)
            update_attr(e, 'lang', None)

        # Drop paragraphs and headings that have no text.
        for e, t in tag_and_text(
                descendants(soup.find('body'), include=['p', 'h1', 'h2']),
                non_empty=False):
            if not t:
                e.extract()

        for p in soup.find_all('p'):
            if p.attrs.get('class') == ['Zitat']:
                p.wrap(soup.new_tag('blockquote'))
                continue

            if p.parent.name != 'td':
                # need to detect headings by text, too!
                match = self.heading_pattern.match(text(p).lower())
                if match:
                    p.name = 'h2' if match.group('sub') else 'h1'

        # re-classify section headings:
        # NOTE(review): the levels are processed in ascending order, so an
        # element renamed from h1 to h2 appears to be picked up again by the
        # h2 pass and end up as h3. The code below only looks for h3/h4, so
        # this is left unchanged -- confirm that it is intended.
        for i in range(1, 3):
            for p in soup.find_all('h%s' % i):
                p.name = 'h%s' % (i + 1,)

        for p in soup.find_all('a'):
            if p.attrs.get('name', '').startswith('OLE_LINK'):
                p.unwrap()

        top_level_elements = children(soup.find('div'))[:4]
        if '.' in self.id:
            names = [e.name for e in top_level_elements]
            # Explicit check instead of `assert` so it is not stripped when
            # running with -O.
            if names != ['p', 'p', 'table', 'h3']:
                # Print the same elements as before (index 2 is skipped), but
                # only those that exist, so that a short list does not raise
                # an IndexError that hides the real problem.
                for index in (0, 1, 3):
                    if index < len(top_level_elements):
                        print(top_level_elements[index])
                raise AssertionError('unexpected top-level elements: %r' % (names,))

            md['title'] = text(top_level_elements[0])
            md['authors'] = re.split(',|&| and ', text(top_level_elements[1]))
            remove(*top_level_elements[:3])

        refs = soup.find(lambda e: e.name == 'h3' and text(e).startswith('References'))
        if refs:
            ex = []  # elements to take out of the tree afterwards
            category = None
            for e, t in tag_and_text(next_siblings(refs)):
                if e.name == 'p':
                    if t in REFERENCE_CATEGORIES:
                        category = t
                    elif len(t.split()) < 3:
                        raise ValueError(t)
                    elif 'comment' in e.attrs.get('class', []):
                        if 'refs_comments' not in md:
                            md['refs_comments'] = []
                        md['refs_comments'].append(t)
                    else:
                        if not YEAR.search(t):
                            print(t)
                        # Initialise the list like `refs_comments` instead of
                        # relying on the caller to provide it.
                        if 'refs' not in md:
                            md['refs'] = []
                        md['refs'].append(self.get_ref(e, category=category))
                    ex.append(e)
                elif e.name in ['h3', 'h4']:
                    category = t
                    ex.append(e)
            for e in ex + [refs]:
                e.extract()

        for t in soup.find_all('table'):
            t.wrap(soup.new_tag('div', **{'class': 'table'}))

        return soup