Python next_siblings Examples, souplib.next_siblings Python Examples

Example #1

0

Show file

    def _paragraphs(self, soup):
        """Yield ``Paragraph`` objects built from the elements of *soup*.

        The method makes two passes over the document:

        1. Locate the first "value table" and replace it with the placeholder
           produced by ``new_tag(soup, 'p', 'value-table')``.  A value table
           is either a ``table`` element whose text matches one of two
           patterns, or a ``p`` of the form ``1. <label> <number>`` that is
           directly followed by at least one sibling ``p`` of the form
           ``<digit>. <label> <number>``.  Only the first hit is handled.
        2. Walk over all ``p``, ``table``, ``ol`` and ``ul`` elements that are
           not direct children of ``li``/``td`` and collect
           ``(element, text, element name)`` triples.  A text equal to
           ``self.BR`` closes the current group (as long as no references
           heading has been seen) and the group is yielded as a
           ``Paragraph``.

        :param soup: parsed document; it is modified in place (elements are \
        replaced, extracted and renamed).
        :return: generator of ``Paragraph`` objects; the last one is created \
        with ``refs=`` set to whether a references heading was encountered.
        """
        lines = []
        refs = False

        # Pass 1: replace the first value table with a placeholder.
        for e in soup.find_all(['p', 'table']):
            t = text(e)

            if e.name == 'table':
                if re.match(r'[\-\s]+excl\s+', t) \
                        or re.match(r'[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break
            elif e.name == 'p' and re.match(r'1\.\s+(.+?)\s+[0-9]+$', t):
                # Collect the run of directly following numbered paragraphs.
                ex = []
                for p in next_siblings(e):
                    tt = text(p)
                    if p.name != 'p' or not re.match(
                            r'[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                        break
                    ex.append(p)
                # A lone "1. ..." paragraph is not treated as a table.
                if ex:
                    for ee in ex:
                        ee.extract()
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

        # Pass 2: group the remaining block-level elements into paragraphs.
        for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
            if e.parent.name in ['li', 'td']:
                continue

            # Remember the break marker before ``t`` may be blanked below.
            br = t == self.BR
            if t in ['References', 'Reference']:
                refs = True
                t = ''
            elif not lines and re.match(
                    r'[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', t):
                # "<n>. Words only" at the start of a group: section heading.
                e.name = 'h3'
            elif not lines and re.match(r'[0-9]+\.[0-9]+\.\s+[A-Z]', t):
                # "<n>.<m>. Xyz" at the start of a group: subsection heading.
                e.name = 'h4'
            elif t.endswith('and the APiCS Consortium'):
                continue

            # Once the references heading was seen, breaks no longer split.
            if br and not refs and lines:
                yield Paragraph(lines)
                lines = []
            if t and not br:
                lines.append((e, t, e.name))

        if lines:
            yield Paragraph(lines, refs=refs)

Example #2

0

Show file

File: convert_texts.py Project: AnnaLuisaD/apics

    def _paragraphs(self, soup):
        """Yield ``Paragraph`` objects built from the elements of *soup*.

        The method makes two passes over the document:

        1. Locate the first "value table" and replace it with the placeholder
           produced by ``new_tag(soup, 'p', 'value-table')``.  A value table
           is either a ``table`` element whose text matches one of two
           patterns, or a ``p`` of the form ``1. <label> <number>`` that is
           directly followed by at least one sibling ``p`` of the form
           ``<digit>. <label> <number>``.  Only the first hit is handled.
        2. Walk over all ``p``, ``table``, ``ol`` and ``ul`` elements that are
           not direct children of ``li``/``td`` and collect
           ``(element, text, element name)`` triples.  A text equal to
           ``self.BR`` closes the current group (as long as no references
           heading has been seen) and the group is yielded as a
           ``Paragraph``.

        :param soup: parsed document; it is modified in place (elements are \
        replaced, extracted and renamed).
        :return: generator of ``Paragraph`` objects; the last one is created \
        with ``refs=`` set to whether a references heading was encountered.
        """
        lines = []
        refs = False

        # Pass 1: replace the first value table with a placeholder.
        for e in soup.find_all(['p', 'table']):
            t = text(e)

            if e.name == 'table':
                if re.match(r'[\-\s]+excl\s+', t) \
                        or re.match(r'[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break
            elif e.name == 'p' and re.match(r'1\.\s+(.+?)\s+[0-9]+$', t):
                # Collect the run of directly following numbered paragraphs.
                ex = []
                for p in next_siblings(e):
                    tt = text(p)
                    if p.name != 'p' or not re.match(
                            r'[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                        break
                    ex.append(p)
                # A lone "1. ..." paragraph is not treated as a table.
                if ex:
                    for ee in ex:
                        ee.extract()
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

        # Pass 2: group the remaining block-level elements into paragraphs.
        for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
            if e.parent.name in ['li', 'td']:
                continue

            # Remember the break marker before ``t`` may be blanked below.
            br = t == self.BR
            if t in ['References', 'Reference']:
                refs = True
                t = ''
            elif not lines and re.match(
                    r'[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', t):
                # "<n>. Words only" at the start of a group: section heading.
                e.name = 'h3'
            elif not lines and re.match(r'[0-9]+\.[0-9]+\.\s+[A-Z]', t):
                # "<n>.<m>. Xyz" at the start of a group: subsection heading.
                e.name = 'h4'
            elif t.endswith('and the APiCS Consortium'):
                continue

            # Once the references heading was seen, breaks no longer split.
            if br and not refs and lines:
                yield Paragraph(lines)
                lines = []
            if t and not br:
                lines.append((e, t, e.name))

        if lines:
            yield Paragraph(lines, refs=refs)

Example #3

0

Show file

    def refactor(self, soup, md):
        """Clean up the document *soup* in place and fill the metadata *md*.

        Steps, in order:

        - rewrite ``style`` and drop ``lang`` attributes below ``body``;
        - remove ``p``/``h1``/``h2`` elements without text;
        - wrap ``p`` elements of class ``Zitat`` in ``blockquote`` and turn
          paragraphs outside table cells whose text matches
          ``self.heading_pattern`` into headings;
        - re-classify headings and unwrap ``OLE_LINK`` anchors;
        - if ``self.id`` contains a dot, read title and authors from the
          first top-level elements and remove them;
        - move the reference list after the ``References`` heading into
          ``md['refs']`` / ``md['refs_comments']``;
        - wrap every ``table`` in ``<div class="table">``.

        :param soup: parsed document, modified in place.
        :param md: mapping receiving ``title``, ``authors``, \
        ``refs_comments``; ``md['refs']`` must already hold a list.
        :return: the same *soup* object.
        :raises AssertionError: if ``self.id`` contains a dot and the first \
        four top-level elements are not ``p, p, table, h3``.
        :raises ValueError: if a paragraph in the reference section has fewer \
        than three words and is not a known reference category.
        """
        def update_style(current):
            """Return *current* with unwanted CSS rules removed or replaced."""
            style = []
            for rule in (current or '').split(';'):
                rule = rule.strip()
                # tab-stops:14.2pt  text-indent:36.0pt
                if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                    rule = 'margin-top:0.4em'
                if normalize_whitespace(rule, repl='') in [
                        'font-family:Junicode',
                        'font-family:JunicodeRegular',
                ]:
                    continue
                # Drop empty rules and MS Office specific "mso-" rules.
                if rule and not rule.startswith('mso-'):
                    style.append(rule)
            return ';'.join(style)

        # clean attributes:
        for e in descendants(soup.find('body')):
            update_attr(e, 'style', update_style)
            update_attr(e, 'lang', None)

        # remove paragraphs and headings without text:
        for e, t in tag_and_text(descendants(soup.find('body'),
                                             include=['p', 'h1', 'h2']),
                                 non_empty=False):
            if not t:
                e.extract()

        for p in soup.find_all('p'):
            if p.attrs.get('class') == ['Zitat']:
                p.wrap(soup.new_tag('blockquote'))
                continue

            if p.parent.name != 'td':
                # need to detect headings by text, too!
                match = self.heading_pattern.match(text(p).lower())
                if match:
                    p.name = 'h2' if match.group('sub') else 'h1'

        # re-classify section headings:
        # NOTE(review): the h1 pass runs first, so former h1 elements are
        # renamed again by the h2 pass and both levels end up as h3.  The
        # code below relies on 'h3', so this is left untouched - confirm
        # whether sub-headings were meant to become h4.
        for i in range(1, 3):
            for p in soup.find_all('h%s' % i):
                p.name = 'h%s' % (i + 1, )

        for p in soup.find_all('a'):
            if p.attrs.get('name', '').startswith('OLE_LINK'):
                p.unwrap()

        top_level_elements = children(soup.find('div'))[:4]
        if '.' in self.id:
            names = [e.name for e in top_level_elements]
            if names != ['p', 'p', 'table', 'h3']:
                # Explicit check instead of ``assert`` so that it also runs
                # under ``python -O``.  Print the same elements as before
                # (the table is skipped), but only those that exist, so the
                # diagnostics cannot hide the error behind an IndexError.
                for i in (0, 1, 3):
                    if i < len(top_level_elements):
                        print(top_level_elements[i])
                raise AssertionError(
                    'unexpected top-level elements: %r' % (names, ))

            md['title'] = text(top_level_elements[0])
            # NOTE(review): the name parts are not stripped here; presumably
            # whitespace is handled by the consumer - verify.
            md['authors'] = re.split(
                ',|&| and ', text(top_level_elements[1]))
            remove(*top_level_elements[:3])

        refs = soup.find(
            lambda e: e.name == 'h3' and text(e).startswith('References'))
        if refs:
            ex = []
            category = None
            for e, t in tag_and_text(next_siblings(refs)):
                if e.name == 'p':
                    if t in REFERENCE_CATEGORIES:
                        category = t
                    elif len(t.split()) < 3:
                        raise ValueError(t)
                    elif 'comment' in e.attrs.get('class', []):
                        if 'refs_comments' not in md:
                            md['refs_comments'] = [t]
                        else:
                            md['refs_comments'].append(t)
                    else:
                        # Report references without a recognisable year,
                        # but still record them.
                        if not YEAR.search(t):
                            print(t)
                        md['refs'].append(
                            self.get_ref(e, category=category))
                    ex.append(e)
                elif e.name in ['h3', 'h4']:
                    # Headings inside the reference section name a category.
                    category = t
                    ex.append(e)
            # Remove the processed reference section, heading included.
            for e in ex + [refs]:
                e.extract()

        for table in soup.find_all('table'):
            table.wrap(soup.new_tag('div', **{'class': 'table'}))

        return soup

Example #4

0

Show file

File: test_souplib.py Project: pombredanne/souplib

    def test_next_siblings(self):
        # ``next_siblings`` is expected to return an empty list for the
        # ``a`` element of the fixture document.
        from souplib import next_siblings

        # ``assertEquals`` is a deprecated alias that was removed in
        # Python 3.12; ``assertEqual`` is available on Python 2.7 and 3.
        self.assertEqual(next_siblings(self.soup.a), [])

Example #5

0

Show file

File: convert_texts.py Project: AnnaLuisaD/apics

    def refactor(self, soup, md):
        """Clean up the document *soup* in place and fill the metadata *md*.

        Steps, in order:

        - rewrite ``style`` and drop ``lang`` attributes below ``body``;
        - remove ``p``/``h1``/``h2`` elements without text;
        - wrap ``p`` elements of class ``Zitat`` in ``blockquote`` and turn
          paragraphs outside table cells whose text matches
          ``self.heading_pattern`` into headings;
        - re-classify headings and unwrap ``OLE_LINK`` anchors;
        - if ``self.id`` contains a dot, read title and authors from the
          first top-level elements and remove them;
        - move the reference list after the ``References`` heading into
          ``md['refs']`` / ``md['refs_comments']``;
        - wrap every ``table`` in ``<div class="table">``.

        :param soup: parsed document, modified in place.
        :param md: mapping receiving ``title``, ``authors``, \
        ``refs_comments``; ``md['refs']`` must already hold a list.
        :return: the same *soup* object.
        :raises AssertionError: if ``self.id`` contains a dot and the first \
        four top-level elements are not ``p, p, table, h3``.
        :raises ValueError: if a paragraph in the reference section has fewer \
        than three words and is not a known reference category.
        """
        def update_style(current):
            """Return *current* with unwanted CSS rules removed or replaced."""
            style = []
            for rule in (current or '').split(';'):
                rule = rule.strip()
                # tab-stops:14.2pt  text-indent:36.0pt
                if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                    rule = 'margin-top:0.4em'
                if normalize_whitespace(rule, repl='') in [
                    'font-family:Junicode',
                    'font-family:JunicodeRegular',
                ]:
                    continue
                # Drop empty rules and MS Office specific "mso-" rules.
                if rule and not rule.startswith('mso-'):
                    style.append(rule)
            return ';'.join(style)

        # clean attributes:
        for e in descendants(soup.find('body')):
            update_attr(e, 'style', update_style)
            update_attr(e, 'lang', None)

        # remove paragraphs and headings without text:
        for e, t in tag_and_text(
                descendants(soup.find('body'), include=['p', 'h1', 'h2']),
                non_empty=False):
            if not t:
                e.extract()

        for p in soup.find_all('p'):
            if p.attrs.get('class') == ['Zitat']:
                p.wrap(soup.new_tag('blockquote'))
                continue

            if p.parent.name != 'td':
                # need to detect headings by text, too!
                match = self.heading_pattern.match(text(p).lower())
                if match:
                    p.name = 'h2' if match.group('sub') else 'h1'

        # re-classify section headings:
        # NOTE(review): the h1 pass runs first, so former h1 elements are
        # renamed again by the h2 pass and both levels end up as h3.  The
        # code below relies on 'h3', so this is left untouched - confirm
        # whether sub-headings were meant to become h4.
        for i in range(1, 3):
            for p in soup.find_all('h%s' % i):
                p.name = 'h%s' % (i + 1,)

        for p in soup.find_all('a'):
            if p.attrs.get('name', '').startswith('OLE_LINK'):
                p.unwrap()

        top_level_elements = children(soup.find('div'))[:4]
        if '.' in self.id:
            names = [e.name for e in top_level_elements]
            if names != ['p', 'p', 'table', 'h3']:
                # Explicit check instead of ``assert`` so that it also runs
                # under ``python -O``.  Print the same elements as before
                # (the table is skipped), but only those that exist, so the
                # diagnostics cannot hide the error behind an IndexError.
                for i in (0, 1, 3):
                    if i < len(top_level_elements):
                        print(top_level_elements[i])
                raise AssertionError(
                    'unexpected top-level elements: %r' % (names,))

            md['title'] = text(top_level_elements[0])
            # NOTE(review): the name parts are not stripped here; presumably
            # whitespace is handled by the consumer - verify.
            md['authors'] = re.split(',|&| and ', text(top_level_elements[1]))
            remove(*top_level_elements[:3])

        refs = soup.find(lambda e: e.name == 'h3' and text(e).startswith('References'))
        if refs:
            ex = []
            category = None
            for e, t in tag_and_text(next_siblings(refs)):
                if e.name == 'p':
                    if t in REFERENCE_CATEGORIES:
                        category = t
                    elif len(t.split()) < 3:
                        raise ValueError(t)
                    elif 'comment' in e.attrs.get('class', []):
                        if 'refs_comments' not in md:
                            md['refs_comments'] = [t]
                        else:
                            md['refs_comments'].append(t)
                    else:
                        # Report references without a recognisable year,
                        # but still record them.
                        if not YEAR.search(t):
                            print(t)
                        md['refs'].append(self.get_ref(e, category=category))
                    ex.append(e)
                elif e.name in ['h3', 'h4']:
                    # Headings inside the reference section name a category.
                    category = t
                    ex.append(e)
            # Remove the processed reference section, heading included.
            for e in ex + [refs]:
                e.extract()

        for table in soup.find_all('table'):
            table.wrap(soup.new_tag('div', **{'class': 'table'}))

        return soup