Ejemplo n.º 1
0
def build_url_list(url_list="links.md"):
    """
    Build an HTML link list from a markdown file.

    The result is cached in the module-level ``links`` global so the file
    is only parsed once per server restart.

    :param url_list: path to the markdown file containing the links.
    :return: HTML string in which every ``<a>`` carries class="button".
    :raises ValueError: if ``url_list`` is None.
    """
    global links
    # Serve the cached HTML if we already built it.
    if links is not None:
        return links
    # Check that no one passed us garbage. Raise instead of assert so the
    # check survives `python -O`.
    if url_list is None:
        raise ValueError("url_list must not be None")

    # This is the by-file way, that is troublesome, we cant edit it easily.
    with open(url_list) as mdfh:
        md_links = mdfh.readlines()
        html_links_raw = markdown.markdown(" ".join(md_links))
        # Now that we have the links as html, we need to insert the class="button" to the a's
        bs = BeautifulSoup(html_links_raw, "html.parser")
        for link in bs.find_all("a"):
            souplib.update_attr(link, "class", "button")

    # BUG FIX: html_links was declared but never assigned, so the function
    # always returned None and the cache was never filled. Build the HTML
    # once and store it in the global cache.
    html_links = bs.prettify()
    links = html_links

    # For now, till removed, we put the links in a template file as well
    with open("templates/links.html", "w") as fh:
        fh.write(html_links)

    return html_links
Ejemplo n.º 2
0
def build_url_list(url_list="links.md"):
    """
    Build an HTML link list from a markdown file.

    The rendered HTML is memoized in the module-level ``links`` global so
    the markdown file is read at most once per server restart.

    :param url_list: path to the markdown file containing the links.
    :return: HTML string in which every ``<a>`` carries class="button".
    :raises ValueError: if ``url_list`` is None.
    """
    global links
    # Serve the cached HTML if we already built it.
    if links is not None:
        return links
    # Check that no one passed us garbage. A real exception (not assert)
    # so the validation is not stripped under `python -O`.
    if url_list is None:
        raise ValueError("url_list must not be None")

    # This is the by-file way, that is troublesome, we cant edit it easily.
    with open(url_list) as mdfh:
        md_links = mdfh.readlines()
        html_links_raw = markdown.markdown(" ".join(md_links))
        # Now that we have the links as html, we need to insert the class="button" to the a's
        bs = BeautifulSoup(html_links_raw, "html.parser")
        for link in bs.find_all("a"):
            souplib.update_attr(link, "class", "button")

    # BUG FIX: html_links was never assigned (the function always returned
    # None) and the cache was never populated. Render once, cache, reuse.
    html_links = bs.prettify()
    links = html_links

    # For now, till removed, we put the links in a template file as well
    with open("templates/links.html", "w") as fh:
        fh.write(html_links)

    return html_links
Ejemplo n.º 3
0
    def test_update_attr(self):
        """update_attr sets, removes, and recomputes a tag attribute."""
        from souplib import update_attr

        a = self.soup.a
        # A plain value is stored on the tag.
        update_attr(a, 'a', 5)
        self.assertIn('a', a.attrs)
        # A callable returning None removes the attribute.
        update_attr(a, 'a', lambda v: None)
        self.assertNotIn('a', a.attrs)
        # A callable's return value becomes the new attribute value; the
        # attribute was just removed, so v is None and the result is True.
        update_attr(a, 'a', lambda v: v is None)
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use the canonical assertEqual.
        self.assertEqual(a.attrs['a'], True)
Ejemplo n.º 4
0
    def refactor(self, soup, md):
        """
        Clean a Word-export HTML tree in place and harvest metadata.

        :param soup: BeautifulSoup tree of the document.
        :param md: dict collecting metadata (title, authors, refs,
            refs_comments); mutated in place.
        :return: the cleaned soup.
        """
        # clean attributes:
        def update_style(current):
            # Rewrite an inline style value: map two known Word layout
            # rules to a top margin, drop Junicode font-family rules and
            # any mso-* (MS Office) rules.
            style = []
            for rule in (current or '').split(';'):
                rule = rule.strip()
                # tab-stops:14.2pt  text-indent:36.0pt
                if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                    rule = 'margin-top:0.4em'
                if normalize_whitespace(rule, repl='') in [
                        'font-family:Junicode',
                        'font-family:JunicodeRegular',
                ]:
                    continue
                if rule and not rule.startswith('mso-'):
                    style.append(rule)
            return ';'.join(style)

        # Normalize style attributes and drop lang attributes everywhere.
        for e in descendants(soup.find('body')):
            update_attr(e, 'style', update_style)
            update_attr(e, 'lang', None)

        # Remove paragraphs/headings with no text content.
        for e, t in tag_and_text(descendants(soup.find('body'),
                                             include=['p', 'h1', 'h2']),
                                 non_empty=False):
            if not t:
                e.extract()

        for p in soup.find_all('p'):
            # Paragraphs classed "Zitat" (German: quotation) become
            # blockquotes.
            if p.attrs.get('class') == ['Zitat']:
                p.wrap(soup.new_tag('blockquote'))
                continue

            if not p.parent.name == 'td':
                # need to detect headings by text, too!
                t = text(p)
                match = self.heading_pattern.match(t.lower())
                if match:
                    p.name = 'h2' if match.group('sub') else 'h1'

        # re-classify section headings:
        # h1 -> h2 and h2 -> h3, shifting everything one level down.
        for i in range(1, 3):
            for p in soup.find_all('h%s' % i):
                p.name = 'h%s' % (i + 1, )

        # Unwrap Word's OLE_LINK bookmark anchors, keeping their contents.
        for p in soup.find_all('a'):
            if p.attrs.get('name', '').startswith('OLE_LINK'):
                p.unwrap()

        # NOTE(review): a dotted id apparently marks documents whose first
        # four elements are title, authors, a table and a heading — confirm
        # against the callers.
        top_level_elements = children(soup.find('div'))[:4]
        if '.' in self.id:
            try:
                assert [e.name for e in top_level_elements
                        ] == ['p', 'p', 'table', 'h3']
            except:
                # Dump the unexpected structure before re-raising.
                print top_level_elements[0]
                print top_level_elements[1]
                print top_level_elements[3]
                raise

            md['title'] = text(top_level_elements[0])
            # Author names are separated by comma, ampersand or " and ".
            md['authors'] = [
                s for s in re.split(',|&| and ', text(top_level_elements[1]))
            ]
            remove(*top_level_elements[:3])

        # Parse the references section (if any) into md['refs'].
        refs = soup.find(
            lambda e: e.name == 'h3' and text(e).startswith('References'))
        if refs:
            ex = []
            category = None
            for e, t in tag_and_text(next_siblings(refs)):
                if e.name == 'p':
                    if t in REFERENCE_CATEGORIES:
                        # A bare category label, applied to following refs.
                        category = t
                    elif len(t.split()) < 3:
                        # Too short to be a reference or a known category.
                        raise ValueError(t)
                    else:
                        if 'comment' in e.attrs.get('class', []):
                            if 'refs_comments' not in md:
                                md['refs_comments'] = [t]
                            else:
                                md['refs_comments'].append(t)
                        else:
                            # Flag references that lack a year.
                            if not YEAR.search(t):
                                print t
                            md['refs'].append(
                                self.get_ref(e, category=category))
                    ex.append(e)
                elif e.name in ['h3', 'h4']:
                    # Sub-headings inside the list switch the category.
                    category = t
                    ex.append(e)
            # Remove all processed reference nodes plus the heading itself.
            [e.extract() for e in ex + [refs]]

        # Wrap tables in <div class="table"> for styling.
        for t in soup.find_all('table'):
            t.wrap(soup.new_tag('div', **{'class': 'table'}))

        return soup
Ejemplo n.º 5
0
    def refactor(self, soup, md):
        """
        Clean a Word-export HTML tree in place and harvest metadata.

        :param soup: BeautifulSoup tree of the document.
        :param md: dict collecting metadata (title, authors, refs,
            refs_comments); mutated in place.
        :return: the cleaned soup.
        """
        # clean attributes:
        def update_style(current):
            # Rewrite an inline style value: map two known Word layout
            # rules to a top margin, drop Junicode font-family rules and
            # any mso-* (MS Office) rules.
            style = []
            for rule in (current or '').split(';'):
                rule = rule.strip()
                # tab-stops:14.2pt  text-indent:36.0pt
                if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                    rule = 'margin-top:0.4em'
                if normalize_whitespace(rule, repl='') in [
                    'font-family:Junicode',
                    'font-family:JunicodeRegular',
                ]:
                    continue
                if rule and not rule.startswith('mso-'):
                    style.append(rule)
            return ';'.join(style)

        # Normalize style attributes and drop lang attributes everywhere.
        for e in descendants(soup.find('body')):
            update_attr(e, 'style', update_style)
            update_attr(e, 'lang', None)

        # Remove paragraphs/headings with no text content.
        for e, t in tag_and_text(
                descendants(soup.find('body'), include=['p', 'h1', 'h2']),
                non_empty=False):
            if not t:
                e.extract()

        for p in soup.find_all('p'):
            # Paragraphs classed "Zitat" (German: quotation) become
            # blockquotes.
            if p.attrs.get('class') == ['Zitat']:
                p.wrap(soup.new_tag('blockquote'))
                continue

            if not p.parent.name == 'td':
                # need to detect headings by text, too!
                t = text(p)
                match = self.heading_pattern.match(t.lower())
                if match:
                    p.name = 'h2' if match.group('sub') else 'h1'

        # re-classify section headings:
        # h1 -> h2 and h2 -> h3, shifting everything one level down.
        for i in range(1, 3):
            for p in soup.find_all('h%s' % i):
                p.name = 'h%s' % (i + 1,)

        # Unwrap Word's OLE_LINK bookmark anchors, keeping their contents.
        for p in soup.find_all('a'):
            if p.attrs.get('name', '').startswith('OLE_LINK'):
                p.unwrap()

        # NOTE(review): a dotted id apparently marks documents whose first
        # four elements are title, authors, a table and a heading — confirm
        # against the callers.
        top_level_elements = children(soup.find('div'))[:4]
        if '.' in self.id:
            try:
                assert [e.name for e in top_level_elements] == ['p', 'p', 'table', 'h3']
            except:
                # Dump the unexpected structure before re-raising.
                print top_level_elements[0]
                print top_level_elements[1]
                print top_level_elements[3]
                raise

            md['title'] = text(top_level_elements[0])
            # Author names are separated by comma, ampersand or " and ".
            md['authors'] = [s for s in re.split(',|&| and ', text(top_level_elements[1]))]
            remove(*top_level_elements[:3])

        # Parse the references section (if any) into md['refs'].
        refs = soup.find(lambda e: e.name == 'h3' and text(e).startswith('References'))
        if refs:
            ex = []
            category = None
            for e, t in tag_and_text(next_siblings(refs)):
                if e.name == 'p':
                    if t in REFERENCE_CATEGORIES:
                        # A bare category label, applied to following refs.
                        category = t
                    elif len(t.split()) < 3:
                        # Too short to be a reference or a known category.
                        raise ValueError(t)
                    else:
                        if 'comment' in e.attrs.get('class', []):
                            if 'refs_comments' not in md:
                                md['refs_comments'] = [t]
                            else:
                                md['refs_comments'].append(t)
                        else:
                            # Flag references that lack a year.
                            if not YEAR.search(t):
                                print t
                            md['refs'].append(self.get_ref(e, category=category))
                    ex.append(e)
                elif e.name in ['h3', 'h4']:
                    # Sub-headings inside the list switch the category.
                    category = t
                    ex.append(e)
            # Remove all processed reference nodes plus the heading itself.
            [e.extract() for e in ex + [refs]]

        # Wrap tables in <div class="table"> for styling.
        for t in soup.find_all('table'):
            t.wrap(soup.new_tag('div', **{'class': 'table'}))

        return soup