Example #1
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.cssfilter import legacy_s3_url
    
    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and c.site.images.has_key(name):
            url = c.site.images[name]
            url = legacy_s3_url(url, c.site)
            tag['src'] = url
        else:
            tag.extract()
    
    nofollow = True
    
    text = snudown.markdown(_force_utf8(text), nofollow, target, g.domain,
                            renderer=snudown.RENDERER_WIKI)
    
    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')
    
    if images:
        [img_swap(image) for image in images]
    
    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)
    
    text = str(soup)
    
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
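The helper above leans on reddit's r2 application globals (c, g, _force_utf8, SC_OFF and friends), so it is not runnable on its own. As a point of reference only, a minimal standalone call to the same snudown entry point could look like the sketch below; the renderer keyword mirrors the call above, and the decode step is hedged because older snudown builds return UTF-8 byte strings.

import snudown

# Minimal sketch (an assumption, not reddit's code): render wiki-flavoured
# markdown the way the wikimarkdown() helpers above do, minus the image
# swapping, table of contents and wrapper markers.
source = u"# Heading\n\nSome *wiki* text with a [link](/r/example)."
html = snudown.markdown(source.encode('utf-8'),
                        renderer=snudown.RENDERER_WIKI)
if isinstance(html, bytes):   # older builds return UTF-8 bytes
    html = html.decode('utf-8')
print(html)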
Example #2
def wikimarkdown(text):
    from r2.lib.cssfilter import legacy_s3_url
    
    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and c.site.images.has_key(name):
            url = c.site.images[name]
            url = legacy_s3_url(url, c.site)
            tag['src'] = url
        else:
            tag.extract()
    
    nofollow = True
    target = None
    
    text = snudown.markdown(_force_utf8(text), nofollow, target,
                            renderer=snudown.RENDERER_WIKI, enable_toc=True)
    
    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text)
    images = soup.findAll('img')
    
    if images:
        [img_swap(image) for image in images]
        text = str(soup)
    
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
Example #3
def safemarkdown(text, nofollow=False, target=None, lang=None, wrap=True):
    from r2.lib.c_markdown import c_markdown
    from r2.lib.py_markdown import py_markdown

    if c.user.pref_no_profanity:
        text = profanity_filter(text)

    if not text:
        return None

    if c.cname and not target:
        target = "_top"

    if lang is None:
        lang = g.markdown_backend

    if lang == "snudown":
        text = snudown.markdown(_force_utf8(text), nofollow, target)
    elif lang == "c":
        text = c_markdown(text, nofollow, target)
    elif lang == "py":
        text = py_markdown(text, nofollow, target)
    else:
        raise ValueError("weird lang [%s]" % lang)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return text
Example #4
def normalize_markdown_text(parser, source):
    rendered  = markdown(unicode(source).encode('utf-8'))
    html_body = ' '.join(rendered.splitlines())
    soup      = BeautifulSoup(html_body)
    text      = ' '.join(soup.findAll(text=True))
    text      = parser.unescape(text)
    return unicode(' '.join(text.splitlines()).replace(',', ' ')).encode('utf-8')
Example #5
def safemarkdown(text, nofollow=False, wrap=True, **kwargs):
    from r2.lib.utils import generate_affiliate_link, domain
    if not text:
        return None

    target = kwargs.get("target", None)
    text = snudown.markdown(_force_utf8(text), nofollow, target)
    to_affiliate = kwargs.get("affiliate", False)
    if to_affiliate:
        soup = BeautifulSoup(text.decode('utf-8'))
        links = soup.findAll('a')
        update_text = False

        def detect_affiliate(markdown_link):
            return domain(markdown_link.get('href'))\
                    in g.merchant_affiliate_domains

        for link in filter(detect_affiliate, links):
            update_text = True
            link['class'] = 'affiliate'
            link['data-href-url'] = link.get('href')
            link['data-affiliate-url'] = generate_affiliate_link(
                                            link.get('href')
                                         )

        if update_text:
            text = str(soup)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
Example #6
def hello():
    messages = rds.zrevrangebyscore('goygoy', '+inf', '-inf')
    msgs = []
    for i in messages:
        msg = json.loads(i)
        msgs.append(dict(
            msg = _force_unicode(snudown.markdown(_force_utf8(msg['msg']))),
            username='******'
        ))
    return render_template('index.html', messages=msgs)
Example #7
def strip_markdown(text):
    """Extract text from a markdown string.
    """
    html = markdown(text.encode('utf-8'))
    soup = BeautifulSoup(
        html,
        "html.parser",
        from_encoding='utf8'
        )
    return "".join(soup.findAll(text=True))
Example #8
def extract_urls_from_markdown(md):
    "Extract URLs that will be hot links from a piece of raw Markdown."

    html = snudown.markdown(_force_utf8(md))
    links = SoupStrainer("a")

    for link in BeautifulSoup(html, parseOnlyThese=links):
        url = link.get('href')
        if url:
            yield url
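For reference, a self-contained variant of the same URL extraction, with the r2 helper _force_utf8 dropped and the BeautifulSoup 4 spelling of the strainer argument (parse_only rather than the BS3 parseOnlyThese used above). Treat the details as an assumption rather than the original code.

import snudown
from bs4 import BeautifulSoup, SoupStrainer

def extract_urls(md):
    # Yield the href of every anchor snudown emits for the given markdown.
    html = snudown.markdown(md.encode('utf-8'))
    only_links = SoupStrainer('a')
    for link in BeautifulSoup(html, 'html.parser', parse_only=only_links):
        url = link.get('href')
        if url:
            yield url

# Prints each href snudown produced: the markdown link target and,
# assuming a standard snudown build, the autolinked subreddit path.
for url in extract_urls(u"see [reddit](https://www.reddit.com/) and /r/pics"):
    print(url)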
Example #9
def safemarkdown(text, nofollow=False, wrap=True, **kwargs):
    if not text:
        return None

    target = kwargs.get("target", None)
    text = snudown.markdown(_force_utf8(text), nofollow, target)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
Example #10
    def runTest(self):
        output = snudown.markdown(self.input)

        for i, (a, b) in enumerate(zip(repr(self.expected_output), repr(output))):
            if a != b:
                io = StringIO.StringIO()
                print >> io, "TEST FAILED:"
                print >> io, "       input: %s" % repr(self.input)
                print >> io, "    expected: %s" % repr(self.expected_output)
                print >> io, "      actual: %s" % repr(output)
                print >> io, "              %s" % (" " * i + "^")
                self.fail(io.getvalue())
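The runTest above clearly belongs to a unittest.TestCase subclass carrying input and expected_output attributes, but the class body is not part of the excerpt. A hypothetical harness in that shape could look like the sketch below; the class name and the sample expectation are assumptions, and a plain assertEqual stands in for the character-by-character diff printed above.

import unittest
import snudown

class SnudownCase(unittest.TestCase):
    # Hypothetical shape of the test case the excerpt belongs to: each
    # instance renders one markdown input and compares it to expected HTML.
    def __init__(self, input, expected_output):
        unittest.TestCase.__init__(self)
        self.input = input
        self.expected_output = expected_output

    def runTest(self):
        self.assertEqual(self.expected_output, snudown.markdown(self.input))

suite = unittest.TestSuite()
suite.addTest(SnudownCase('*hello*', '<p><em>hello</em></p>\n'))
unittest.TextTestRunner().run(suite)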
Example #11
def safemarkdown(text, nofollow=False, target=None, wrap=True):
    if not text:
        return None

    if c.cname and not target:
        target = "_top"

    text = snudown.markdown(_force_utf8(text), nofollow, target)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return text
Example #12
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.template_helpers import make_url_protocol_relative

    # this hard codes the stylesheet page for now, but should be parameterized
    # in the future to allow per-page images.
    from r2.models.wiki import ImagesByWikiPage
    from r2.lib.utils import UrlParser
    from r2.lib.template_helpers import add_sr
    page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet")
    
    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and name in page_images:
            url = page_images[name]
            url = make_url_protocol_relative(url)
            tag['src'] = url
        else:
            tag.extract()
    
    nofollow = True
    
    text = snudown.markdown(_force_utf8(text), nofollow, target,
                            renderer=snudown.RENDERER_WIKI)
    
    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')
    
    if images:
        [img_swap(image) for image in images]

    def add_ext_to_link(link):
        url = UrlParser(link.get('href'))
        if url.is_reddit_url():
            link['href'] = add_sr(link.get('href'), sr_path=False)

    if c.render_style == 'compact':
        links = soup.findAll('a')
        [add_ext_to_link(a) for a in links]

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)
    
    text = str(soup)
    
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
Example #13
def safemarkdown(text, nofollow=False, wrap=True, **kwargs):
    if not text:
        return None

    # this lets us skip the c.cname lookup (which is apparently quite
    # slow) if target was explicitly passed to this function.
    target = kwargs.get("target", None)
    if "target" not in kwargs and c.cname:
        target = "_top"

    text = snudown.markdown(_force_utf8(text), nofollow, target)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
Example #14
    def runTest(self):
        output = snudown.markdown(self.input)

        for i, (a, b) in enumerate(zip(repr(self.expected_output),
                                       repr(output))):
            if a != b:
                try:
                    io = StringIO.StringIO()
                except:
                    io = StringIO()
                print("TEST FAILED:", file=io)
                print("       input: %s" % repr(self.input), file=io)
                print("    expected: %s" % repr(self.expected_output), file=io)
                print("      actual: %s" % repr(output), file=io)
                print("              %s" % (' ' * i + '^'), file=io)
                self.fail(io.getvalue())
Example #15
def safemarkdown(text, nofollow=False, wrap=True, **kwargs):
    if not text:
        return None

    # this lets us skip the c.cname lookup (which is apparently quite
    # slow) if target was explicitly passed to this function.
    target = kwargs.get("target", None)
    if "target" not in kwargs and c.cname:
        target = "_top"

    text = snudown.markdown(_force_utf8(text), nofollow, target)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
Example #16
    def process_self(self, submission):
        html = snudown.markdown(submission.selftext.encode('UTF-8'))
        soup = BeautifulSoup(html)
        refs = {}

        # Iterate through all links, get xkcd json
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            j = self.xkcd_fetcher.get_json(href)
            if not j:
                logger.warn(
                    'Data could not be fetched for {url}'.format(url=href))
                continue
            refs[int(j.get('num', -1))] = {'data': j, 'href': href}

        return self.process_references(submission, refs)
Example #17
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.cssfilter import legacy_s3_url
    
    nofollow = True
    
    text = snudown.markdown(_force_utf8(text), nofollow, target, g.domain )
    
    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)
    
    text = str(soup)
    
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
Example #18
    def process_self(self, submission):
        html = snudown.markdown(submission.selftext.encode('UTF-8'))
        soup = BeautifulSoup(html)
        refs = {}

        # Iterate through all links, get xkcd json
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            j = self.xkcd_fetcher.get_json(href)
            if not j:
                logger.warn('Data could not be fetched for {url}'.format(url=href))
                continue
            refs[int(j.get('num', -1))] = {
                'data': j,
                'href': href
            }

        return self.process_references(submission, refs)
Example #19
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.template_helpers import s3_https_if_secure

    # this hard codes the stylesheet page for now, but should be parameterized
    # in the future to allow per-page images.
    from r2.models.wiki import ImagesByWikiPage
    page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet")

    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and name in page_images:
            url = page_images[name]
            url = s3_https_if_secure(url)
            tag['src'] = url
        else:
            tag.extract()

    nofollow = True

    text = snudown.markdown(_force_utf8(text),
                            nofollow,
                            target,
                            renderer=snudown.RENDERER_WIKI)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')

    if images:
        [img_swap(image) for image in images]

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)

    text = str(soup)

    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
Example #20
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.template_helpers import media_https_if_secure

    # this hard codes the stylesheet page for now, but should be parameterized
    # in the future to allow per-page images.
    from r2.models.wiki import ImagesByWikiPage
    page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet")
    
    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and name in page_images:
            url = page_images[name]
            url = media_https_if_secure(url)
            tag['src'] = url
        else:
            tag.extract()
    
    nofollow = True
    
    text = snudown.markdown(_force_utf8(text), nofollow, target,
                            renderer=snudown.RENDERER_WIKI)
    
    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')
    
    if images:
        [img_swap(image) for image in images]
    
    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)
    
    text = str(soup)
    
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
Example #21
def parseComment(redditComment, postAuthorName, postAuthorExists, isRoot=True):
    commentAuthorName = ''
    commentAuthorExists = 0
    try:
        commentAuthorName = fixUnicode(redditComment.author.name)
        commentAuthorExists = 1
    except AttributeError:
        commentAuthorExists = 0
    if isRoot:
        htmlFile.write('<div id="' + str(redditComment.id))
        htmlFile.write('" class="comment">\n')
    else:
        htmlFile.write('<div id="' + str(redditComment.id))
        htmlFile.write(
            '" class="comment" style="margin-bottom:10px;margin-left:0px;">\n')
    htmlFile.write('<div class="commentinfo">\n')
    if commentAuthorExists:
        if postAuthorExists and postAuthorName == commentAuthorName:
            htmlFile.write('<a href="' + redditComment.author._url)
            htmlFile.write('" class="postOP-comment">' + commentAuthorName +
                           '</a> <em>')
        else:
            htmlFile.write('<a href="' + redditComment.author._url)
            htmlFile.write('">' + commentAuthorName + '</a> <em>')
    else:
        htmlFile.write('<strong>[Deleted]</strong> <em>')
    htmlFile.write(str(redditComment.ups - redditComment.downs))
    htmlFile.write(' Points </em><em>')
    htmlFile.write('Posted at ')
    postDate = time.gmtime(redditComment.created_utc)
    htmlFile.write(str(postDate.tm_hour) + ':')
    htmlFile.write(str(postDate.tm_min) + ' UTC on ')
    htmlFile.write(monthsList[postDate.tm_mon - 1] + ' ')
    htmlFile.write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
    htmlFile.write('</em></div>\n')
    htmlFile.write(snudown.markdown(fixMarkdown(redditComment.body)))
    for reply in redditComment._replies:
        parseComment(reply, postAuthorName, postAuthorExists, False)
    htmlFile.write('</div>\n')
Example #22
def parseComment(redditComment, postAuthorName, postAuthorExists, isRoot=True):
    commentAuthorName = ''
    commentAuthorExists = 0
    try:
        commentAuthorName = fixUnicode(redditComment.author.name)
        commentAuthorExists = 1
    except AttributeError:
        commentAuthorExists = 0
    if isRoot:
        htmlFile.write('<div id="' + str(redditComment.id))
        htmlFile.write('" class="comment">\n')
    else:
        htmlFile.write('<div id="' + str(redditComment.id)) 
        htmlFile.write('" class="comment" style="margin-bottom:10px;margin-left:0px;">\n')
    htmlFile.write('<div class="commentinfo">\n')
    if commentAuthorExists:
        if postAuthorExists and postAuthorName == commentAuthorName:
            htmlFile.write('<a href="' + redditComment.author._url)
            htmlFile.write('" class="postOP-comment">' + commentAuthorName + '</a> <em>')
        else:
            htmlFile.write('<a href="' + redditComment.author._url)
            htmlFile.write('">' + commentAuthorName + '</a> <em>')
    else:
        htmlFile.write('<strong>[Deleted]</strong> <em>')
    htmlFile.write(str(redditComment.ups - redditComment.downs))
    htmlFile.write(' Points </em><em>')
    htmlFile.write('Posted at ')
    postDate = time.gmtime(redditComment.created_utc)
    htmlFile.write(str(postDate.tm_hour) + ':')
    htmlFile.write(str(postDate.tm_min) + ' UTC on ')
    htmlFile.write(monthsList[postDate.tm_mon-1] + ' ')
    htmlFile.write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
    htmlFile.write('</em></div>\n')
    htmlFile.write(snudown.markdown(fixMarkdown(redditComment.body)))
    for reply in redditComment._replies:
        parseComment(reply, postAuthorName, postAuthorExists, False)
    htmlFile.write('</div>\n')
Example #23
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.cssfilter import legacy_s3_url

    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and c.site.images.has_key(name):
            url = c.site.images[name]
            url = legacy_s3_url(url, c.site)
            tag['src'] = url
        else:
            tag.extract()

    nofollow = True

    text = snudown.markdown(_force_utf8(text),
                            nofollow,
                            target,
                            renderer=snudown.RENDERER_WIKI)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')

    if images:
        [img_swap(image) for image in images]

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)

    text = str(soup)

    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
Example #24
def emailmarkdown(text, wrap=True):
    if not text:
        return None

    text = snudown.markdown(_force_utf8(text))

    soup = BeautifulSoup(text.decode('utf-8'))
    links = soup.findAll('a')
    update_text = False
    base = g.https_endpoint or g.origin

    for link in links:
        # if link is relative
        if link['href'].startswith('/'):
            update_text = True
            link['href'] = urljoin(base, link['href'])

    if update_text:
        text = str(soup)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
Example #25
def emailmarkdown(text, wrap=True):
    if not text:
        return None

    text = snudown.markdown(_force_utf8(text))

    soup = BeautifulSoup(text.decode('utf-8'))
    links = soup.findAll('a')
    update_text = False
    base = g.https_endpoint or g.origin

    for link in links:
        # if link is relative
        if link['href'].startswith('/'):
            update_text = True
            link['href'] = urljoin(base, link['href'])

    if update_text:
        text = str(soup)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
Example #26
def renderwith(renderer, body):
    body_utf8 = _force_utf8(body)
    if renderer is snudown:
        return snudown.markdown(body_utf8)
    nodesend(renderer, body_utf8)
    return nodereceive(renderer)
Example #27
def parsePost(postObject):
    writeHeader(fixUnicode(postObject.title))
    postObject.replace_more_comments()
    postAuthorName = ''
    postAuthorExists = 0
    try:
        postAuthorName = fixUnicode(postObject.author.name)
        postAuthorExists = 1
    except AttributeError:
        postAuthorExists = 0
    htmlFile.write('<div class="title">\n')
    if postObject.is_self:
        # The post is a self post
        htmlFile.write(fixUnicode(postObject.title))
        htmlFile.write('\n<br/><strong>')
    else:
        # The post is a link post
        htmlFile.write('<a id="postlink" href="' + fixUnicode(postObject.url))
        htmlFile.write('">')
        htmlFile.write(fixUnicode(postObject.title))
        htmlFile.write('</a>\n<br/><strong>')
    if postAuthorExists:
        htmlFile.write('Posted by <a id="userlink" href="' + fixUnicode(postObject.author._url))
        htmlFile.write('">')
        htmlFile.write(postAuthorName)
        htmlFile.write('</a>. </strong><em>')
    else:
        htmlFile.write('Posted by [Deleted]. </strong><em>')
    htmlFile.write('Posted at ')
    postDate = time.gmtime(postObject.created_utc)
    htmlFile.write(str(postDate.tm_hour) + ':')
    htmlFile.write(str(postDate.tm_min) + ' UTC on ')
    htmlFile.write(monthsList[postDate.tm_mon-1] + ' ')
    htmlFile.write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
    htmlFile.write('. ' + str(postObject.ups - postObject.downs))
    if postObject.is_self:
        htmlFile.write(' Points. </em><em>(self.<a id="selfLink" href="')
    else:
        htmlFile.write(' Points. </em><em>(<a id="selfLink" href="')
    htmlFile.write(postObject.subreddit._url)
    htmlFile.write('">' + postObject.subreddit.display_name)
    if postObject.is_self:
        htmlFile.write('</a>)</em><em>')
    else:
        htmlFile.write('</a> Subreddit)</em><em>')
    htmlFile.write(' (<a id="postpermalink" href="')
    htmlFile.write(fixUnicode(postObject.permalink))
    htmlFile.write('">Permalink</a>)</em>\n')
    if postObject.is_self:
        htmlFile.write('<div class="post">\n')
        htmlFile.write(snudown.markdown(fixMarkdown(postObject.selftext)))
        htmlFile.write('</div>\n')
    else:
        htmlFile.write('<div class="post">\n<p>\n')
        htmlFile.write(postObject.url)
        htmlFile.write('</p>\n</div>\n')
    htmlFile.write('</div>\n')
    for comment in postObject._comments:
        parseComment(comment, postAuthorName, postAuthorExists)
    htmlFile.write('<hr id="footerhr">\n')
    htmlFile.write('<div id="footer"><em>Archived on ')
    htmlFile.write(str(datetime.datetime.utcnow()))
    htmlFile.write(' UTC</em></div>')
    htmlFile.write('\n\n</body>\n</html>\n')
Example #28
def write_link_page(subreddits,
                    link,
                    subreddit='',
                    hide_deleted_comments=False):
    # reddit:  https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if os.path.isfile(filepath):
        return True

    created = datetime.utcfromtimestamp(int(link['created_utc']))
    sorted_comments = []
    if len(link['comments']) > 0:
        sorted_comments = sort_comments(link['comments'],
                                        hide_deleted_comments)

    # traverse up to root dir, depends on id length
    static_include_path = ''
    for i in range(len(link['id']) + 2):
        static_include_path += '../'

    image = None
    if not args.noimages:
        i = is_imgur(link['url'])
        # if we have an imgur client id and the url in the loop is an imgur link then get the URL
        if i[0]:
            # Extract url from json and download the image itself
            imu = get_imgur_image_link(link['url'])
            if imu is not None:
                image = retrieve_media(imu)
        elif i[1]:
            # TODO: Implement Imgur albums support
            pass
        else:
            image = retrieve_media(link['url'])
        # Finally, if the image is downloaded then generate a path and attach it to the url entry in the link dict
        # so when it's used as an href link it will point to the path instead of the url itself
        # URL + /images/ + ID + . Image Extension
        if image is not None:
            link['url'] = subreddit + "/images/" + link['id'] + "." + image[0]

    # render comments
    comments_html = ''
    for c in sorted_comments:
        css_classes = 'ml-' + (str(c['depth']) if int(c['depth']) <=
                               max_comment_depth else str(max_comment_depth))
        if c['author'] == link['author'] and c[
                'author'] not in removed_content_identifiers:
            css_classes += ' op'
        if c['stickied'].lower() == 'true' or c['stickied'] is True:
            css_classes += ' stickied'

        # author link
        url = static_include_path + 'user/' + c['author'] + '.html'
        author_link_html = template_user_url.replace(
            '###URL_AUTHOR###', url).replace('###AUTHOR###', c['author'])

        comment_data_map = {
            '###ID###':
            c['id'],
            '###PARENT_ID###':
            c['parent_id'],
            '###DEPTH###':
            str(c['depth']),
            '###DATE###':
            created.strftime('%Y-%m-%d'),
            '###SCORE###':
            str(c['score'])
            if len(str(c['score'])) > 0 else missing_comment_score_label,
            '###BODY###':
            snudown.markdown(c['body'].replace('&gt;', '>')),
            '###CSS_CLASSES###':
            css_classes,
            '###CLASS_SCORE###':
            'badge-danger' if len(c['score']) > 0 and int(c['score']) < 1 else
            'badge-secondary',
            '###HTML_AUTHOR_URL###':
            author_link_html,
        }
        comment_html = template_comment
        for key, value in comment_data_map.items():
            comment_html = comment_html.replace(key, value)
        comments_html += comment_html + '\n'

    # render subreddits list
    subs_menu_html = ''
    for sub in subreddits:
        sub_url = static_include_path + sub + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###',
                                                    sub_url).replace(
                                                        '###SUB###', sub)

    # render selftext
    selftext_html = ''
    if len(link['selftext']) > 0:
        selftext_html = template_selftext.replace(
            '###SELFTEXT###',
            snudown.markdown(link['selftext'].replace('&gt;', '>')))

    # author link
    url = static_include_path + 'user/' + link['author'] + '.html'
    author_link_html = template_user_url.replace(
        '###URL_AUTHOR###', url).replace('###AUTHOR###', link['author'])

    #html_title = template_url.replace('#HREF#', link['url']).replace('#INNER_HTML#', link['title'])
    if image is None:
        html_title = template_url.replace('#HREF#', link['url']).replace(
            '#INNER_HTML#', link['title'])
    else:
        html_title = template_url.replace(
            '#HREF#',
            static_include_path + link['url']).replace('#INNER_HTML#',
                                                       link['title'])
    if link['is_self'] is True or link['is_self'].lower() == 'true':
        html_title = link['title']

    # render link page
    link_data_map = {
        '###INCLUDE_PATH###':
        static_include_path,
        '###SUB###':
        subreddit,
        '###TITLE###':
        link['title'],
        '###ID###':
        link['id'],
        '###DATE###':
        created.strftime('%Y-%m-%d'),
        '###ARCHIVE_DATE###':
        datetime.utcfromtimestamp(int(
            link['retrieved_on'])).strftime('%Y-%m-%d')
        if link['retrieved_on'] != '' else 'n/a',
        '###SCORE###':
        str(link['score']),
        '###NUM_COMMENTS###':
        str(link['num_comments']),
        '###URL_PROJECT###':
        url_project,
        '###URL_SUBS###':
        static_include_path + 'index.html',
        '###URL_SUB###':
        static_include_path + subreddit + '/index.html',
        '###URL_SUB_CMNT###':
        static_include_path + subreddit + '/index-' +
        sort_indexes['num_comments']['slug'] + '/index.html',
        '###URL_SUB_DATE###':
        static_include_path + subreddit + '/index-' +
        sort_indexes['created_utc']['slug'] + '/index.html',
        '###URL_SEARCH###':
        static_include_path + subreddit + '/search.html',
        '###HTML_SUBS_MENU###':
        subs_menu_html,
        '###HTML_SELFTEXT###':
        selftext_html,
        '###HTML_COMMENTS###':
        comments_html,
        '###HTML_AUTHOR_URL###':
        author_link_html,
        '###HTML_TITLE###':
        html_title,
    }
    html = template_link
    for key, value in link_data_map.items():
        html = html.replace(key, value)

    # write html
    # reddit:  https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(html)
        # print('wrote %s %s' % (created.strftime('%Y-%m-%d'), filepath))

    if image is not None:
        if not os.path.isfile(link['url']):
            os.makedirs("r/" + subreddit + "/images/", exist_ok=True)
            open("r/" + link['url'], 'wb').write(image[1])
            print("Writing media: %s " % link['url'])
        # Add a '../' because we will reuse the file location for the index file
        link['url'] = "../" + link['url']
    return True
Example #29
        return self.tag_PRE()

    def tag_IMG(self):
        src = self.e.get("src", "")
        title = self.e.get("title", "")
        alt = self.e.get("alt")
        alt = ' "%s"' % alt if alt else ""
        return "![%s](%s%s)" % (title, src, alt)

    def tag_DEL(self):
        return "~~%s~~" % self.default()

    def tag_P(self):
        return "%s\n\n" % self.default()

    def tag_BR(self):
        return "  \n"

    def tag_A(self):
        return "[%s](%s)" % (self.default(), self.e.get("href", ""))


if __name__ == "__main__":
    import snudown

    template = "<textarea>%s</textarea><hr/>%s"
    original = unicode(open("input3.html", "r").read(), "utf-8")
    markdowned = MarkDowner(BeautifulSoup(original)).content.encode("ascii", "xmlcharrefreplace")
    final = snudown.markdown(markdowned, renderer=snudown.RENDERER_WIKI)
    open("output.html", "w").write(template % (markdowned, final))
Example #30
def parsePost(postObject):
    writeHeader(fixUnicode(postObject.title))
    postObject.replace_more_comments()
    postAuthorName = ''
    postAuthorExists = 0
    try:
        postAuthorName = fixUnicode(postObject.author.name)
        postAuthorExists = 1
    except AttributeError:
        postAuthorExists = 0
    htmlFile.write('<div class="title">\n')
    if postObject.is_self:
        # The post is a self post
        htmlFile.write(fixUnicode(postObject.title))
        htmlFile.write('\n<br/><strong>')
    else:
        # The post is a link post
        htmlFile.write('<a id="postlink" href="' + fixUnicode(postObject.url))
        htmlFile.write('">')
        htmlFile.write(fixUnicode(postObject.title))
        htmlFile.write('</a>\n<br/><strong>')
    if postAuthorExists:
        htmlFile.write('Posted by <a id="userlink" href="' + fixUnicode(postObject.author._url))
        htmlFile.write('">')
        htmlFile.write(postAuthorName)
        htmlFile.write('</a>. </strong><em>')
    else:
        htmlFile.write('Posted by [Deleted]. </strong><em>')
    htmlFile.write('Posted at ')
    postDate = time.gmtime(postObject.created_utc)
    htmlFile.write(str(postDate.tm_hour) + ':')
    htmlFile.write(str(postDate.tm_min) + ' UTC on ')
    htmlFile.write(monthsList[postDate.tm_mon-1] + ' ')
    htmlFile.write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
    htmlFile.write('. ' + str(postObject.ups - postObject.downs))
    if postObject.is_self:
        htmlFile.write(' Points. </em><em>(self.<a id="selfLink" href="')
    else:
        htmlFile.write(' Points. </em><em>(<a id="selfLink" href="')
    htmlFile.write(postObject.subreddit._url)
    htmlFile.write('">' + postObject.subreddit.display_name)
    if postObject.is_self:
        htmlFile.write('</a>)</em><em>')
    else:
        htmlFile.write('</a> Subreddit)</em><em>')
    htmlFile.write(' (<a id="postpermalink" href="')
    htmlFile.write(fixUnicode(postObject.permalink))
    htmlFile.write('">Permalink</a>)</em>\n')
    if postObject.is_self:
        htmlFile.write('<div class="post">\n')
        htmlFile.write(snudown.markdown(fixMarkdown(postObject.selftext)))
        htmlFile.write('</div>\n')
    else:
        htmlFile.write('<div class="post">\n<p>\n')
        htmlFile.write(postObject.url)
        htmlFile.write('</p>\n</div>\n')
    htmlFile.write('</div>\n')
    for comment in postObject._comments:
        parseComment(comment, postAuthorName, postAuthorExists)
    htmlFile.write('<hr id="footerhr">\n')
    htmlFile.write('<div id="footer"><em>Archived on ')
    htmlFile.write(str(datetime.datetime.utcnow()))
    htmlFile.write(' UTC</em></div>')
    htmlFile.write('\n\n</body>\n</html>\n')
Example #31
def write_link_page(subreddits,
                    link,
                    subreddit='',
                    hide_deleted_comments=False):
    # reddit:  https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if os.path.isfile(filepath):
        return True

    created = datetime.utcfromtimestamp(int(link['created_utc']))
    sorted_comments = []
    if len(link['comments']) > 0:
        sorted_comments = sort_comments(link['comments'],
                                        hide_deleted_comments)

    # traverse up to root dir, depends on id length
    static_include_path = ''
    for i in range(len(link['id']) + 2):
        static_include_path += '../'

    # render comments
    comments_html = ''
    for c in sorted_comments:
        css_classes = 'ml-' + (str(c['depth']) if int(c['depth']) <=
                               max_comment_depth else str(max_comment_depth))
        if c['author'] == link['author'] and c[
                'author'] not in removed_content_identifiers:
            css_classes += ' op'
        if c['stickied'].lower() == 'true' or c['stickied'] is True:
            css_classes += ' stickied'

        # author link
        url = static_include_path + 'user/' + c['author'] + '.html'
        author_link_html = template_user_url.replace(
            '###URL_AUTHOR###', url).replace('###AUTHOR###', c['author'])

        comment_data_map = {
            '###ID###':
            c['id'],
            '###PARENT_ID###':
            c['parent_id'],
            '###DEPTH###':
            str(c['depth']),
            '###DATE###':
            created.strftime('%Y-%m-%d'),
            '###SCORE###':
            str(c['score'])
            if len(str(c['score'])) > 0 else missing_comment_score_label,
            '###BODY###':
            snudown.markdown(c['body'].replace('&gt;', '>')),
            '###CSS_CLASSES###':
            css_classes,
            '###CLASS_SCORE###':
            'badge-danger' if len(c['score']) > 0 and int(c['score']) < 1 else
            'badge-secondary',
            '###HTML_AUTHOR_URL###':
            author_link_html,
        }
        comment_html = template_comment
        for key, value in comment_data_map.items():
            comment_html = comment_html.replace(key, value)
        comments_html += comment_html + '\n'

    # render subreddits list
    subs_menu_html = ''
    for sub in subreddits:
        sub_url = static_include_path + sub + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###',
                                                    sub_url).replace(
                                                        '###SUB###', sub)

    # render selftext
    selftext_html = ''
    if len(link['selftext']) > 0:
        selftext_html = template_selftext.replace(
            '###SELFTEXT###',
            snudown.markdown(link['selftext'].replace('&gt;', '>')))

    # author link
    url = static_include_path + 'user/' + link['author'] + '.html'
    author_link_html = template_user_url.replace(
        '###URL_AUTHOR###', url).replace('###AUTHOR###', link['author'])

    html_title = template_url.replace('#HREF#', link['url']).replace(
        '#INNER_HTML#', link['title'])
    if link['is_self'] is True or link['is_self'].lower() == 'true':
        html_title = link['title']

    # render link page
    link_data_map = {
        '###INCLUDE_PATH###':
        static_include_path,
        '###SUB###':
        subreddit,
        '###TITLE###':
        link['title'],
        '###ID###':
        link['id'],
        '###DATE###':
        created.strftime('%Y-%m-%d'),
        '###ARCHIVE_DATE###':
        datetime.utcfromtimestamp(int(
            link['retrieved_on'])).strftime('%Y-%m-%d')
        if link['retrieved_on'] != '' else 'n/a',
        '###SCORE###':
        str(link['score']),
        '###NUM_COMMENTS###':
        str(link['num_comments']),
        '###URL_PROJECT###':
        url_project,
        '###URL_SUBS###':
        static_include_path + 'index.html',
        '###URL_SUB###':
        static_include_path + subreddit + '/index.html',
        '###URL_SUB_CMNT###':
        static_include_path + subreddit + '/index-' +
        sort_indexes['num_comments']['slug'] + '/index.html',
        '###URL_SUB_DATE###':
        static_include_path + subreddit + '/index-' +
        sort_indexes['created_utc']['slug'] + '/index.html',
        '###URL_SEARCH###':
        static_include_path + subreddit + '/search.html',
        '###HTML_SUBS_MENU###':
        subs_menu_html,
        '###HTML_SELFTEXT###':
        selftext_html,
        '###HTML_COMMENTS###':
        comments_html,
        '###HTML_AUTHOR_URL###':
        author_link_html,
        '###HTML_TITLE###':
        html_title,
    }
    html = template_link
    for key, value in link_data_map.items():
        html = html.replace(key, value)

    # write html
    # reddit:  https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(html)
        # print('wrote %s %s' % (created.strftime('%Y-%m-%d'), filepath))

    return True
Example #32
def markdown(value):
    return snudown.markdown(value)
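This final wrapper calls snudown with its defaults: no nofollow, no target and the standard user-text renderer, which is the simplest way to use the module. A quick check of what that returns (the commented output assumes a standard snudown build, which autolinks subreddit references):

import snudown

print(snudown.markdown('Visit /r/python and *enjoy*.'))
# Expected, roughly: <p>Visit <a href="/r/python">/r/python</a> and <em>enjoy</em>.</p>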