Beispiel #1
0
def image(text_post, output, nopic):
    """Localize the <img> tags of a post's HTML body.

    Each image URL is downloaded into ``<output>/static/images`` under a
    sha256-derived filename, resized and optimized, and the tag's ``src``
    is rewritten to the relative cached path.  With ``nopic`` set, image
    sources are blanked out instead of downloaded.

    Args:
        text_post: HTML string of the post body.
        output: root output directory.
        nopic: truthy to strip images rather than cache them.

    NOTE(review): ``body`` is a local parse tree; this function never
    serializes it back or returns it, so the rewritten HTML appears to be
    discarded — confirm against the caller.
    """
    images = os.path.join(output, 'static', 'images')
    body = string2html(text_post)
    imgs = body.xpath('//img')
    for img in imgs:
        if nopic:
            # no-picture mode: blank the source instead of downloading
            img.attrib['src'] = ""
        else:
            src = img.attrib['src']
            # extension comes from the URL path, query string stripped
            ext = os.path.splitext(src.split("?")[0])[1]
            # hash the URL so the cached filename is stable and unique
            # (py3: sha256 requires bytes, hence the encode)
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(images, filename)
            # download the image only if it's not already downloaded and if it's not a html
            if not os.path.exists(out) and ext != ".html":
                try:
                    headers = download(src, out, timeout=180)
                    file_type = get_filetype(headers, out)
                    # shrink and optimize the cached copy for offline use
                    resize_one(out, file_type, "540")
                    optimize_one(out, file_type)
                except Exception as e:  # was py2-only `except Exception, e:`
                    # best effort: report and keep going; the tag is
                    # rewritten to the local path regardless
                    print(e)
            src = '../static/images/' + filename
            img.attrib['src'] = src
            img.attrib['style'] = "max-width:100%"
Beispiel #2
0
def some_questions(templates, database, output, title, publisher, dump, question):
    """Render one question (and all of its answers) to an offline HTML page.

    Downloads every embedded image into ``<output>/static/images`` under a
    sha1-derived filename, rewrites the posts' HTML to point at the cached
    copies, then renders ``question.html`` via jinja into
    ``<output>/question/<slug>.html``.

    Args:
        templates: template directory passed through to ``jinja``.
        database, dump: unused here — presumably part of a shared worker
            signature; TODO confirm.
        output: root output directory.
        title, publisher: metadata forwarded to the template.
        question: dict with at least ``Title``, ``Body`` and ``answers``.
    """
    filename = '%s.html' % slugify(question["Title"])
    filepath = os.path.join(output, 'question', filename)
    images = os.path.join(output, 'static', 'images')
    # process the question itself plus each answer
    for post in chain([question], question['answers']):
        body = string2html(post['Body'])
        imgs = body.xpath('//img')
        for img in imgs:
            src = img.attrib['src']
            ext = os.path.splitext(src)[1]
            # hash the URL for a stable local filename
            # (py3: sha1 requires bytes, hence the encode)
            filename = sha1(src.encode('utf-8')).hexdigest() + ext
            out = os.path.join(images, filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    download(src, out)
                except Exception:  # narrowed from a bare except
                    # best effort: leave the original remote src in place
                    pass
                else:
                    # update post's html
                    src = '../static/images/' + filename
                    img.attrib['src'] = src
                    # finalize offlining
                    try:
                        resize(out)
                        optimize(out)
                    except Exception:  # narrowed from a bare except
                        print("Something went wrong with" + out)
        # does the post contain images? if so, we surely modified
        # its content so save it.
        if imgs:
            body = html2string(body)
            post['Body'] = body

    # render the page; failure is reported but not fatal
    try:
        jinja(
            filepath,
            'question.html',
            templates,
            question=question,
            rooturl="..",
            title=title,
            publisher=publisher,
        )
    except Exception:  # narrowed from a bare except
        print(' * failed to generate: %s' % filename)
Beispiel #3
0
def interne_link(text_post, domain, id):
    """Rewrite internal site links in a post's HTML to relative offline pages.

    Recognizes question (``q/``, ``questions/``), answer (``a/``), tag
    (``questions/tagged/``) and user (``users/``) URLs belonging to
    ``domain`` and points them at the corresponding generated
    ``../answer/``, ``../tag/``, ``../user/`` or local question pages.
    External links are left untouched.

    Args:
        text_post: HTML string of the post body.
        domain: site domain used to detect internal absolute links.
        id: unused here — presumably kept for a shared signature; TODO confirm.

    Returns:
        The (possibly rewritten) HTML string.
    """
    from urllib.parse import quote  # py3 replacement for py2 urllib.quote

    body = string2html(text_post)
    links = body.xpath('//a')
    for a in links:
        if "href" in a.attrib:  # py3-safe; original used dict.has_key()
            a_href = re.sub("^https?://", "", a.attrib['href'])
            if len(a_href) >= 2 and a_href[0] == "/" and a_href[1] != "/":
                # site-relative link ("/questions/...")
                link = a_href
            elif a_href[0:len(domain)] == domain or a_href[0:len(domain) + 2] == "//" + domain:
                if a_href[0] == "/":
                    # NOTE(review): protocol-relative "//domain/..." — slicing
                    # off only two chars leaves the domain prefix in `link`,
                    # so none of the branches below will match; looks like a
                    # latent bug, confirm intended behavior before changing.
                    link = a_href[2:]
                else:
                    link = a_href[len(domain) + 1:]
            else:
                # external link: leave as-is
                continue
            if link[0:2] == "q/" or (link[0:10] == "questions/" and link[10:17] != "tagged/"):
                is_a = link.split("/")[-1].split("#")
                if len(is_a) == 2 and is_a[0] == is_a[1]:
                    # it's an answer link (".../<id>#<id>")
                    qans = is_a[0]
                    a.attrib['href'] = "../answer/" + qans + ".html#a" + qans
                else:
                    # question link
                    qid = link.split("/")[1]
                    a.attrib['href'] = qid + ".html"
            elif link[0:10] == "questions/" and link[10:17] == "tagged/":
                # tag listing page
                tag = quote(link.split("/")[-1])
                a.attrib['href'] = "../tag/" + tag + ".html"
            elif link[0:2] == "a/":
                # answer short-link; may or may not carry a trailing component
                qans_split = link.split("/")
                if len(qans_split) == 3:
                    qans = link.split("/")[2]
                else:
                    qans = link.split("/")[1]
                a.attrib['href'] = "../answer/" + qans + ".html#a" + qans
            elif link[0:6] == "users/":
                userid = link.split("/")[1]
                a.attrib['href'] = "../user/" + userid + ".html"
    if links:
        # at least one anchor was inspected, re-serialize the tree
        text_post = html2string(body)
    return text_post
Beispiel #4
0
def dl_dependencies(content, path, folder_name, c):
    """Download every external dependency of an HTML fragment and rewrite it.

    Handles ``<img>``, ``<a>`` (selected document/media extensions or
    relative non-wiki links), ``<link>``, ``<script>``, ``<source>`` and
    ``<iframe>`` (YouTube videos and PDFs).  Each resource is cached under
    ``path`` with a sha256-derived filename and the element attribute is
    rewritten to ``folder_name/<filename>``.

    Args:
        content: HTML fragment (stringified before parsing).
        path: directory where cached files are written.
        folder_name: relative prefix used when rewriting attributes.
        c: client/context object; ``c.conf["instance_url"]`` is the base URL
            for relative downloads.

    Returns:
        The rewritten HTML as a unicode string (unchanged if nothing matched).
    """
    body = string2html(str(content))
    imgs = body.xpath('//img')
    for img in imgs:
        if "src" in img.attrib:
            src = img.attrib['src']
            # extension from the URL path, query string stripped
            ext = os.path.splitext(src.split("?")[0])[1]
            # sha256 of the URL gives a stable, collision-free local name
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    headers = download(src,
                                       out,
                                       c.conf["instance_url"],
                                       timeout=180)
                    type_of_file = get_filetype(headers, out)
                    optimize_one(out, type_of_file)
                except Exception as e:
                    # best effort: log and still rewrite to the local path
                    logging.warning(str(e) + " : error with " + src)
            src = os.path.join(folder_name, filename)
            img.attrib['src'] = src
            # keep images inside the page width
            if 'style' in img.attrib:
                img.attrib['style'] += " max-width:100%"
            else:
                img.attrib['style'] = " max-width:100%"
    docs = body.xpath('//a')
    for a in docs:
        if "href" in a.attrib:
            src = a.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if ext in [
                    ".doc", ".docx", ".pdf", ".DOC", ".DOCX", ".PDF", ".mp4",
                    ".MP4", ".webm", ".WEBM", ".mp3", ".MP3", ".zip", ".ZIP",
                    ".TXT", ".txt", ".CSV", ".csv", ".R", ".r"
            ] or (
                    not is_absolute(src) and "wiki" not in src
            ):  # Download when ext matches, or when the link is relative (but not in wiki, because links in wiki are relative)
                if not os.path.exists(out):
                    try:
                        download(unquote(src),
                                 out,
                                 c.conf["instance_url"],
                                 timeout=180)
                    except Exception:  # narrowed from a bare except
                        logging.warning("error with " + src)
                src = os.path.join(folder_name, filename)
                a.attrib['href'] = src
    csss = body.xpath('//link')
    for css in csss:
        if "href" in css.attrib:
            src = css.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, c.conf["instance_url"], timeout=180)
                except Exception:  # narrowed from a bare except
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            css.attrib['href'] = src
    jss = body.xpath('//script')
    for js in jss:
        if "src" in js.attrib:
            src = js.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, c.conf["instance_url"], timeout=180)
                except Exception:  # narrowed from a bare except
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            js.attrib['src'] = src
    sources = body.xpath('//source')
    for source in sources:
        if "src" in source.attrib:
            src = source.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, c.conf["instance_url"], timeout=180)
                except Exception:  # narrowed from a bare except
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            source.attrib['src'] = src
    iframes = body.xpath('//iframe')
    for iframe in iframes:
        if "src" in iframe.attrib:
            src = iframe.attrib['src']
            if "youtube" in src:
                # replace the embed with a locally rendered <video> snippet
                name = src.split("/")[-1]
                out_dir = os.path.join(path, name)
                make_dir(out_dir)
                out = os.path.join(out_dir, "video.mp4")
                if not os.path.exists(out):
                    try:
                        download_youtube(src, out)
                    except Exception as e:
                        logging.warning(str(e) + " : error with " + src)
                x = jinja(None,
                          "video.html",
                          False,
                          format="mp4",
                          folder_name=name,
                          subs=[])
                iframe.getparent().replace(iframe, string2html(x))
            elif ".pdf" in src:
                # embedded PDFs are cached like documents
                filename_src = src.split("/")[-1]
                ext = os.path.splitext(filename_src.split("?")[0])[1]
                filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
                out = os.path.join(path, filename)
                if not os.path.exists(out):
                    try:
                        download(unquote(src),
                                 out,
                                 c.conf["instance_url"],
                                 timeout=180)
                    except Exception:  # narrowed from a bare except
                        logging.warning("error with " + src)
                src = os.path.join(folder_name, filename)
                iframe.attrib['src'] = src
    # re-serialize only if some dependency category was present
    if imgs or docs or csss or jss or sources or iframes:
        content = html2string(body, encoding="unicode")
    return content
def dl_dependencies(content, path, folder_name, instance_url):
    """Download images, documents, stylesheets and scripts referenced by an
    HTML fragment and rewrite their URLs to local cached copies.

    Args:
        content: HTML fragment to process.
        path: directory where cached files are written.
        folder_name: relative prefix used when rewriting attributes.
        instance_url: base URL passed to ``download`` for relative resources.

    Returns:
        The rewritten HTML as a unicode string (unchanged if nothing matched).

    NOTE(review): this file defines another ``dl_dependencies`` with a
    different signature above; in Python the later definition wins —
    confirm which one callers expect.
    """
    body = string2html(content)
    imgs = body.xpath('//img')
    for img in imgs:
        if "src" in img.attrib:
            src = img.attrib['src']
            # extension from the URL path, query string stripped
            ext = os.path.splitext(src.split("?")[0])[1]
            # sha256 of the URL gives a stable, collision-free local name
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    headers = download(src, out, instance_url, timeout=180)
                    type_of_file = get_filetype(headers, out)
                    # shrink and optimize the cached copy for offline use
                    resize_one(out, type_of_file, "540")
                    optimize_one(out, type_of_file)
                except Exception:  # narrowed from a bare except
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            img.attrib['src'] = src
            img.attrib['style'] = "max-width:100%"
    docs = body.xpath('//a')
    for a in docs:
        if "href" in a.attrib:
            src = a.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            # download only known document types
            if ext in [".doc", ".docx", ".pdf", ".DOC", ".DOCX", ".PDF"
                       ]:  # TODO better solution for extension (black list?)
                if not os.path.exists(out):
                    try:
                        download(src, out, instance_url, timeout=180)
                    except Exception:  # narrowed from a bare except
                        logging.warning("error with " + src)
                src = os.path.join(folder_name, filename)
                a.attrib['href'] = src
    csss = body.xpath('//link')
    for css in csss:
        if "href" in css.attrib:
            src = css.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, instance_url, timeout=180)
                except Exception:  # narrowed from a bare except
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            css.attrib['href'] = src
    jss = body.xpath('//script')
    for js in jss:
        if "src" in js.attrib:
            src = js.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, instance_url, timeout=180)
                except Exception:  # narrowed from a bare except
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            # BUG FIX: scripts were rewritten into 'href', which <script>
            # ignores — the local copy was never used; write 'src' instead
            js.attrib['src'] = src
    if imgs or docs or csss or jss:
        content = html2string(body, encoding="unicode")
    return content