Example 1
def some_questions(templates, database, output, title, publisher, dump, question):
    filename = '%s.html' % slugify(question["Title"])
    filepath = os.path.join(output, 'question', filename)
    images = os.path.join(output, 'static', 'images')
    # rewrite image sources and offline every image referenced by the question and its answers
    for post in chain([question], question['answers']):
        body = string2html(post['Body'])
        imgs = body.xpath('//img')
        for img in imgs:
            src = img.attrib['src']
            ext = os.path.splitext(src)[1]
            # use a distinct name so the page's filename (used in the error message below) is not clobbered
            img_filename = sha1(src).hexdigest() + ext
            out = os.path.join(images, img_filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    download(src, out)
                except Exception:
                    # ignore download failures; the original remote URL is kept
                    pass
                else:
                    # update post's html
                    src = '../static/images/' + img_filename
                    img.attrib['src'] = src
                    # finalize offlining
                    try:
                        resize(out)
                        optimize(out)
                    except Exception:
                        print "Something went wrong with " + out
        # if the post contains images, its content was modified
        # above, so save it back into the post.
        if imgs:
            body = html2string(body)
            post['Body'] = body

    # render the question page through the Jinja template
    try:
        jinja(
            filepath,
            'question.html',
            templates,
            question=question,
            rooturl="..",
            title=title,
            publisher=publisher,
        )
    except Exception:
        print ' * failed to generate: %s' % filename
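
A minimal, self-contained Python 3 sketch of the caching pattern used above: the source URL is hashed into a stable filename, and the download is skipped when that file already exists. urlretrieve stands in for the project's download() helper, which is not shown in the excerpt:

import os
from hashlib import sha1
from urllib.request import urlretrieve

def cache_image(src, images_dir):
    # derive a stable filename from the URL so repeated references hit the cache
    ext = os.path.splitext(src)[1]
    filename = sha1(src.encode('utf-8')).hexdigest() + ext
    out = os.path.join(images_dir, filename)
    # download only if the file is not already cached
    if not os.path.exists(out):
        urlretrieve(src, out)
    # return the relative URL to substitute into the post's html
    return '../static/images/' + filename
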
Example 2
def interne_link(text_post, domain, id):
    body = string2html(text_post)
    links = body.xpath('//a')
    for a in links:
        if "href" in a.attrib:
            a_href = re.sub("^https?://", "", a.attrib['href'])
            if len(a_href) >= 2 and a_href[0] == "/" and a_href[1] != "/":
                # site-relative link such as "/questions/..."; drop the leading slash
                link = a_href[1:]
            elif a_href[0:len(domain)] == domain or a_href[0:len(domain) + 2] == "//" + domain:
                if a_href[0] == "/":
                    # protocol-relative link "//domain/..."; drop "//" + domain + "/"
                    link = a_href[len(domain) + 3:]
                else:
                    link = a_href[len(domain) + 1:]
            else:
                continue
            if link[0:2] == "q/" or (link[0:10] == "questions/" and link[10:17] != "tagged/"):
                is_a = link.split("/")[-1].split("#")
                if len(is_a) == 2 and is_a[0] == is_a[1]:
                    # it's an answer link
                    qans = is_a[0]
                    a.attrib['href'] = "../answer/" + qans + ".html#a" + qans
                else:
                    # it's a question link
                    qid = link.split("/")[1]
                    a.attrib['href'] = qid + ".html"
            elif link[0:10] == "questions/" and link[10:17] == "tagged/":
                tag = urllib.quote(link.split("/")[-1])
                a.attrib['href'] = "../tag/" + tag + ".html"
            elif link[0:2] == "a/":
                qans_split = link.split("/")
                if len(qans_split) == 3:
                    qans = qans_split[2]
                else:
                    qans = qans_split[1]
                a.attrib['href'] = "../answer/" + qans + ".html#a" + qans
            elif link[0:6] == "users/":
                userid = link.split("/")[1]
                a.attrib['href'] = "../user/" + userid + ".html"
    if links:
        text_post = html2string(body)
    return text_post
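
A simplified, self-contained illustration of the same rewriting idea, using lxml.html directly (string2html/html2string in the example presumably wrap similar calls; the domain and markup below are made up):

import lxml.html

html = '<p><a href="https://example.stackexchange.com/questions/123/title">see</a></p>'
body = lxml.html.fromstring(html)
for a in body.xpath('//a'):
    href = a.attrib.get('href', '')
    # rewrite question links to point at the local page
    if '/questions/' in href:
        qid = href.split('/questions/')[1].split('/')[0]
        a.attrib['href'] = qid + '.html'
print(lxml.html.tostring(body).decode())  # <p><a href="123.html">see</a></p>
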
Example 3
def process(args):
    images, filepaths, uid = args
    count = len(filepaths)
    print 'offlining start', uid
    for index, filepath in enumerate(filepaths):
        print 'offline %s/%s (%s)' % (index, count, uid)
        try:
            body = html(filepath)
        except Exception as exc:  # error during xml parsing
            print exc
        else:
            imgs = body.xpath('//img')
            for img in imgs:
                src = img.attrib['src']
                ext = os.path.splitext(src)[1]
                filename = sha1(src).hexdigest() + ext
                out = os.path.join(images, filename)
                # download the image only if it's not already downloaded
                if not os.path.exists(out):
                    try:
                        download(src, out)
                    except Exception:
                        # ignore download failures; the original remote URL is kept
                        pass
                    else:
                        # update post's html
                        src = '../static/images/' + filename
                        img.attrib['src'] = src
                        # finalize offlining
                        resize(out)
                        optimize(out)
            # if the post contains images, its content was modified
            # above, so write it back to disk.
            if imgs:
                post = html2string(body)
                with open(filepath, 'w') as f:
                    f.write(post)
    print 'offlining finished', uid
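
process() takes a single packed tuple, which is the calling convention of multiprocessing.Pool.map. A plausible invocation, assuming that convention (the chunking scheme and worker count below are illustrative, not taken from the source):

from multiprocessing import Pool

def offline_all(images, filepaths, workers=4):
    # round-robin the files into one chunk per worker; each chunk becomes
    # an (images, chunk, uid) tuple matching process()'s unpacking
    chunks = [filepaths[i::workers] for i in range(workers)]
    jobs = [(images, chunk, uid) for uid, chunk in enumerate(chunks)]
    pool = Pool(workers)
    pool.map(process, jobs)
    pool.close()
    pool.join()
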
Example 4
def dl_dependencies(content, path, folder_name, c):
    body = string2html(str(content))
    imgs = body.xpath('//img')
    for img in imgs:
        if "src" in img.attrib:
            src = img.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    headers = download(src,
                                       out,
                                       c.conf["instance_url"],
                                       timeout=180)
                    type_of_file = get_filetype(headers, out)
                    optimize_one(out, type_of_file)
                except Exception as e:
                    logging.warning(str(e) + " : error with " + src)
            src = os.path.join(folder_name, filename)
            img.attrib['src'] = src
            if 'style' in img.attrib:
                # note the ';' separator: appending without it would corrupt an existing style
                img.attrib['style'] += "; max-width:100%"
            else:
                img.attrib['style'] = "max-width:100%"
    docs = body.xpath('//a')
    for a in docs:
        if "href" in a.attrib:
            src = a.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if ext in [
                    ".doc", ".docx", ".pdf", ".DOC", ".DOCX", ".PDF", ".mp4",
                    ".MP4", ".webm", ".WEBM", ".mp3", ".MP3", ".zip", ".ZIP",
                    ".TXT", ".txt", ".CSV", ".csv", ".R", ".r"
            ] or (
                    not is_absolute(src) and "wiki" not in src
            ):  # download when the extension matches, or when the link is relative (wiki links are skipped because they are all relative)
                if not os.path.exists(out):
                    try:
                        download(unquote(src),
                                 out,
                                 c.conf["instance_url"],
                                 timeout=180)
                    except Exception as e:
                        logging.warning(str(e) + " : error with " + src)
                src = os.path.join(folder_name, filename)
                a.attrib['href'] = src
    csss = body.xpath('//link')
    for css in csss:
        if "href" in css.attrib:
            src = css.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, c.conf["instance_url"], timeout=180)
                except Exception as e:
                    logging.warning(str(e) + " : error with " + src)
            src = os.path.join(folder_name, filename)
            css.attrib['href'] = src
    jss = body.xpath('//script')
    for js in jss:
        if "src" in js.attrib:
            src = js.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, c.conf["instance_url"], timeout=180)
                except Exception as e:
                    logging.warning(str(e) + " : error with " + src)
            src = os.path.join(folder_name, filename)
            js.attrib['src'] = src
    sources = body.xpath('//source')
    for source in sources:
        if "src" in source.attrib:
            src = source.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, c.conf["instance_url"], timeout=180)
                except Exception as e:
                    logging.warning(str(e) + " : error with " + src)
            src = os.path.join(folder_name, filename)
            source.attrib['src'] = src
    iframes = body.xpath('//iframe')
    for iframe in iframes:
        if "src" in iframe.attrib:
            src = iframe.attrib['src']
            if "youtube" in src:
                name = src.split("/")[-1]
                out_dir = os.path.join(path, name)
                make_dir(out_dir)
                out = os.path.join(out_dir, "video.mp4")
                if not os.path.exists(out):
                    try:
                        download_youtube(src, out)
                    except Exception as e:
                        logging.warning(str(e) + " : error with " + src)
                x = jinja(None,
                          "video.html",
                          False,
                          format="mp4",
                          folder_name=name,
                          subs=[])
                iframe.getparent().replace(iframe, string2html(x))
            elif ".pdf" in src:
                filename_src = src.split("/")[-1]
                ext = os.path.splitext(filename_src.split("?")[0])[1]
                filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
                out = os.path.join(path, filename)
                if not os.path.exists(out):
                    try:
                        download(unquote(src),
                                 out,
                                 c.conf["instance_url"],
                                 timeout=180)
                    except Exception as e:
                        logging.warning(str(e) + " : error with " + src)
                src = os.path.join(folder_name, filename)
                iframe.attrib['src'] = src
    if imgs or docs or csss or jss or sources or iframes:
        content = html2string(body, encoding="unicode")
    return content
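
The img/a/link/script/source loops above differ only in the tag name and the attribute carrying the URL. A refactoring sketch (not code from the project) that drives one loop from a (tag, attribute) table; iframes would still need their own pass because of the youtube and pdf special cases:

URL_ATTRS = [('img', 'src'), ('a', 'href'), ('link', 'href'),
             ('script', 'src'), ('source', 'src')]

def rewrite_urls(body, handle):
    # handle(element, url) downloads the resource and returns the new local URL
    for tag, attr in URL_ATTRS:
        for el in body.xpath('//' + tag):
            if attr in el.attrib:
                el.attrib[attr] = handle(el, el.attrib[attr])
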
Example 5
                    # finalize offlining
                    resize_one(out, type, "540")
                    optimize_one(out, type)
                except Exception, e:
                    # log the failure and keep going with the next image
                    print e
            # update post's html
            src = '../static/images/' + filename
            img.attrib['src'] = src
            img.attrib['style'] = "max-width:100%"

    # if the post contains images, its content was modified
    # above, so save it back into the post.
    if imgs:
        text_post = html2string(body)
    return text_post

def grab_title_description_favicon_lang(url, output_dir, do_old):
    get_data = urlopen(url)
    if "area51" in get_data.geturl():
        if do_old:
            close_site = {
                "http://arabic.stackexchange.com": "https://web.archive.org/web/20150812150251/http://arabic.stackexchange.com/"
            }
            if url in close_site:
                get_data = urlopen(close_site[url])
            else:
                sys.exit("This site is a closed site and is not supported by sotoki; please open an issue")
        else:
            print "This site is a closed site and --ignoreoldsite has been passed as an argument, so we stop"
            sys.exit(0)
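
geturl() returns the final URL after any redirects, which is how the code above detects that a closed site now redirects to area51. A minimal Python 3 illustration of the same check (the URL is a placeholder):

from urllib.request import urlopen

resp = urlopen("http://example.stackexchange.com")
if "area51" in resp.geturl():
    # the request was redirected to area51: the site has been closed
    print("closed site, final URL:", resp.geturl())
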
Example 6
def dl_dependencies(content, path, folder_name, instance_url):
    body = string2html(content)
    imgs = body.xpath('//img')
    for img in imgs:
        if "src" in img.attrib:
            src = img.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    headers = download(src, out, instance_url, timeout=180)
                    type_of_file = get_filetype(headers, out)
                    # update post's html
                    resize_one(out, type_of_file, "540")
                    optimize_one(out, type_of_file)
                except Exception:
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            img.attrib['src'] = src
            img.attrib['style'] = "max-width:100%"
    docs = body.xpath('//a')
    for a in docs:
        if "href" in a.attrib:
            src = a.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            # download the document only if its extension is on the list
            if ext in [".doc", ".docx", ".pdf", ".DOC", ".DOCX", ".PDF"
                       ]:  # TODO: a better solution for extensions (a blacklist?)
                if not os.path.exists(out):
                    try:
                        download(src, out, instance_url, timeout=180)
                    except Exception:
                        logging.warning("error with " + src)
                src = os.path.join(folder_name, filename)
                a.attrib['href'] = src
    csss = body.xpath('//link')
    for css in csss:
        if "href" in css.attrib:
            src = css.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, instance_url, timeout=180)
                except Exception:
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            css.attrib['href'] = src
    jss = body.xpath('//script')
    for js in jss:
        if "src" in js.attrib:
            src = js.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, instance_url, timeout=180)
                except Exception:
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            js.attrib['src'] = src  # scripts carry their URL in 'src', not 'href'
    if imgs or docs or csss or jss:
        content = html2string(body, encoding="unicode")
    return content
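
A hypothetical call site for the function above, assuming page.html holds the page body, assets are written under build/static, and rewritten URLs should start with ../static (all three names are illustrative):

with open("page.html") as f:
    html_body = f.read()
html_body = dl_dependencies(html_body,
                            path="build/static",
                            folder_name="../static",
                            instance_url="https://example.org")
with open("page.html", "w") as f:
    f.write(html_body)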