def some_questions(templates, database, output, title, publisher, dump, question):
    filename = '%s.html' % slugify(question["Title"])
    filepath = os.path.join(output, 'question', filename)
    images = os.path.join(output, 'static', 'images')
    #
    for post in chain([question], question['answers']):
        body = string2html(post['Body'])
        imgs = body.xpath('//img')
        for img in imgs:
            src = img.attrib['src']
            ext = os.path.splitext(src)[1]
            # use a separate name for the image file so the page's html
            # filename is not clobbered inside the loop
            img_filename = sha1(src).hexdigest() + ext
            out = os.path.join(images, img_filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    download(src, out)
                except Exception:
                    # do nothing
                    pass
                else:
                    # update post's html
                    src = '../static/images/' + img_filename
                    img.attrib['src'] = src
                    # finalize offlining
                    try:
                        resize(out)
                        optimize(out)
                    except Exception:
                        print "Something went wrong with " + out
        # does the post contain images? if so, we surely modified
        # its content so save it.
        if imgs:
            body = html2string(body)
            post['Body'] = body
    #
    try:
        jinja(
            filepath,
            'question.html',
            templates,
            question=question,
            rooturl="..",
            title=title,
            publisher=publisher,
        )
    except Exception:
        print ' * failed to generate: %s' % filename
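

# A minimal, self-contained sketch (in Python 3 syntax) of the image naming
# scheme used above: the full source URL is hashed with SHA-1 and the original
# extension is kept, so repeated references to one image collapse onto a
# single local file. Note the Python 2 code above passes the str to sha1()
# directly; on Python 3 it must be encoded to bytes first.
import os
from hashlib import sha1

def local_image_name(src):
    ext = os.path.splitext(src)[1]
    return sha1(src.encode('utf-8')).hexdigest() + ext

# e.g. local_image_name('http://i.stack.imgur.com/example.png') -> '<40 hex chars>.png'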
def interne_link(text_post, domain, id):
    body = string2html(text_post)
    links = body.xpath('//a')
    for a in links:
        if "href" in a.attrib:
            a_href = re.sub("^https?://", "", a.attrib['href'])
            if len(a_href) >= 2 and a_href[0] == "/" and a_href[1] != "/":
                # site-relative link: drop the leading slash so the
                # prefix checks below can match
                link = a_href[1:]
            elif a_href[0:len(domain)] == domain or a_href[0:len(domain) + 2] == "//" + domain:
                if a_href[0] == "/":
                    # protocol-relative link: drop "//" plus the domain
                    link = a_href[len(domain) + 3:]
                else:
                    link = a_href[len(domain) + 1:]
            else:
                continue
            if link[0:2] == "q/" or (link[0:10] == "questions/" and link[10:17] != "tagged/"):
                is_a = link.split("/")[-1].split("#")
                if len(is_a) == 2 and is_a[0] == is_a[1]:
                    # it's an answer
                    qans = is_a[0]
                    a.attrib['href'] = "../answer/" + qans + ".html#a" + qans
                else:
                    # it's a question
                    qid = link.split("/")[1]
                    a.attrib['href'] = qid + ".html"
            elif link[0:10] == "questions/" and link[10:17] == "tagged/":
                tag = urllib.quote(link.split("/")[-1])
                a.attrib['href'] = "../tag/" + tag + ".html"
            elif link[0:2] == "a/":
                qans_split = link.split("/")
                if len(qans_split) == 3:
                    qans = qans_split[2]
                else:
                    qans = qans_split[1]
                a.attrib['href'] = "../answer/" + qans + ".html#a" + qans
            elif link[0:6] == "users/":
                userid = link.split("/")[1]
                a.attrib['href'] = "../user/" + userid + ".html"
    if links:
        text_post = html2string(body)
    return text_post
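

# Hypothetical usage of interne_link; the domain and the post HTML below are
# made up. A question URL on the site's own domain should come back rewritten
# to the locally generated page for that question.
html_in = '<p>See <a href="http://superuser.com/questions/123/some-title">this question</a>.</p>'
html_out = interne_link(html_in, "superuser.com", 1)
# expected: the anchor's href now reads "123.html"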
def process(args):
    images, filepaths, uid = args
    count = len(filepaths)
    print 'offlining start', uid
    for index, filepath in enumerate(filepaths):
        print 'offline %s/%s (%s)' % (index, count, uid)
        try:
            body = html(filepath)
        except Exception as exc:
            # error during xml parsing
            print exc
        else:
            imgs = body.xpath('//img')
            for img in imgs:
                src = img.attrib['src']
                ext = os.path.splitext(src)[1]
                filename = sha1(src).hexdigest() + ext
                out = os.path.join(images, filename)
                # download the image only if it's not already downloaded
                if not os.path.exists(out):
                    try:
                        download(src, out)
                    except Exception:
                        # do nothing
                        pass
                    else:
                        # update post's html
                        src = '../static/images/' + filename
                        img.attrib['src'] = src
                        # finalize offlining
                        resize(out)
                        optimize(out)
            # does the post contain images? if so, we surely modified
            # its content so save it.
            if imgs:
                post = html2string(body)
                with open(filepath, 'w') as f:
                    f.write(post)
    print 'offlining finished', uid
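

# How `process` might be dispatched (the dispatch code is not part of this
# excerpt, so the pool size and chunking below are illustrative assumptions):
# each worker receives the shared images directory, a slice of the file
# paths, and a worker id.
from multiprocessing import Pool

def offline_all(images, filepaths, workers=4):
    chunk = len(filepaths) // workers + 1
    jobs = [(images, filepaths[i:i + chunk], uid)
            for uid, i in enumerate(range(0, len(filepaths), chunk))]
    Pool(workers).map(process, jobs)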
def dl_dependencies(content, path, folder_name, c):
    body = string2html(str(content))
    imgs = body.xpath('//img')
    for img in imgs:
        if "src" in img.attrib:
            src = img.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    headers = download(src, out, c.conf["instance_url"], timeout=180)
                    type_of_file = get_filetype(headers, out)
                    optimize_one(out, type_of_file)
                except Exception as e:
                    logging.warning(str(e) + " : error with " + src)
            src = os.path.join(folder_name, filename)
            img.attrib['src'] = src
            if 'style' in img.attrib:
                img.attrib['style'] += " max-width:100%"
            else:
                img.attrib['style'] = " max-width:100%"
    docs = body.xpath('//a')
    for a in docs:
        if "href" in a.attrib:
            src = a.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if ext in [
                ".doc", ".docx", ".pdf", ".DOC", ".DOCX", ".PDF",
                ".mp4", ".MP4", ".webm", ".WEBM", ".mp3", ".MP3",
                ".zip", ".ZIP", ".TXT", ".txt", ".CSV", ".csv", ".R", ".r"
            ] or (not is_absolute(src) and "wiki" not in src):
                # download when the extension matches, or when the link is
                # relative (but not in the wiki, whose links are all relative)
                if not os.path.exists(out):
                    try:
                        download(unquote(src), out, c.conf["instance_url"], timeout=180)
                    except Exception:
                        logging.warning("error with " + src)
                src = os.path.join(folder_name, filename)
                a.attrib['href'] = src
    csss = body.xpath('//link')
    for css in csss:
        if "href" in css.attrib:
            src = css.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, c.conf["instance_url"], timeout=180)
                except Exception:
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            css.attrib['href'] = src
    jss = body.xpath('//script')
    for js in jss:
        if "src" in js.attrib:
            src = js.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, c.conf["instance_url"], timeout=180)
                except Exception:
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            js.attrib['src'] = src
    sources = body.xpath('//source')
    for source in sources:
        if "src" in source.attrib:
            src = source.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, c.conf["instance_url"], timeout=180)
                except Exception:
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            source.attrib['src'] = src
    iframes = body.xpath('//iframe')
    for iframe in iframes:
        if "src" in iframe.attrib:
            src = iframe.attrib['src']
            if "youtube" in src:
                # replace the embedded player with a local video template
                name = src.split("/")[-1]
                out_dir = os.path.join(path, name)
                make_dir(out_dir)
                out = os.path.join(out_dir, "video.mp4")
                if not os.path.exists(out):
                    try:
                        download_youtube(src, out)
                    except Exception as e:
                        logging.warning(str(e) + " : error with " + src)
                x = jinja(None, "video.html", False, format="mp4", folder_name=name, subs=[])
                iframe.getparent().replace(iframe, string2html(x))
            elif ".pdf" in src:
                filename_src = src.split("/")[-1]
                ext = os.path.splitext(filename_src.split("?")[0])[1]
                filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
                out = os.path.join(path, filename)
                if not os.path.exists(out):
                    try:
                        download(unquote(src), out, c.conf["instance_url"], timeout=180)
                    except Exception:
                        logging.warning("error with " + src)
                src = os.path.join(folder_name, filename)
                iframe.attrib['src'] = src
    if imgs or docs or csss or jss or sources or iframes:
        content = html2string(body, encoding="unicode")
    return content
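

# Sketch of the dependency-naming rule used throughout dl_dependencies: the
# extension comes from the URL with its query string stripped, while the
# SHA-256 hash covers the full URL, so "style.css?v=1" and "style.css?v=2"
# map to distinct local files that both keep the ".css" extension.
import os
from hashlib import sha256

def dependency_name(src):
    ext = os.path.splitext(src.split("?")[0])[1]
    return sha256(src.encode('utf-8')).hexdigest() + ext

assert dependency_name("a/style.css?v=1") != dependency_name("a/style.css?v=2")
assert dependency_name("a/style.css?v=1").endswith(".css")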
                    # update post's html
                    resize_one(out, type, "540")
                    optimize_one(out, type)
                except Exception as e:
                    # do nothing
                    print e
            src = '../static/images/' + filename
            img.attrib['src'] = src
            img.attrib['style'] = "max-width:100%"
    # finalize offlining
    # does the post contain images? if so, we surely modified
    # its content so save it.
    if imgs:
        text_post = html2string(body)
    return text_post


def grab_title_description_favicon_lang(url, output_dir, do_old):
    get_data = urlopen(url)
    if "area51" in get_data.geturl():
        if do_old:
            close_site = {
                "http://arabic.stackexchange.com": "https://web.archive.org/web/20150812150251/http://arabic.stackexchange.com/"
            }
            if url in close_site:
                get_data = urlopen(close_site[url])
            else:
                sys.exit("This site is a closed site and it's not supported by sotoki, please open an issue")
        else:
            print "This site is a closed site and --ignoreoldsite was passed as an argument, so we stop"
            sys.exit(0)
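

# Minimal sketch of the closed-site detection above, with an assumed
# Python 3 import (the original uses the Python 2 urlopen): urlopen follows
# HTTP redirects, and shut-down Stack Exchange sites redirect to area51,
# which the final URL exposes via geturl().
from urllib.request import urlopen

def is_closed_site(url):
    return "area51" in urlopen(url).geturl()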
def dl_dependencies(content, path, folder_name, instance_url):
    body = string2html(content)
    imgs = body.xpath('//img')
    for img in imgs:
        if "src" in img.attrib:
            src = img.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    headers = download(src, out, instance_url, timeout=180)
                    type_of_file = get_filetype(headers, out)
                    # update post's html
                    resize_one(out, type_of_file, "540")
                    optimize_one(out, type_of_file)
                except Exception:
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            img.attrib['src'] = src
            img.attrib['style'] = "max-width:100%"
    docs = body.xpath('//a')
    for a in docs:
        if "href" in a.attrib:
            src = a.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            # download the document only if its extension matches
            if ext in [".doc", ".docx", ".pdf", ".DOC", ".DOCX", ".PDF"]:
                # TODO: better solution for extensions (blacklist?)
                if not os.path.exists(out):
                    try:
                        download(src, out, instance_url, timeout=180)
                    except Exception:
                        logging.warning("error with " + src)
                src = os.path.join(folder_name, filename)
                a.attrib['href'] = src
    csss = body.xpath('//link')
    for css in csss:
        if "href" in css.attrib:
            src = css.attrib['href']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, instance_url, timeout=180)
                except Exception:
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            css.attrib['href'] = src
    jss = body.xpath('//script')
    for js in jss:
        if "src" in js.attrib:
            src = js.attrib['src']
            ext = os.path.splitext(src.split("?")[0])[1]
            filename = sha256(str(src).encode('utf-8')).hexdigest() + ext
            out = os.path.join(path, filename)
            if not os.path.exists(out):
                try:
                    download(src, out, instance_url, timeout=180)
                except Exception:
                    logging.warning("error with " + src)
            src = os.path.join(folder_name, filename)
            # scripts reference their file via "src", not "href"
            js.attrib['src'] = src
    if imgs or docs or csss or jss:
        content = html2string(body, encoding="unicode")
    return content
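

# Hypothetical call (the paths and instance URL are made up): offline one
# HTML fragment, downloading its assets into output/assets and rewriting
# the references to point at the "assets" folder.
page = '<p><img src="/static/logo.png"/><script src="/static/app.js"></script></p>'
page = dl_dependencies(page, "output/assets", "assets", "https://courses.example.org")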