def init_index(self):
    self.log('init_index')
    self._files = []
    for baseurl in self.baseurls:
        if baseurl.startswith('//'):
            addr = get_server_addr()
            if addr is None:
                continue
            baseurl = 'http://%s/%s' % (addr, baseurl[2:])
        if baseurl.startswith('http://'):
            url = urljoin(baseurl, 'index.txt')
            self.log(' opening: %r...' % url)
            try:
                index = urlopen(url)
                if index.getcode() in (None, 200):
                    files = index.read()
                    for name in files.splitlines():
                        (name, _, _) = name.strip().partition('#')
                        if not name:
                            continue
                        url = urljoin(baseurl, name)
                        self.log(' loading: %r...' % url)
                        fp = urlopen(url)
                        if fp.getcode() in (None, 200):
                            data = fp.read()
                            self._files.append((name, data))
                        fp.close()
                index.close()
                if self._files:
                    break
            except IOError as e:
                self.log(' error: %s' % e)
                continue
        else:
            # Fall back to local files.
            path = os.path.join(baseurl, 'index.txt')
            self.log(' opening: %r...' % path)
            try:
                index = open(path)
                for name in index:
                    (name, _, _) = name.strip().partition('#')
                    if not name:
                        continue
                    path = os.path.join(baseurl, name)
                    self.log(' loading: %r...' % path)
                    fp = open(path, 'rb')
                    data = fp.read()
                    self._files.append((name, data))
                    fp.close()
                index.close()
                if self._files:
                    break
            except IOError as e:
                self.log(' error: %s' % e)
                continue
    self.mode = 'index'
    self._text = 'INDEX'
    self.refresh()
    self.playSound('sound_close')
    self._curfile = None
    return
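
# A minimal sketch of the index.txt parsing rule assumed by init_index
# above: anything after '#' on a line is a comment, and blank or
# comment-only lines are skipped.  The file names are illustrative.
sample_index = "# asset manifest\nsprites.png\nsounds.ogg\n"
names = []
for line in sample_index.splitlines():
    (name, _, _) = line.strip().partition('#')
    if name:
        names.append(name)
assert names == ['sprites.png', 'sounds.ogg']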
def prepare_atom_template(self, entries):
    ns = cpy.config.get('/').copy()
    entry_structs = []
    last_updated = ''
    for e in entries:
        es = EntryStruct()
        es.title = e.title
        # This callback gives any interested plugins the chance to change
        # the text of a story, as presented in a feed. It receives an
        # Entry object; any return value is ignored.
        run_callback(self.parent.plugins, "cb_feed_story", e)
        fulltext = escape(e.text)
        # If you only want short descriptions:
        # es.desc = escape(e.text[:255])
        # For full-text descriptions:
        es.desc = fulltext
        es.text = fulltext
        es.time = time.strftime('%Y-%m-%dT%H:%M:%SZ', e.time_tuple)
        if not last_updated:
            last_updated = es.time
        es.link = urljoin(config('base_url'), e.relpath + '.html')
        entry_structs.append(es)
    ns['last_updated'] = last_updated
    ns['entries'] = entry_structs
    return ('atom', ns)
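
# The timestamp format used for es.time above, shown on a known instant.
# Note the trailing 'Z' asserts UTC, so e.time_tuple should be a UTC time
# tuple (e.g. from time.gmtime) for the feed timestamp to be strictly correct.
import time
assert time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(0)) == '1970-01-01T00:00:00Z'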
def getSeasons(sname):
    url = urljoin(config.turbofilm_base, os.path.join("Series", sname))
    parser = GetNSeasons()
    page = getpage(url)["page"]
    parser.feed(page)
    slinks = parser.get_slinks()
    seasons = {}
    for s in slinks:
        # Lazy '.*?' so the group captures the whole trailing season
        # number; a greedy '.*' would leave only the last digit.
        k = int(re.match(r".*?([0-9]+)$", s).group(1))
        u = urljoin(config.turbofilm_base, s)
        pg = getpage(u)["page"]
        p = GetSeriesDescription()
        p.feed(pg)
        seasons[k] = p.get_names()
    return seasons
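
# Why getSeasons uses a lazy '.*?' before the digit group: with a greedy
# '.*' the group is left holding only the final digit of the season number.
import re
assert re.match(r".*([0-9]+)$", "/Season/12").group(1) == "2"
assert re.match(r".*?([0-9]+)$", "/Season/12").group(1) == "12"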
def find_links_in_html_with_same_hostname(url, html):
    """ Find all the links in ``html`` that share a hostname with ``url`` """
    if html is None:
        return set()
    url = urllib.parse.urlparse(url)
    links = url_regex.findall(html)
    link_set = set()
    for link in links:
        if link is None:
            continue
        try:
            link = str(link)
            if link.startswith("/"):
                # Absolute path on the same host.
                link_set.add('http://' + url.netloc + link)
            elif link.startswith("http://") or link.startswith("https://"):
                # Keep absolute URLs only if they point at the same host.
                if url.netloc in link:
                    link_set.add(link)
            elif link.startswith("#"):
                continue
            else:
                # Relative link: resolve against the page URL.
                link_set.add(urllib.parse.urljoin(url.geturl(), link))
        except Exception:
            pass
    return link_set
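
# Usage sketch for find_links_in_html_with_same_hostname.  url_regex is
# defined elsewhere in this project; the pattern below is a hypothetical
# stand-in that extracts href attribute values.
import re
url_regex = re.compile(r'href="([^"]+)"')
html = '<a href="/a">x</a> <a href="http://example.com/b">y</a>'
links = find_links_in_html_with_same_hostname("http://example.com/", html)
# Expected: {'http://example.com/a', 'http://example.com/b'}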
def path_join(self, p1, p2):
    if is_url(p1) or is_url(p2):
        if '/' not in p2 and '.' not in p2:
            # Assume it's a directory -- needed for package loading
            p2 = p2 + "/"
        return urllib.parse.urljoin(p1, p2)
    else:
        return RHooks.path_join(self, p1, p2)
def help_cmd(self, event=None):
    """Dispatch a browser on self.HELP_URL."""
    if not self.app.browsers:
        print("No browser left to display help.")
        return
    browser = self.helpbrowser
    if not browser or not browser.valid():
        import Browser
        browser = Browser.Browser(self.app.root, self.app)
        self.helpbrowser = browser
    helproot = self.app.prefs.Get('landmarks', 'grail-help-root')
    browser.context.load(urllib.parse.urljoin(helproot, self.HELP_URL))
    browser.root.tkraise()
def start_a(self, attrs):
    uri = attrs.get("href", "").strip()
    if uri:
        self.__node = bookmarks.nodes.Bookmark()
        self.__root.append_child(self.__node)
        if self.__baseurl:
            uri = urllib.parse.urljoin(self.__baseurl, uri)
        self.__node.set_uri(uri)
        # Collapse internal runs of whitespace in the title.
        title = " ".join(attrs.get("title", "").split())
        if title:
            self.__node.set_title(title)
    else:
        self.__node = None
    self.save_bgn()
def urlopen(self, method, url, redirect=True, **kw):
    """
    Same as :meth:`urllib3.connectionpool.HTTPConnectionPool.urlopen`
    with custom cross-host redirect logic and only sends the request-uri
    portion of the ``url``.

    The given ``url`` parameter must be absolute, such that an appropriate
    :class:`urllib3.connectionpool.ConnectionPool` can be chosen for it.
    """
    u = parse_url(url)
    conn = self.connection_from_host(u.host, port=u.port, scheme=u.scheme)

    kw['assert_same_host'] = False
    kw['redirect'] = False
    if 'headers' not in kw:
        kw['headers'] = self.headers

    if self.proxy is not None and u.scheme == "http":
        response = conn.urlopen(method, url, **kw)
    else:
        response = conn.urlopen(method, u.request_uri, **kw)

    redirect_location = redirect and response.get_redirect_location()
    if not redirect_location:
        return response

    # Support relative URLs for redirecting.
    redirect_location = urljoin(url, redirect_location)

    # RFC 7231, Section 6.4.4
    if response.status == 303:
        method = 'GET'

    retries = kw.get('retries')
    if not isinstance(retries, Retry):
        retries = Retry.from_int(retries, redirect=redirect)

    try:
        retries = retries.increment(method, url, response=response, _pool=conn)
    except MaxRetryError:
        if retries.raise_on_redirect:
            raise
        return response

    kw['retries'] = retries
    kw['redirect'] = redirect

    log.info("Redirecting %s -> %s", url, redirect_location)
    return self.urlopen(method, redirect_location, **kw)
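
# How the redirect above is resolved: RFC 7231 allows a relative Location
# header, so it is joined against the original request URL before
# re-dispatching.  Plain stdlib behavior:
from urllib.parse import urljoin
assert urljoin("http://example.com/a/b?q=1", "/login") == "http://example.com/login"
assert urljoin("http://example.com/a/b", "c") == "http://example.com/a/c"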
def asset_src(path):
    """
    Returns an absolute URL for the resource located at
    ``http://popego_asset_host/path``, where ``popego_asset_host`` is the
    value defined in popego.asset_host.

    If config['pylons.g'].revision (in the globals) is not None, a query
    string with that value is appended.
    """
    # TODO OPTIMIZE ME
    # Initialize the asset_hosts list at module load time instead;
    # as written, popego.asset_hosts is re-parsed on every call.
    asset_hosts = aslist(config.get('popego.asset_hosts', None), ',', True) or ['']
    if config['pylons.g'].revision is not None:
        path += '?' + config['pylons.g'].revision
    return urljoin(asset_hosts[hash(path) % len(asset_hosts)].strip(), path)
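
# A sketch of the host-sharding idea used by asset_src: hashing the path
# picks a stable host per asset, spreading requests across hosts while
# keeping each asset at a single cacheable URL.  The host names here are
# hypothetical.  (Caveat: Python 3 randomizes str hashes per process
# unless PYTHONHASHSEED is fixed, so the choice is only stable within a
# single process there.)
hosts = ['http://assets1.example.com/', 'http://assets2.example.com/']
path = '/css/site.css'
host = hosts[hash(path) % len(hosts)]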
def format_directory(self):
    # XXX Unixism
    if self.url and self.url[-1] != '/':
        self.url = self.url + '/'
    fp = os.popen("ls -l -a %s/. 2>&1" % self.pathname, "r")
    lines = fp.readlines()
    fp.close()
    import io
    import re
    from urllib.parse import quote, urljoin

    def escape(s):
        if not s:
            return ""
        s = s.replace('&', '&amp;')  # Must be done first
        s = s.replace('<', '&lt;')
        s = s.replace('>', '&gt;')
        return s

    prog = re.compile(self.listing_pattern)
    data = self.listing_header % {
        'url': self.url,
        'pathname': escape(self.pathname),
    }
    for line in lines:
        if line[-1:] == '\n':
            line = line[:-1]
        m = prog.match(line)
        if m is None:
            data = data + escape(line) + '\n'
            continue
        mode, middle, name = m.group(1, 2, 3)
        rawname = name
        mode, middle, name = map(escape, (mode, middle, name))
        href = urljoin(self.url, quote(rawname))
        if len(mode) == 10 and mode[0] == 'd' or name[-1:] == '/':
            if name[-1:] != '/':
                name = name + '/'
            if href[-1:] != '/':
                href = href + '/'
        line = '%s%s<A HREF="%s">%s</A>\n' % (mode, middle, escape(href), name)
        data = data + line
    data = data + self.listing_trailer
    self.fp = io.StringIO(data)
    self.headers['content-type'] = 'text/html'
    self.headers['content-length'] = str(len(data))
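
# The href construction used in the listing above, on its own: the raw
# file name is percent-quoted, then resolved against the directory URL.
from urllib.parse import quote, urljoin
assert quote("my file.txt") == "my%20file.txt"
assert urljoin("ftp://host/pub/", quote("my file.txt")) == "ftp://host/pub/my%20file.txt"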
def restart(self, url):
    self.maxrestarts = self.maxrestarts - 1
    self.viewer = self.last_context.viewer
    self.app = self.last_context.app
    self.parser = None
    parts = urlparse(url)
    # It's possible that the URL sent in a 301 or 302 response is a
    # relative URL.  If there's no scheme or netloc in the parsed
    # tuple, try joining the URL with the previous URL and reparse it.
    if not (parts[0] and parts[1]):
        url = urllib.parse.urljoin(self.url, url)
        parts = urlparse(url)
    self.url = url
    self.fragment = parts[-1]
    parts = parts[:-1] + ("",)
    if self.user_passwd:
        netloc = parts[1]
        i = netloc.find('@')
        if i >= 0:
            netloc = netloc[i+1:]
        netloc = self.user_passwd + '@' + netloc
        parts = (parts[0], netloc) + parts[2:]
    realurl = urllib.parse.urlunparse(parts)
    # Check first to see if the previous Context has any protocol handlers.
    api = self.last_context.get_local_api(realurl, self.method, self.params)
    if not api:
        if self.app:
            api = self.app.open_url(realurl, self.method, self.params,
                                    self.reload, data=self.data)
        else:
            import protocols
            api = protocols.protocol_access(realurl, self.method,
                                            self.params, data=self.data)
    BaseReader.__init__(self, self.last_context, api)
def cas():
    '''
    This is a proxy to the CAS p3/serviceValidate endpoint which also
    sets the cookie on successful authentication.
    '''
    service = request.args.get('service')
    ticket = request.args.get('ticket')
    r = requests.get(urllib.parse.urljoin(current_app.config['CAS_URL'],
                                          'p3/serviceValidate'),
                     params=dict(service=service, ticket=ticket))
    if r.status_code == 200:
        xml = et.fromstring(r.content)
        success = xml.find('{http://www.yale.edu/tp/cas}authenticationSuccess')
        # Compare against None: Element truthiness reflects child count.
        if success is not None:
            username = success.find('{http://www.yale.edu/tp/cas}user').text
            _do_login(True, username, tool=request.args.get('tool'))
    return Response(response=r.content, status=r.status_code,
                    mimetype=r.headers['Content-Type'])
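
# A minimal sample of the CAS serviceValidate response the handler above
# parses, assuming et is xml.etree.ElementTree as used there; the user
# name 'alice' is illustrative.
import xml.etree.ElementTree as et
sample = (b'<cas:serviceResponse xmlns:cas="http://www.yale.edu/tp/cas">'
          b'<cas:authenticationSuccess><cas:user>alice</cas:user>'
          b'</cas:authenticationSuccess></cas:serviceResponse>')
xml = et.fromstring(sample)
success = xml.find('{http://www.yale.edu/tp/cas}authenticationSuccess')
assert success.find('{http://www.yale.edu/tp/cas}user').text == 'alice'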
def getlistingdata(self):
    if not self.lines:
        return ""
    lines, self.lines = self.lines[:-1], self.lines[-1:]
    data = ""
    prog = re.compile(self.listing_pattern)
    for line in lines:
        if self.debuglevel > 2:
            print("*getl*", repr(line))
        if line is None:
            data = data + self.listing_header % {'url': self.escape(self.url)}
            continue
        if line[-1:] == '\r':
            line = line[:-1]
        m = prog.match(line)
        if m is None:
            data = data + self.escape(line) + '\n'
            continue
        mode, middle, name, symlink = m.group(1, 2, 3, 5)
        rawname = name
        mode, middle, name = map(self.escape, (mode, middle, name))
        href = urljoin(self.url, quote(rawname))
        if len(mode) == 10 and mode[0] == 'd' or name[-1:] == '/':
            if name[-1:] != '/':
                name = name + '/'
            if href[-1:] != '/':
                href = href + '/'
        line = '%s%s<A HREF="%s">%s</A>%s\n' % (
            mode, middle, self.escape(href), name, symlink or '')
        data = data + line
    if self.lines == [None]:
        data = data + self.listing_trailer
        self.lines = []
    return data
def prepare_rss_template(self, entries):
    ns = cpy.config.get('/').copy()
    entry_structs = []
    for e in entries:
        #XXX: what exactly is the <guid> element?
        #XXX: what is the category tag? should keywords go here?
        es = EntryStruct()
        es.title = e.title
        # This callback gives any interested plugins the chance to change
        # the text of a story, as presented in a feed. It receives an
        # Entry object; any return value is ignored.
        run_callback(self.parent.plugins, "cb_feed_story", e)
        # Because <style> messed me up, I'm going to stop stripping
        # HTML out of my description. The RSS spec sucks.
        es.desc = e.text
        es.link = urljoin(config('base_url'), e.relpath + '.html')
        es.relpath = e.relpath
        es.time = time.strftime('%Y-%m-%dT%H:%M:%SZ', e.time_tuple)
        es.text = e.text
        entry_structs.append(es)
    ns['entries'] = entry_structs
    return ('rss', ns)
from urllib.parse import urljoin

from yaml import load, dump
from git import Repo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from pymongo import MongoClient

from noip.scripts.template import *

COLLECTION_NAME = 'luogu'
WEB_DRIVER_PATH = '/Users/yangdong/tools/chromedriver'

URL_JOIN = lambda path: urljoin('https://www.luogu.org/', path)
PROBLEMS_PAGE_URL = lambda page: "https://www.luogu.org/problemnew/lists?page=%s" % page


def init_db():
    client = MongoClient('localhost', 27017)
    client.spider.drop_collection(COLLECTION_NAME)
    client.spider.create_collection(COLLECTION_NAME)
    collection = client.spider[COLLECTION_NAME]
    collection.create_index("index", unique=True)
    return collection


def get_collection():
    client = MongoClient('localhost', 27017)
    return client.spider[COLLECTION_NAME]
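
# Usage of the URL helpers above:
assert URL_JOIN('/problemnew/show/P1001') == 'https://www.luogu.org/problemnew/show/P1001'
assert PROBLEMS_PAGE_URL(2) == 'https://www.luogu.org/problemnew/lists?page=2'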
def comment_form(self, story):
    url = urljoin(config('base_url'), '/Comments/add')
    return ('comment_form', {'url': url, 'story': story})
def receive(self, message):
    # If there are several recipients, only the first one is parsed.
    to = parseaddr(message.to)[1]
    to = to.split('@')[0] if to and '@' in to else 'xxx'
    if '__' in to:
        listto = to.split('__')
        username = listto[0] if listto[0] else 'admin'
        to = listto[1]
    else:
        username = '******'

    user = KeUser.all().filter('name = ', username).get()
    if not user:
        username = '******'
        user = KeUser.all().filter('name = ', username).get()
    if not user or not user.kindle_email:
        self.response.out.write('No account or no email configured!')
        return

    sender = parseaddr(message.sender)[1]
    mailhost = sender.split('@')[1] if sender and '@' in sender else None
    if (not sender or not mailhost) or \
       (not user.whitelist.filter('mail = ', '*').get()
            and not user.whitelist.filter('mail = ', sender.lower()).get()
            and not user.whitelist.filter('mail = ', '@' + mailhost.lower()).get()):
        self.response.out.write("Spam mail!")
        log.warn('Spam mail from : %s' % sender)
        return

    if hasattr(message, 'subject'):
        subject = decode_subject(message.subject).strip()
    else:
        subject = u"NoSubject"

    # If the subject ends with !links, force extraction of the links in
    # the mail and build an ebook from them.
    forceToLinks = False
    forceToArticle = False
    if subject.endswith('!links'):
        subject = subject.replace('!links', '').rstrip()
        forceToLinks = True
    elif subject.find(' !links ') >= 0:
        subject = subject.replace(' !links ', '')
        forceToLinks = True

    # If the subject ends with !article, force conversion of the mail
    # body into an ebook, ignoring any links in it.
    if not forceToLinks:
        if subject.endswith('!article'):
            subject = subject.replace('!article', '').rstrip()
            forceToArticle = True
        elif subject.find(' !article ') >= 0:
            subject = subject.replace(' !article ', '')
            forceToArticle = True

    # Trigger a "deliver now" run via mail.
    if to.lower() == 'trigger':
        return self.TrigDeliver(subject, username)

    # Fetch and decode the mail content.
    txt_bodies = message.bodies('text/plain')
    html_bodies = message.bodies('text/html')
    try:
        allBodies = [body.decode() for ctype, body in html_bodies]
    except:
        log.warn('Decode html bodies of mail failed.')
        allBodies = []

    # This mail is plain text only.
    if len(allBodies) == 0:
        log.info('no html body, use text body.')
        try:
            allBodies = [body.decode() for ctype, body in txt_bodies]
        except:
            log.warn('Decode text bodies of mail failed.')
            allBodies = []
        bodies = u''.join(allBodies)
        if not bodies:
            return
        bodyurls = []
        for l in bodies.split('\n'):
            l = l.strip()
            if not l:
                continue
            link = IsHyperLink(l)
            if link:
                bodyurls.append('<a href="%s">%s</a><br />' % (link, link))
            else:
                break
        bodies = u"""<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <title>%s</title></head><body>%s</body></html>""" % (
            subject, ''.join(bodyurls) if bodyurls else bodies)
        allBodies = [bodies.encode('utf-8')]

    # Start processing the mail content.
    soup = BeautifulSoup(allBodies[0], 'lxml')

    # Merge multiple mail body segments.
    if len(allBodies) > 1:
        for o in allBodies[1:]:
            so = BeautifulSoup(o, 'lxml')
            b = so.find('body')
            if not b:
                continue
            for c in b.contents:
                soup.body.append(c)

    # Decide whether the mail content is text or links (possibly several).
    links = []
    body = soup.body if soup.find('body') else soup
    if not forceToArticle:  # Skip link analysis when forced to article mode.
        for s in body.stripped_strings:
            link = IsHyperLink(s)
            if link:
                if link not in links:
                    links.append(link)
            # Multiple links must appear one per line with no blank lines
            # between them, unless link extraction is forced.  This drops
            # the ad link some mail clients append at the end of a mail.
            elif not forceToLinks:
                break

    if not links and not forceToArticle:
        # The visible text yields no links; fall back to the html <a> tags.
        links = [link['href']
                 for link in soup.find_all('a', attrs={'href': True})]

    text = ' '.join([s for s in body.stripped_strings])

    # If some links are relative, find one absolute link among them and
    # use it to resolve the others.
    hasRelativePath = False
    fullPath = ''
    for link in links:
        text = text.replace(link, '')
        if not link.startswith('http'):
            hasRelativePath = True
        if not fullPath and link.startswith('http'):
            fullPath = link

    if hasRelativePath and fullPath:
        for idx, link in enumerate(links):
            if not link.startswith('http'):
                # On Python 2, urljoin lives in the urlparse module
                # (urllib has no urljoin).
                links[idx] = urlparse.urljoin(fullPath, link)

    # If there is too much text, push the body content directly instead.
    if not forceToLinks and (len(links) != 1
                             or len(text) > WORDCNT_THRESHOLD_FOR_APMAIL):
        links = []

    if links:
        # Decide between downloading a file and forwarding the content.
        # 'link' here is the last entry left over from the loop above.
        isBook = bool(to.lower() in ('book', 'file', 'download'))
        if not isBook:
            isBook = bool(link[-5:].lower() in ('.mobi', '.epub', '.docx'))
        if not isBook:
            isBook = bool(link[-4:].lower() in ('.pdf', '.txt', '.doc', '.rtf'))
        isDebug = bool(to.lower() == 'debug')

        if isDebug:
            bookType = 'Debug'
        elif isBook:
            bookType = 'Download'
        else:
            bookType = user.book_type

        param = {'u': username,
                 'urls': base64.urlsafe_b64encode(zlib.compress('|'.join(links), 9)),
                 'type': bookType,
                 'to': user.kindle_email,
                 'tz': user.timezone,
                 'subject': subject[:SUBJECT_WORDCNT_FOR_APMAIL],
                 'lng': user.ownfeeds.language,
                 'keepimage': '1' if user.ownfeeds.keep_image else '0'}
        taskqueue.add(url='/url2book', queue_name="deliverqueue1",
                      method='GET', params=param, target='worker')
    else:
        # Forward the mail body directly.
        # First check whether there are image attachments.
        from lib.makeoeb import MimeFromFilename
        hasimage = False
        if hasattr(message, 'attachments'):
            for f, c in message.attachments:
                if MimeFromFilename(f):
                    hasimage = True
                    break

        # Fix up non-conforming HTML mail first.
        h = soup.find('head')
        if not h:
            h = soup.new_tag('head')
            soup.html.insert(0, h)
        t = soup.head.find('title')
        if not t:
            t = soup.new_tag('title')
            t.string = subject
            soup.head.insert(0, t)

        # With images, a MOBI or EPUB must be generated.  The Duokan
        # mailbox also doesn't support html push, so convert to epub
        # first in that case as well.
        if hasimage or (user.book_type == "epub"):
            from main import local_time
            from lib.makeoeb import (getOpts, CreateOeb, setMetaData,
                                     ServerContainer, byteStringIO,
                                     EPUBOutput, MOBIOutput)

            # Mimic Amazon's conversion servers and strip the CSS.
            if DELETE_CSS_FOR_APPSPOTMAIL:
                tag = soup.find('style', attrs={'type': 'text/css'})
                if tag:
                    tag.extract()
                for tag in soup.find_all(attrs={'style': True}):
                    del tag['style']

            # Normalize the file names in the img src attributes.
            for img in soup.find_all('img', attrs={'src': True}):
                if img['src'].lower().startswith('cid:'):
                    img['src'] = img['src'][4:]

            opts = getOpts()
            oeb = CreateOeb(log, None, opts)
            setMetaData(oeb, subject[:SUBJECT_WORDCNT_FOR_APMAIL],
                        user.ownfeeds.language, local_time(tz=user.timezone),
                        pubtype='book:book:KindleEar')
            oeb.container = ServerContainer(log)
            id_, href = oeb.manifest.generate(id='page', href='page.html')
            item = oeb.manifest.add(id_, href, 'application/xhtml+xml',
                                    data=unicode(soup))
            oeb.spine.add(item, False)
            oeb.toc.add(subject, href)

            if hasattr(message, 'attachments'):
                for filename, content in message.attachments:
                    mimetype = MimeFromFilename(filename)
                    if mimetype:
                        try:
                            content = content.decode()
                        except:
                            pass
                        else:
                            id_, href = oeb.manifest.generate(id='img',
                                                              href=filename)
                            item = oeb.manifest.add(id_, href, mimetype,
                                                    data=content)

            oIO = byteStringIO()
            o = EPUBOutput() if user.book_type == "epub" else MOBIOutput()
            o.convert(oeb, oIO, opts, log)
            BaseHandler.SendToKindle(username, user.kindle_email,
                                     subject[:SUBJECT_WORDCNT_FOR_APMAIL],
                                     user.book_type, str(oIO.getvalue()),
                                     user.timezone)
        else:
            # Without images, push the HTML file directly for a better
            # reading experience.
            m = soup.find('meta', attrs={"http-equiv": "Content-Type"})
            if not m:
                m = soup.new_tag('meta', content="text/html; charset=utf-8")
                m["http-equiv"] = "Content-Type"
                soup.html.head.insert(0, m)
            else:
                m['content'] = "text/html; charset=utf-8"
            html = unicode(soup).encode('utf-8')
            BaseHandler.SendToKindle(username, user.kindle_email,
                                     subject[:SUBJECT_WORDCNT_FOR_APMAIL],
                                     'html', html, user.timezone, False)
    self.response.out.write('Done')
def f(request):
    # Build the upstream URL from the app-wide base and the route pattern
    # filled in with the matched path parameters.
    url = urllib.parse.urljoin(request.app['api-url'],
                               pattern.format(**request.match_info))
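
# Sketch of how the upstream URL is assembled, with hypothetical values;
# pattern and request.match_info come from the surrounding route setup.
from urllib.parse import urljoin
pattern = '/users/{id}'
match_info = {'id': '42'}
assert urljoin('http://backend:8080/api/', pattern.format(**match_info)) == \
    'http://backend:8080/users/42'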