def init_index(self):
    self.log('init_index')
    self._files = []
    for baseurl in self.baseurls:
        if baseurl.startswith('//'):
            addr = get_server_addr()
            if addr is None:
                continue
            baseurl = 'http://%s/%s' % (addr, baseurl[2:])
        if baseurl.startswith('http://'):
            url = urljoin(baseurl, 'index.txt')
            self.log(' opening: %r...' % url)
            try:
                index = urlopen(url)
                if index.getcode() in (None, 200):
                    files = index.read()
                    for name in files.splitlines():
                        (name, _, _) = name.strip().partition('#')
                        if not name:
                            continue
                        url = urljoin(baseurl, name)
                        self.log(' loading: %r...' % url)
                        fp = urlopen(url)
                        if fp.getcode() in (None, 200):
                            data = fp.read()
                            self._files.append((name, data))
                        fp.close()
                index.close()
                if self._files:
                    break
            except IOError as e:
                self.log(' error: %s' % e)
                continue
        else:
            # Fall back to local files.
            path = os.path.join(baseurl, 'index.txt')
            self.log(' opening: %r...' % path)
            try:
                index = open(path)
                for name in index:
                    (name, _, _) = name.strip().partition('#')
                    if not name:
                        continue
                    path = os.path.join(baseurl, name)
                    self.log(' loading: %r...' % path)
                    fp = open(path, 'rb')
                    data = fp.read()
                    self._files.append((name, data))
                    fp.close()
                index.close()
                if self._files:
                    break
            except IOError as e:
                self.log(' error: %s' % e)
                continue
    self.mode = 'index'
    self._text = 'INDEX'
    self.refresh()
    self.playSound('sound_close')
    self._curfile = None
    return
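
# A minimal sketch of the index.txt parsing rule assumed by init_index
# above: anything after '#' on a line is a comment, and blank or
# comment-only lines are skipped.  The file names are illustrative.
sample_index = "# asset manifest\nsprites.png\nsounds.ogg\n"
names = []
for line in sample_index.splitlines():
    (name, _, _) = line.strip().partition('#')
    if name:
        names.append(name)
assert names == ['sprites.png', 'sounds.ogg']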
def prepare_atom_template(self, entries):
    ns = cpy.config.get('/').copy()
    entry_structs = []
    last_updated = ''
    for e in entries:
        es = EntryStruct()
        es.title = e.title
        # This callback gives any interested plugins the chance to change
        # the text of a story, as presented in a feed. It receives an
        # Entry object; any return value is ignored.
        run_callback(self.parent.plugins, "cb_feed_story", e)
        fulltext = escape(e.text)
        # If you only want short descriptions:
        # es.desc = escape(e.text[:255])
        # For full-text descriptions:
        es.desc = fulltext
        es.text = fulltext
        es.time = time.strftime('%Y-%m-%dT%H:%M:%SZ', e.time_tuple)
        if not last_updated:
            last_updated = es.time
        es.link = urljoin(config('base_url'), e.relpath + '.html')
        entry_structs.append(es)
    ns['last_updated'] = last_updated
    ns['entries'] = entry_structs
    return ('atom', ns)
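
# The timestamp format used for es.time above, shown on a known instant.
# Note the trailing 'Z' asserts UTC, so e.time_tuple should be a UTC time
# tuple (e.g. from time.gmtime) for the feed timestamp to be strictly correct.
import time
assert time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(0)) == '1970-01-01T00:00:00Z'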
def getSeasons(sname):
    url = urljoin(config.turbofilm_base, os.path.join("Series", sname))
    parser = GetNSeasons()
    page = getpage(url)["page"]
    parser.feed(page)
    slinks = parser.get_slinks()
    seasons = {}
    for s in slinks:
        # Lazy '.*?' so the group captures the whole trailing season
        # number; a greedy '.*' would leave only the last digit.
        k = int(re.match(r".*?([0-9]+)$", s).group(1))
        u = urljoin(config.turbofilm_base, s)
        pg = getpage(u)["page"]
        p = GetSeriesDescription()
        p.feed(pg)
        seasons[k] = p.get_names()
    return seasons
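
# Why getSeasons uses a lazy '.*?' before the digit group: with a greedy
# '.*' the group is left holding only the final digit of the season number.
import re
assert re.match(r".*([0-9]+)$", "/Season/12").group(1) == "2"
assert re.match(r".*?([0-9]+)$", "/Season/12").group(1) == "12"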
def find_links_in_html_with_same_hostname(url, html):
    """ Find all the links in ``html`` that share a hostname with ``url`` """
    if html is None:
        return set()
    url = urllib.parse.urlparse(url)
    links = url_regex.findall(html)
    link_set = set()
    for link in links:
        if link is None:
            continue
        try:
            link = str(link)
            if link.startswith("/"):
                # Absolute path on the same host.
                link_set.add('http://' + url.netloc + link)
            elif link.startswith("http://") or link.startswith("https://"):
                # Keep absolute URLs only if they point at the same host.
                if url.netloc in link:
                    link_set.add(link)
            elif link.startswith("#"):
                continue
            else:
                # Relative link: resolve against the page URL.
                link_set.add(urllib.parse.urljoin(url.geturl(), link))
        except Exception:
            pass
    return link_set
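
# Usage sketch for find_links_in_html_with_same_hostname.  url_regex is
# defined elsewhere in this project; the pattern below is a hypothetical
# stand-in that extracts href attribute values.
import re
url_regex = re.compile(r'href="([^"]+)"')
html = '<a href="/a">x</a> <a href="http://example.com/b">y</a>'
links = find_links_in_html_with_same_hostname("http://example.com/", html)
# Expected: {'http://example.com/a', 'http://example.com/b'}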
def path_join(self, p1, p2):
    if is_url(p1) or is_url(p2):
        if '/' not in p2 and '.' not in p2:
            # Assume it's a directory -- needed for package loading
            p2 = p2 + "/"
        return urllib.parse.urljoin(p1, p2)
    else:
        return RHooks.path_join(self, p1, p2)
def help_cmd(self, event=None):
    """Dispatch a browser on self.HELP_URL."""
    if not self.app.browsers:
        print("No browser left to display help.")
        return
    browser = self.helpbrowser
    if not browser or not browser.valid():
        import Browser
        browser = Browser.Browser(self.app.root, self.app)
        self.helpbrowser = browser
    helproot = self.app.prefs.Get('landmarks', 'grail-help-root')
    browser.context.load(urllib.parse.urljoin(helproot, self.HELP_URL))
    browser.root.tkraise()
def start_a(self, attrs):
    uri = attrs.get("href", "").strip()
    if uri:
        self.__node = bookmarks.nodes.Bookmark()
        self.__root.append_child(self.__node)
        if self.__baseurl:
            uri = urllib.parse.urljoin(self.__baseurl, uri)
        self.__node.set_uri(uri)
        # Collapse internal runs of whitespace in the title.
        title = " ".join(attrs.get("title", "").split())
        if title:
            self.__node.set_title(title)
    else:
        self.__node = None
    self.save_bgn()
def urlopen(self, method, url, redirect=True, **kw):
    """
    Same as :meth:`urllib3.connectionpool.HTTPConnectionPool.urlopen`
    with custom cross-host redirect logic and only sends the request-uri
    portion of the ``url``.

    The given ``url`` parameter must be absolute, such that an appropriate
    :class:`urllib3.connectionpool.ConnectionPool` can be chosen for it.
    """
    u = parse_url(url)
    conn = self.connection_from_host(u.host, port=u.port, scheme=u.scheme)

    kw['assert_same_host'] = False
    kw['redirect'] = False
    if 'headers' not in kw:
        kw['headers'] = self.headers

    if self.proxy is not None and u.scheme == "http":
        response = conn.urlopen(method, url, **kw)
    else:
        response = conn.urlopen(method, u.request_uri, **kw)

    redirect_location = redirect and response.get_redirect_location()
    if not redirect_location:
        return response

    # Support relative URLs for redirecting.
    redirect_location = urljoin(url, redirect_location)

    # RFC 7231, Section 6.4.4
    if response.status == 303:
        method = 'GET'

    retries = kw.get('retries')
    if not isinstance(retries, Retry):
        retries = Retry.from_int(retries, redirect=redirect)

    try:
        retries = retries.increment(method, url, response=response, _pool=conn)
    except MaxRetryError:
        if retries.raise_on_redirect:
            raise
        return response

    kw['retries'] = retries
    kw['redirect'] = redirect

    log.info("Redirecting %s -> %s", url, redirect_location)
    return self.urlopen(method, redirect_location, **kw)
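
# How the redirect above is resolved: RFC 7231 allows a relative Location
# header, so it is joined against the original request URL before
# re-dispatching.  Plain stdlib behavior:
from urllib.parse import urljoin
assert urljoin("http://example.com/a/b?q=1", "/login") == "http://example.com/login"
assert urljoin("http://example.com/a/b", "c") == "http://example.com/a/c"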
def asset_src(path):
    """
    Returns an absolute URL for the resource located at
    ``http://popego_asset_host/path``, where ``popego_asset_host`` is the
    value defined in popego.asset_host.

    If config['pylons.g'].revision (in the globals) is not None, a query
    string with that value is appended.
    """
    # TODO OPTIMIZE ME
    # Initialize the asset_hosts list at module load time instead;
    # as written, popego.asset_hosts is re-parsed on every call.
    asset_hosts = aslist(config.get('popego.asset_hosts', None), ',', True) or ['']
    if config['pylons.g'].revision is not None:
        path += '?' + config['pylons.g'].revision
    return urljoin(asset_hosts[hash(path) % len(asset_hosts)].strip(), path)
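
# A sketch of the host-sharding idea used by asset_src: hashing the path
# picks a stable host per asset, spreading requests across hosts while
# keeping each asset at a single cacheable URL.  The host names here are
# hypothetical.  (Caveat: Python 3 randomizes str hashes per process
# unless PYTHONHASHSEED is fixed, so the choice is only stable within a
# single process there.)
hosts = ['http://assets1.example.com/', 'http://assets2.example.com/']
path = '/css/site.css'
host = hosts[hash(path) % len(hosts)]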
def format_directory(self):
    # XXX Unixism
    if self.url and self.url[-1] != '/':
        self.url = self.url + '/'
    fp = os.popen("ls -l -a %s/. 2>&1" % self.pathname, "r")
    lines = fp.readlines()
    fp.close()
    import io
    import re
    from urllib.parse import quote, urljoin

    def escape(s):
        if not s:
            return ""
        s = s.replace('&', '&amp;')  # Must be done first
        s = s.replace('<', '&lt;')
        s = s.replace('>', '&gt;')
        return s

    prog = re.compile(self.listing_pattern)
    data = self.listing_header % {
        'url': self.url,
        'pathname': escape(self.pathname),
    }
    for line in lines:
        if line[-1:] == '\n':
            line = line[:-1]
        m = prog.match(line)
        if m is None:
            data = data + escape(line) + '\n'
            continue
        mode, middle, name = m.group(1, 2, 3)
        rawname = name
        mode, middle, name = map(escape, (mode, middle, name))
        href = urljoin(self.url, quote(rawname))
        if len(mode) == 10 and mode[0] == 'd' or name[-1:] == '/':
            if name[-1:] != '/':
                name = name + '/'
            if href[-1:] != '/':
                href = href + '/'
        line = '%s%s<A HREF="%s">%s</A>\n' % (mode, middle, escape(href), name)
        data = data + line
    data = data + self.listing_trailer
    self.fp = io.StringIO(data)
    self.headers['content-type'] = 'text/html'
    self.headers['content-length'] = str(len(data))
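
# The href construction used in the listing above, on its own: the raw
# file name is percent-quoted, then resolved against the directory URL.
from urllib.parse import quote, urljoin
assert quote("my file.txt") == "my%20file.txt"
assert urljoin("ftp://host/pub/", quote("my file.txt")) == "ftp://host/pub/my%20file.txt"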
def restart(self, url):
    self.maxrestarts = self.maxrestarts - 1
    self.viewer = self.last_context.viewer
    self.app = self.last_context.app
    self.parser = None
    parts = urlparse(url)
    # It's possible that the URL sent in a 301 or 302 response is a
    # relative URL.  If there's no scheme or netloc in the parsed
    # tuple, try joining the URL with the previous URL and reparse it.
    if not (parts[0] and parts[1]):
        url = urllib.parse.urljoin(self.url, url)
        parts = urlparse(url)
    self.url = url
    self.fragment = parts[-1]
    parts = parts[:-1] + ("",)
    if self.user_passwd:
        netloc = parts[1]
        i = netloc.find('@')
        if i >= 0:
            netloc = netloc[i+1:]
        netloc = self.user_passwd + '@' + netloc
        parts = (parts[0], netloc) + parts[2:]
    realurl = urllib.parse.urlunparse(parts)
    # Check first to see if the previous Context has any protocol handlers.
    api = self.last_context.get_local_api(realurl, self.method, self.params)
    if not api:
        if self.app:
            api = self.app.open_url(realurl, self.method, self.params,
                                    self.reload, data=self.data)
        else:
            import protocols
            api = protocols.protocol_access(realurl, self.method,
                                            self.params, data=self.data)
    BaseReader.__init__(self, self.last_context, api)
def cas():
    '''
    This is a proxy to the CAS p3/serviceValidate endpoint which also
    sets the cookie on successful authentication.
    '''
    service = request.args.get('service')
    ticket = request.args.get('ticket')
    r = requests.get(urllib.parse.urljoin(current_app.config['CAS_URL'],
                                          'p3/serviceValidate'),
                     params=dict(service=service, ticket=ticket))
    if r.status_code == 200:
        xml = et.fromstring(r.content)
        success = xml.find('{http://www.yale.edu/tp/cas}authenticationSuccess')
        # Compare against None: Element truthiness reflects child count.
        if success is not None:
            username = success.find('{http://www.yale.edu/tp/cas}user').text
            _do_login(True, username, tool=request.args.get('tool'))
    return Response(response=r.content, status=r.status_code,
                    mimetype=r.headers['Content-Type'])
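
# A minimal sample of the CAS serviceValidate response the handler above
# parses, assuming et is xml.etree.ElementTree as used there; the user
# name 'alice' is illustrative.
import xml.etree.ElementTree as et
sample = (b'<cas:serviceResponse xmlns:cas="http://www.yale.edu/tp/cas">'
          b'<cas:authenticationSuccess><cas:user>alice</cas:user>'
          b'</cas:authenticationSuccess></cas:serviceResponse>')
xml = et.fromstring(sample)
success = xml.find('{http://www.yale.edu/tp/cas}authenticationSuccess')
assert success.find('{http://www.yale.edu/tp/cas}user').text == 'alice'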
def getlistingdata(self):
    if not self.lines:
        return ""
    lines, self.lines = self.lines[:-1], self.lines[-1:]
    data = ""
    prog = re.compile(self.listing_pattern)
    for line in lines:
        if self.debuglevel > 2:
            print("*getl*", repr(line))
        if line is None:
            data = data + self.listing_header % {'url': self.escape(self.url)}
            continue
        if line[-1:] == '\r':
            line = line[:-1]
        m = prog.match(line)
        if m is None:
            data = data + self.escape(line) + '\n'
            continue
        mode, middle, name, symlink = m.group(1, 2, 3, 5)
        rawname = name
        mode, middle, name = map(self.escape, (mode, middle, name))
        href = urljoin(self.url, quote(rawname))
        if len(mode) == 10 and mode[0] == 'd' or name[-1:] == '/':
            if name[-1:] != '/':
                name = name + '/'
            if href[-1:] != '/':
                href = href + '/'
        line = '%s%s<A HREF="%s">%s</A>%s\n' % (
            mode, middle, self.escape(href), name, symlink or '')
        data = data + line
    if self.lines == [None]:
        data = data + self.listing_trailer
        self.lines = []
    return data
def prepare_rss_template(self, entries):
    ns = cpy.config.get('/').copy()
    entry_structs = []
    for e in entries:
        #XXX: what exactly is the <guid> element?
        #XXX: what is the category tag? should keywords go here?
        es = EntryStruct()
        es.title = e.title
        # This callback gives any interested plugins the chance to change
        # the text of a story, as presented in a feed. It receives an
        # Entry object; any return value is ignored.
        run_callback(self.parent.plugins, "cb_feed_story", e)
        # Because <style> messed me up, I'm going to stop stripping
        # HTML out of my description. The RSS spec sucks.
        es.desc = e.text
        es.link = urljoin(config('base_url'), e.relpath + '.html')
        es.relpath = e.relpath
        es.time = time.strftime('%Y-%m-%dT%H:%M:%SZ', e.time_tuple)
        es.text = e.text
        entry_structs.append(es)
    ns['entries'] = entry_structs
    return ('rss', ns)
from urllib.parse import urljoin

from yaml import load, dump
from git import Repo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from pymongo import MongoClient

from noip.scripts.template import *

COLLECTION_NAME = 'luogu'
WEB_DRIVER_PATH = '/Users/yangdong/tools/chromedriver'

URL_JOIN = lambda path: urljoin('https://www.luogu.org/', path)
PROBLEMS_PAGE_URL = lambda page: "https://www.luogu.org/problemnew/lists?page=%s" % page


def init_db():
    client = MongoClient('localhost', 27017)
    client.spider.drop_collection(COLLECTION_NAME)
    client.spider.create_collection(COLLECTION_NAME)
    collection = client.spider[COLLECTION_NAME]
    collection.create_index("index", unique=True)
    return collection


def get_collection():
    client = MongoClient('localhost', 27017)
    return client.spider[COLLECTION_NAME]
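
# Usage of the URL helpers above:
assert URL_JOIN('/problemnew/show/P1001') == 'https://www.luogu.org/problemnew/show/P1001'
assert PROBLEMS_PAGE_URL(2) == 'https://www.luogu.org/problemnew/lists?page=2'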
def comment_form(self, story):
    url = urljoin(config('base_url'), '/Comments/add')
    return ('comment_form', {'url': url, 'story': story})
def receive(self, message):
    # If there are several recipients, only the first one is parsed.
    to = parseaddr(message.to)[1]
    to = to.split('@')[0] if to and '@' in to else 'xxx'
    if '__' in to:
        listto = to.split('__')
        username = listto[0] if listto[0] else 'admin'
        to = listto[1]
    else:
        username = '******'

    user = KeUser.all().filter('name = ', username).get()
    if not user:
        username = '******'
        user = KeUser.all().filter('name = ', username).get()
    if not user or not user.kindle_email:
        self.response.out.write('No account or no email configured!')
        return

    sender = parseaddr(message.sender)[1]
    mailhost = sender.split('@')[1] if sender and '@' in sender else None
    if (not sender or not mailhost) or \
       (not user.whitelist.filter('mail = ', '*').get()
            and not user.whitelist.filter('mail = ', sender.lower()).get()
            and not user.whitelist.filter('mail = ', '@' + mailhost.lower()).get()):
        self.response.out.write("Spam mail!")
        log.warn('Spam mail from : %s' % sender)
        return

    if hasattr(message, 'subject'):
        subject = decode_subject(message.subject).strip()
    else:
        subject = u"NoSubject"

    # If the subject ends with !links, force extraction of the links in
    # the mail and build an ebook from them.
    forceToLinks = False
    forceToArticle = False
    if subject.endswith('!links'):
        subject = subject.replace('!links', '').rstrip()
        forceToLinks = True
    elif subject.find(' !links ') >= 0:
        subject = subject.replace(' !links ', '')
        forceToLinks = True

    # If the subject ends with !article, force conversion of the mail
    # body into an ebook, ignoring any links in it.
    if not forceToLinks:
        if subject.endswith('!article'):
            subject = subject.replace('!article', '').rstrip()
            forceToArticle = True
        elif subject.find(' !article ') >= 0:
            subject = subject.replace(' !article ', '')
            forceToArticle = True

    # Trigger a "deliver now" run via mail.
    if to.lower() == 'trigger':
        return self.TrigDeliver(subject, username)

    # Fetch and decode the mail content.
    txt_bodies = message.bodies('text/plain')
    html_bodies = message.bodies('text/html')
    try:
        allBodies = [body.decode() for ctype, body in html_bodies]
    except:
        log.warn('Decode html bodies of mail failed.')
        allBodies = []

    # This mail is plain text only.
    if len(allBodies) == 0:
        log.info('no html body, use text body.')
        try:
            allBodies = [body.decode() for ctype, body in txt_bodies]
        except:
            log.warn('Decode text bodies of mail failed.')
            allBodies = []
        bodies = u''.join(allBodies)
        if not bodies:
            return
        bodyurls = []
        for l in bodies.split('\n'):
            l = l.strip()
            if not l:
                continue
            link = IsHyperLink(l)
            if link:
                bodyurls.append('<a href="%s">%s</a><br />' % (link, link))
            else:
                break
        bodies = u"""<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <title>%s</title></head><body>%s</body></html>""" % (
            subject, ''.join(bodyurls) if bodyurls else bodies)
        allBodies = [bodies.encode('utf-8')]

    # Start processing the mail content.
    soup = BeautifulSoup(allBodies[0], 'lxml')

    # Merge multiple mail body segments.
    if len(allBodies) > 1:
        for o in allBodies[1:]:
            so = BeautifulSoup(o, 'lxml')
            b = so.find('body')
            if not b:
                continue
            for c in b.contents:
                soup.body.append(c)

    # Decide whether the mail content is text or links (possibly several).
    links = []
    body = soup.body if soup.find('body') else soup
    if not forceToArticle:  # Skip link analysis when forced to article mode.
        for s in body.stripped_strings:
            link = IsHyperLink(s)
            if link:
                if link not in links:
                    links.append(link)
            # Multiple links must appear one per line with no blank lines
            # between them, unless link extraction is forced.  This drops
            # the ad link some mail clients append at the end of a mail.
            elif not forceToLinks:
                break

    if not links and not forceToArticle:
        # The visible text yields no links; fall back to the html <a> tags.
        links = [link['href']
                 for link in soup.find_all('a', attrs={'href': True})]

    text = ' '.join([s for s in body.stripped_strings])

    # If some links are relative, find one absolute link among them and
    # use it to resolve the others.
    hasRelativePath = False
    fullPath = ''
    for link in links:
        text = text.replace(link, '')
        if not link.startswith('http'):
            hasRelativePath = True
        if not fullPath and link.startswith('http'):
            fullPath = link

    if hasRelativePath and fullPath:
        for idx, link in enumerate(links):
            if not link.startswith('http'):
                # On Python 2, urljoin lives in the urlparse module
                # (urllib has no urljoin).
                links[idx] = urlparse.urljoin(fullPath, link)

    # If there is too much text, push the body content directly instead.
    if not forceToLinks and (len(links) != 1
                             or len(text) > WORDCNT_THRESHOLD_FOR_APMAIL):
        links = []

    if links:
        # Decide between downloading a file and forwarding the content.
        # 'link' here is the last entry left over from the loop above.
        isBook = bool(to.lower() in ('book', 'file', 'download'))
        if not isBook:
            isBook = bool(link[-5:].lower() in ('.mobi', '.epub', '.docx'))
        if not isBook:
            isBook = bool(link[-4:].lower() in ('.pdf', '.txt', '.doc', '.rtf'))
        isDebug = bool(to.lower() == 'debug')

        if isDebug:
            bookType = 'Debug'
        elif isBook:
            bookType = 'Download'
        else:
            bookType = user.book_type

        param = {'u': username,
                 'urls': base64.urlsafe_b64encode(zlib.compress('|'.join(links), 9)),
                 'type': bookType,
                 'to': user.kindle_email,
                 'tz': user.timezone,
                 'subject': subject[:SUBJECT_WORDCNT_FOR_APMAIL],
                 'lng': user.ownfeeds.language,
                 'keepimage': '1' if user.ownfeeds.keep_image else '0'}
        taskqueue.add(url='/url2book', queue_name="deliverqueue1",
                      method='GET', params=param, target='worker')
    else:
        # Forward the mail body directly.
        # First check whether there are image attachments.
        from lib.makeoeb import MimeFromFilename
        hasimage = False
        if hasattr(message, 'attachments'):
            for f, c in message.attachments:
                if MimeFromFilename(f):
                    hasimage = True
                    break

        # Fix up non-conforming HTML mail first.
        h = soup.find('head')
        if not h:
            h = soup.new_tag('head')
            soup.html.insert(0, h)
        t = soup.head.find('title')
        if not t:
            t = soup.new_tag('title')
            t.string = subject
            soup.head.insert(0, t)

        # With images, a MOBI or EPUB must be generated.  The Duokan
        # mailbox also doesn't support html push, so convert to epub
        # first in that case as well.
        if hasimage or (user.book_type == "epub"):
            from main import local_time
            from lib.makeoeb import (getOpts, CreateOeb, setMetaData,
                                     ServerContainer, byteStringIO,
                                     EPUBOutput, MOBIOutput)

            # Mimic Amazon's conversion servers and strip the CSS.
            if DELETE_CSS_FOR_APPSPOTMAIL:
                tag = soup.find('style', attrs={'type': 'text/css'})
                if tag:
                    tag.extract()
                for tag in soup.find_all(attrs={'style': True}):
                    del tag['style']

            # Normalize the file names in the img src attributes.
            for img in soup.find_all('img', attrs={'src': True}):
                if img['src'].lower().startswith('cid:'):
                    img['src'] = img['src'][4:]

            opts = getOpts()
            oeb = CreateOeb(log, None, opts)
            setMetaData(oeb, subject[:SUBJECT_WORDCNT_FOR_APMAIL],
                        user.ownfeeds.language, local_time(tz=user.timezone),
                        pubtype='book:book:KindleEar')
            oeb.container = ServerContainer(log)
            id_, href = oeb.manifest.generate(id='page', href='page.html')
            item = oeb.manifest.add(id_, href, 'application/xhtml+xml',
                                    data=unicode(soup))
            oeb.spine.add(item, False)
            oeb.toc.add(subject, href)

            if hasattr(message, 'attachments'):
                for filename, content in message.attachments:
                    mimetype = MimeFromFilename(filename)
                    if mimetype:
                        try:
                            content = content.decode()
                        except:
                            pass
                        else:
                            id_, href = oeb.manifest.generate(id='img',
                                                              href=filename)
                            item = oeb.manifest.add(id_, href, mimetype,
                                                    data=content)

            oIO = byteStringIO()
            o = EPUBOutput() if user.book_type == "epub" else MOBIOutput()
            o.convert(oeb, oIO, opts, log)
            BaseHandler.SendToKindle(username, user.kindle_email,
                                     subject[:SUBJECT_WORDCNT_FOR_APMAIL],
                                     user.book_type, str(oIO.getvalue()),
                                     user.timezone)
        else:
            # Without images, push the HTML file directly for a better
            # reading experience.
            m = soup.find('meta', attrs={"http-equiv": "Content-Type"})
            if not m:
                m = soup.new_tag('meta', content="text/html; charset=utf-8")
                m["http-equiv"] = "Content-Type"
                soup.html.head.insert(0, m)
            else:
                m['content'] = "text/html; charset=utf-8"
            html = unicode(soup).encode('utf-8')
            BaseHandler.SendToKindle(username, user.kindle_email,
                                     subject[:SUBJECT_WORDCNT_FOR_APMAIL],
                                     'html', html, user.timezone, False)
    self.response.out.write('Done')
def f(request):
    # Build the upstream URL from the app-wide base and the route pattern
    # filled in with the matched path parameters.
    url = urllib.parse.urljoin(request.app['api-url'],
                               pattern.format(**request.match_info))
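
# Sketch of how the upstream URL is assembled, with hypothetical values;
# pattern and request.match_info come from the surrounding route setup.
from urllib.parse import urljoin
pattern = '/users/{id}'
match_info = {'id': '42'}
assert urljoin('http://backend:8080/api/', pattern.format(**match_info)) == \
    'http://backend:8080/users/42'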