class cookie:
    """Read-only access to the cookies sent with the current CGI request.

    The raw header is taken from the ``HTTP_COOKIE`` environment variable
    and parsed into a ``SimpleCookie`` when the object is created.
    """

    def __init__( self ):
        self.cookieObj = SimpleCookie()
        self.load()

    def load( self ):
        """Parse ``os.environ["HTTP_COOKIE"]`` into ``self.cookieObj``.

        Does nothing when the variable is not set.
        """
        if "HTTP_COOKIE" not in os.environ:
            # No cookie present
            return

        self.cookieObj.load( os.environ["HTTP_COOKIE"] )

    def readCookie( self, CookieName ):
        """Return the value of the cookie ``CookieName``.

        Returns ``False`` when there are no cookies at all or when the
        requested cookie is missing.
        """
        # ``not`` covers an empty jar as well as the ``False`` sentinel the
        # former ``== False`` comparison was written for.
        if not self.cookieObj:
            # There is no cookie
            return False

        if CookieName in self.cookieObj:
            return self.cookieObj[CookieName].value
        return False

    def debug( self ):
        """Print the parsed cookies (or the whole environment) as HTML."""
        # Single-argument print(...) calls produce the same output as the
        # old print statements and are also valid on Python 3.
        print("Cookie-Debug:")
        print("<hr><pre>")
        if "HTTP_COOKIE" not in os.environ:
            print("There is no HTTP_COOKIE in os.environ:\n")
            for k, v in os.environ.items():
                print("%s %s" % (k, v))
        else:
            print(self.cookieObj)
        print("</pre><hr>")
Esempio n. 2
0
def cookie_parts(name, kaka):
    """Split the value of cookie ``name`` on ``|``.

    :param name: Name of the cookie to look up
    :param kaka: Raw cookie header string
    :return: List of the ``|``-separated parts, or None if the cookie
        is not present
    """
    found = SimpleCookie(kaka).get(name)
    if not found:
        return None
    return found.value.split("|")
Esempio n. 3
0
def make_cookie(name, load, seed, expire=0, domain="",  path="",
                timestamp=""):
    """
    Build a signed cookie and return it as a header tuple.

    The cookie value has the form ``load|timestamp|signature`` where the
    signature comes from ``cookie_signature(seed, load, timestamp)``.

    :param name: Cookie name
    :param load: Cookie load
    :param seed: A seed for the HMAC function
    :param expire: Number of minutes before this cookie goes stale
    :param domain: The domain of the cookie
    :param path: The path specification for the cookie
    :param timestamp: Timestamp string to embed; when empty one is
        computed from ``time.mktime(time.gmtime())``
    :return: A tuple to be added to headers
    """
    jar = SimpleCookie()
    stamp = timestamp or str(int(time.mktime(time.gmtime())))
    signed_value = "|".join((load, stamp, cookie_signature(seed, load, stamp)))
    jar[name] = signed_value

    morsel = jar[name]
    # Only attributes that were actually given are set on the cookie.
    for attribute, setting in (("path", path), ("domain", domain)):
        if setting:
            morsel[attribute] = setting
    if expire:
        morsel["expires"] = _expiration(expire, "%a, %d-%b-%Y %H:%M:%S GMT")

    rendered = jar.output()
    return tuple(rendered.split(": ", 1))
def getSessionId(request_cookie):
    """Return the session id carried by the ``id`` cookie.

    When the cookie header cannot be parsed, has no ``id`` cookie, or the
    value is not an integer, a new session is registered through
    ``sessions.AddNewSession`` and its result is returned instead
    (presumably the new session id -- confirm against ``sessions``).

    :param request_cookie: Raw ``Cookie`` header value; may be empty/None
    :return: The session id
    """
    try:
        cookie = SimpleCookie()
        cookie.load(request_cookie or "")
        sessionId = int(cookie['id'].value)
    # A tuple is required here: ``except CookieError, ValueError`` only
    # caught CookieError and bound the instance to the name ValueError.
    except (CookieError, KeyError, ValueError):
        sessionId = sessions.AddNewSession({'num' : 0, 'auth' : False})
    return sessionId
Esempio n. 5
0
def parse_cookie(name, seed, kaka):
    """Extract a signed cookie and check its signature.

    The cookie value is expected to be ``payload|timestamp|signature``.

    :param name: Name of the cookie to look up
    :param seed: A seed used for the HMAC signature
    :param kaka: The cookie
    :return: A tuple consisting of (payload, timestamp), or None when no
        cookie was given, the named cookie is absent, or the value does
        not have exactly three parts
    :raises Exception: if the signature does not match
    """
    if not kaka:
        return None

    found = SimpleCookie(kaka).get(name)
    if not found:
        return None

    fields = found.value.split("|")
    if len(fields) != 3:
        return None

    payload, timestamp, claimed_sig = fields
    # verify the cookie signature
    if cookie_signature(seed, payload, timestamp) != claimed_sig:
        raise Exception("Invalid cookie signature")

    return payload.strip(), timestamp
Esempio n. 6
0
def set_cookie(name, _, *args):
    """Build a cookie whose value is the base64 of ``args`` joined by ':'.

    The second positional parameter is ignored.  The cookie gets path "/"
    and the expiry produced by ``_expiration(5)``.

    :return: The rendered cookie header split into a (name, value) tuple
    """
    jar = SimpleCookie()
    jar[name] = base64.b64encode(":".join(args))
    morsel = jar[name]
    morsel['path'] = "/"
    morsel["expires"] = _expiration(5)  # 5 minutes from now
    logger.debug("Cookie expires: %s", morsel["expires"])
    rendered = jar.output()
    return tuple(rendered.split(": ", 1))
Esempio n. 7
0
    def __read_cookie(self):
        """Reads the HTTP Cookie and loads the sid and data from it (if any).

        Every cookie whose key is accepted by ``is_gaesessions_key`` takes
        part: the keys are sorted and the values concatenated.  The joined
        string is laid out as ``signature + sid + base64(payload)``, where the
        first two fields have the fixed lengths ``SIG_LEN`` and ``SID_LEN``.

        When the signature matches, the sid is adopted and a non-empty payload
        is decoded into ``self.data``; an expired session is terminated
        instead.  A signature mismatch is only logged.  A missing or malformed
        cookie ends in ``self.terminate(False)``.
        """
        try:
            # check the cookie to see if a session has been started
            cookie = SimpleCookie(os.environ['HTTP_COOKIE'])
            # NOTE(review): relies on filter() returning a list (Python 2),
            # since the result is sorted in place below.
            self.cookie_keys = filter(is_gaesessions_key, cookie.keys())
            if not self.cookie_keys:
                return  # no session yet
            self.cookie_keys.sort()
            # Re-assemble the value that was spread over several cookies.
            data = ''.join(cookie[k].value for k in self.cookie_keys)
            i = SIG_LEN + SID_LEN
            sig, sid, b64pdump = data[:SIG_LEN], data[SIG_LEN:i], data[i:]
            pdump = b64decode(b64pdump)
            # Recompute the signature over sid and payload and compare it
            # with the one that arrived in the cookie.
            actual_sig = Session.__compute_hmac(self.base_key, sid, pdump)
            if sig == actual_sig:
                self.__set_sid(sid, False)
                # check for expiration and terminate the session if it has expired
                if self.get_expiration() != 0 and time.time() > self.get_expiration():
                    return self.terminate()

                if pdump:
                    self.data = self.__decode_data(pdump)
                else:
                    self.data = None  # data is in memcache/db: load it on-demand
            else:
                logging.warn('cookie with invalid sig received from %s: %s' % (os.environ.get('REMOTE_ADDR'), b64pdump))
        except (CookieError, KeyError, IndexError, TypeError):
            # there is no cookie (i.e., no session) or the cookie is invalid
            self.terminate(False)
Esempio n. 8
0
def parse_cookie(name, seed, kaka):
    """Parse a ``payload|timestamp|signature`` cookie and verify it.

    Returns ``(payload, timestamp)`` on success and None when ``kaka`` is
    empty, the cookie ``name`` is absent or the value does not consist of
    exactly three parts.  Raises ``Exception`` on a signature mismatch.
    """
    if not kaka:
        return None

    jar = SimpleCookie(kaka)
    entry = jar.get(name)
    if not entry:
        return None

    pieces = entry.value.split("|")
    if len(pieces) != 3:
        return None

    # verify the cookie signature
    expected = cookie_signature(seed, pieces[0], pieces[1])
    if expected != pieces[2]:
        raise Exception("Invalid cookie signature")

    payload = pieces[0].strip()
    return payload, pieces[1]
Esempio n. 9
0
    def __login(self, username, password):
        """
        Log in to douban and remember the session token.

        Posts the credentials to ``/accounts/login`` on www.douban.com with
        the ``bid`` cookie attached.  When the response sets a non-empty
        ``dbcl2`` cookie its value is stored in ``self.dbcl2`` and the part
        before the first ':' in ``self.uid``.

        Raises DoubanLoginException if the response sets no ``dbcl2`` cookie.
        """
        form_fields = {'source':'simple',
                'form_email':username, 'form_password':password}
        data = urllib.urlencode(form_fields)

        self.__get_bid()
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Cookie": "bid=%s" % self.bid,
        }

        with contextlib.closing(httplib.HTTPSConnection("www.douban.com")) as conn:
            conn.request("POST", "/accounts/login", data, headers)
            response = conn.getresponse()
            received = SimpleCookie(response.getheader('Set-Cookie'))

            if 'dbcl2' not in received:
                raise DoubanLoginException()

            token = received['dbcl2'].value
            if token:
                self.dbcl2 = token
                self.uid = self.dbcl2.split(':')[0]
Esempio n. 10
0
def set_cookie(name, _, value):
    """Build a cookie ``name=value`` with path "/" and a short expiry.

    The second positional parameter is ignored.  The expiry is whatever
    ``_expiration(5)`` produces.

    :return: The rendered cookie header split into a (name, value) tuple
    """
    jar = SimpleCookie()
    jar[name] = value
    morsel = jar[name]
    morsel['path'] = "/"
    morsel["expires"] = _expiration(5)  # 5 minutes from now
    expiry_message = "Cookie expires: %s" % morsel["expires"]
    logger.debug(expiry_message)
    rendered = jar.output()
    return tuple(rendered.split(": ", 1))
Esempio n. 11
0
def cookie(name, sid, seed, expire=0, domain="",  path=""):
    """
    Build a signed session cookie and return it as a header tuple.

    The cookie value has the form ``sid|timestamp|signature`` where the
    signature comes from ``cookie_signature(seed, sid, timestamp)``.

    :param name: Cookie name
    :param sid: Session identifier
    :param seed: A seed for the HMAC function
    :param expire: Number of minutes before this cookie goes stale
    :param domain: The domain of the cookie
    :param path: The path specification for the cookie
    :return: A tuple to be added to headers
    """
    jar = SimpleCookie()
    stamp = str(int(time.mktime(time.gmtime())))
    jar[name] = "|".join((sid, stamp, cookie_signature(seed, sid, stamp)))

    morsel = jar[name]
    # Only attributes that were actually given are set on the cookie.
    for attribute, setting in (("path", path), ("domain", domain)):
        if setting:
            morsel[attribute] = setting
    if expire:
        morsel["expires"] = _expiration(expire, "%a, %d-%b-%Y %H:%M:%S GMT")

    rendered = jar.output()
    return tuple(rendered.split(": ", 1))
Esempio n. 12
0
 def COOKIES(self):
     """Return the request cookies as a plain ``{name: value}`` dict.

     The ``HTTP_COOKIE`` header is parsed on first use; the result is
     cached in ``self.environ['brick.cookies']`` and reused afterwards.
     """
     if 'brick.cookies' not in self.environ:
         raw_dict = SimpleCookie(self.environ.get('HTTP_COOKIE',''))
         # values() instead of the Python-2-only itervalues(): same
         # result, and it also works on Python 3.
         self.environ['brick.cookies'] = dict(
             (cookie.key, cookie.value) for cookie in raw_dict.values())
     return self.environ['brick.cookies']
Esempio n. 13
0
    def getSession(self):
        """Return the session for this request, creating one if needed.

        The session id is read from the cookie named
        ``self.SESSION_COOKIE_NAME``; without one, a random 16-character
        hex id is generated.  Sessions live in ``self.server.sessions`` and
        the chosen one is cached on ``self.session``.
        """
        if self.session is not None:
            return self.session

        # Look for a session id in the Cookie header that was sent
        sid = None
        cookie_str = self.headers.get('Cookie')
        if cookie_str:
            sid_morsel = SimpleCookie(cookie_str).get(
                self.SESSION_COOKIE_NAME, None)
            if sid_morsel is not None:
                sid = sid_morsel.value

        # No id sent: invent one; otherwise look up the stored session
        if sid is None:
            sid = randomString(16, '0123456789abcdef')
            session = None
        else:
            session = self.server.sessions.get(sid)

        # Unknown id: start an empty session under it
        if session is None:
            session = {}
            self.server.sessions[sid] = session

        session['id'] = sid
        self.session = session
        return session
Esempio n. 14
0
 def __init__(self, hnd, name = session.COOKIE_NAME, timeout = 0):
     """Attach a datastore-backed session to the handler ``hnd``.

     If the request already carries a cookie called ``name`` its value is
     used as the session id and the matching ``SessionStore`` row (if any)
     is loaded; otherwise a ``Set-Cookie`` header for ``self._id`` is added
     to the response.  A store is created whenever none was found.

     ``timeout`` is in seconds: a falsy value means "use the configured
     ``session_timeout`` (default one hour)", ``-1`` selects a very long
     lifetime.
     """
     super(DatastoreSession, self).__init__(hnd, name, timeout)
     
     # NOTE(review): presumably purges stale session rows -- confirm
     # against SessionStore.clear().
     SessionStore.clear()
     
     # check from cookie
     if not timeout:
         config = Config()
         timeout = config.get('session_timeout', 60*60)
     elif timeout == -1:
         # NOTE(review): 356 looks like a typo for 365 (about 50 years
         # in seconds) -- confirm before changing.
         timeout = 356*24*60*60*50
     if name in hnd.request.cookies:
         self._id = hnd.request.cookies[name]
         res = SessionStore.gql("WHERE id = :1", self._id).get()
         if res:
             self._store = res
             session_data = self._store.value
             if session_data:
                 # NOTE(review): unpickles data read back from the
                 # datastore; make sure that value can never be
                 # attacker-controlled.
                 self.update(pickle.loads(session_data))
         else:
             self._create_store(self._id)
     else:   # not in the cookie, set it
         # self._id is not assigned in this method on this path;
         # presumably the base class __init__ provides it -- confirm.
         c = SimpleCookie()
         c[name] = self._id
         c[name]['path'] = '/'
         c[name]['expires'] = rfc822.formatdate(time()+timeout)
         # add_header() supplies the header name itself, so strip it
         # from the rendered cookie.
         cs = c.output().replace('Set-Cookie: ', '')
         hnd.response.headers.add_header('Set-Cookie', cs)
         self._create_store(self._id)
def username(cookie, name=None):
    """ try to extract username from PAS cookie """
    if cookie is not None:
        cookies = SimpleCookie()
        try:
            cookies.load(cookie)
        except CookieError:
            return name

        if cookie_name in cookies:
            # Deal with doubly quoted cookies
            ac_cookie = repeatedly_unquote(cookies[cookie_name].value)

            try:
                ac = decodestring(ac_cookie + '=====')
            except (TypeError, binascii.Error):
                return name

            # plone.session 3.x (Plone 4.x)
            if '!' in ac[40:]:
                name, user_data = ac[40:].split('!', 1)
            # plone.session 2.x (Plone 3.x)
            elif ' ' in ac[20:21]:
                name = ac[21:]
            # PluggableAuthService.CookieAuthHelper
            elif ':' in ac:
                user, pwd = ac.split(':', 1)
                # PluggableAuthService >= 1.5
                try:
                    name = user.decode('hex')
                # PluggableAuthService < 1.5
                except TypeError:
                    name = user
    return name
Esempio n. 16
0
 def _parse_cookie(self):
     """Copy the cookies from ``HTTP_COOKIE`` into ``self.cookie``.

     Does nothing when the header is missing or empty.
     """
     header = self.environ.get('HTTP_COOKIE', '')
     if header:
         for morsel in SimpleCookie(header).values():
             self.cookie[morsel.key] = morsel.value
Esempio n. 17
0
class CookieHandler(object):
    """Mixin that carries cookies over between consecutive requests."""

    def __init__(self, *args, **kw):
        # Somewhere to store cookies between consecutive requests
        self.cookies = SimpleCookie()
        super(CookieHandler, self).__init__(*args, **kw)


    def httpCookie(self, path):
         """Return self.cookies as an HTTP_COOKIE environment value.

         Only cookies whose stored path is a prefix of ``path`` are included.
         """
         l = [m.OutputString() for m in self.cookies.values()
              if path.startswith(m['path'])]
         return '; '.join(l)

    def loadCookies(self, envstring):
        """Merge the cookies found in ``envstring`` into ``self.cookies``."""
        self.cookies.load(envstring)

    def saveCookies(self, response):
        """Save cookies from the response."""

        def native(text):
            # SimpleCookie wants native strings: encode Python 2 unicode,
            # decode Python 3 bytes, and leave native str untouched.
            if isinstance(text, str):
                return text
            if isinstance(text, bytes):
                return text.decode('utf8')
            return text.encode('utf8')

        # Urgh - need to play with the response's privates to extract
        # cookies that have been set
        for k, v in response._cookies.items():
            k = native(k)
            self.cookies[k] = native(v['value'])
            # The path has to be looked up on the response's cookie data.
            # The old check asked the freshly created morsel for a 'Path'
            # key, which never matches, so the path was never copied.
            for attr, attr_value in v.items():
                if attr.lower() == 'path':
                    self.cookies[k]['path'] = attr_value
Esempio n. 18
0
class CookieScraper(object):
	"Scraper that keeps track of getting and setting cookies."
	def __init__(self):
		self._cookies = SimpleCookie()

	def get_page(self, url, post_data=None, headers=()):
		"""
		Helper method that gets the given URL, handling the sending and storing
		of cookies. Returns the requested page as a string.

		``headers`` is an iterable of (name, value) pairs added to the
		request. A 302 response is followed after storing any cookies it
		sets; the extra headers and POST data are not re-sent on the
		redirect. Other IOErrors propagate to the caller.
		"""
		# socket.timeout is the exception class: calling it only built and
		# discarded an exception object. setdefaulttimeout() is what really
		# applies the 300 second limit (process-wide, to new sockets).
		socket.setdefaulttimeout(300)
		opener = urllib.URLopener()
		cookie_header = self._cookies.output(attrs=[], header='', sep=';')
		opener.addheader('Cookie', cookie_header.strip())
		for k, v in headers:
			opener.addheader(k, v)
		try:
			f = opener.open(url, post_data)
		except IOError as e:
			# URLopener reports HTTP errors as
			# IOError('http error', code, message, headers); anything
			# shorter is a different I/O failure and is re-raised as is.
			if len(e.args) >= 4 and e.args[1] == 302:
				# Got a 302 redirect, but check for cookies before redirecting.
				# e.args[3] is a httplib.HTTPMessage instance.
				redirect_headers = e.args[3]
				if 'set-cookie' in redirect_headers.dict:
					self._cookies.load(redirect_headers.dict['set-cookie'])
				return self.get_page(redirect_headers.getheader('location'))
			raise
		if 'set-cookie' in f.headers.dict:
			self._cookies.load(f.headers.dict['set-cookie'])
		return f.read()
Esempio n. 19
0
def get_cookie_dict(environ):
    """Return a *plain* dictionary of cookies as found in the request.

    Unlike ``get_cookies`` this returns a dictionary, not a
    ``SimpleCookie`` object.  For incoming cookies a dictionary fully
    represents the information.  Like ``get_cookies`` this caches and
    checks the cache.

    The cache lives in ``environ['paste.cookies.dict']`` as a
    ``(dict, header)`` pair and is only reused while the ``HTTP_COOKIE``
    header is unchanged.  A request without cookies yields ``{}``.
    """
    header = environ.get('HTTP_COOKIE')
    if not header:
        return {}
    # ``in`` instead of has_key(), which no longer exists on Python 3.
    if 'paste.cookies.dict' in environ:
        cookies, check_header = environ['paste.cookies.dict']
        if check_header == header:
            return cookies
    cookies = SimpleCookie()
    try:
        cookies.load(header)
    except CookieError:
        # Deliberately best effort: a malformed header must not break the
        # request; whatever was parsed before the error is still used.
        pass
    result = {}
    for name in cookies:
        result[name] = cookies[name].value
    environ['paste.cookies.dict'] = (result, header)
    return result
Esempio n. 20
0
    def __init__(self, environ, backend, ttl, cookie_name, fp_use_ip, log):
        """Set up the session state for one request.

        :param environ: WSGI environment of the request
        :param backend: Storage handler, kept as ``self.handler``
        :param ttl: Session lifetime, stored as given
        :param cookie_name: Name of the cookie holding the session id
        :param fp_use_ip: Whether ``REMOTE_ADDR`` is part of the fingerprint
        :param log: Logger, stored as given

        ``self.fingerprint`` is the SHA-1 hex digest of the user agent,
        accept-encoding and accept-language headers (plus the remote
        address if requested).  ``self.sid`` is taken from the session
        cookie when the request carries a non-empty one.
        """
        self.handler = backend
        self.ttl = ttl
        self.cookie_name = cookie_name
        self.sid = None
        self.data = {}
        self.log = log
        self.clear_cookie = False
        self.session_start = False

        fingerprint = '%s%s%s' % (environ.get('HTTP_USER_AGENT'),
                                  environ.get('HTTP_ACCEPT_ENCODING'),
                                  environ.get('HTTP_ACCEPT_LANGUAGE'))

        if fp_use_ip:
            # Format like the fields above so that a missing REMOTE_ADDR
            # contributes 'None' instead of raising TypeError on str + None.
            fingerprint += '%s' % (environ.get('REMOTE_ADDR'),)

        # hashlib needs bytes; a Python 2 str already is bytes and is
        # passed through unchanged.
        if not isinstance(fingerprint, bytes):
            fingerprint = fingerprint.encode('utf-8')
        self.fingerprint = hashlib.sha1(fingerprint).hexdigest()

        if 'HTTP_COOKIE' in environ:
            cookie = SimpleCookie(environ['HTTP_COOKIE'])

            if cookie.get(self.cookie_name):
                cookie_sid = cookie[self.cookie_name].value

                if cookie_sid:
                    self.sid = cookie_sid
Esempio n. 21
0
    def status(self, environ, start_response):
        """WSGI handler that reports who the ``name1`` cookie belongs to.

        The cookie value is used as a key into ``usernames``.  Requests
        without that cookie -- including requests with no Cookie header at
        all -- get the "not logged in" page.
        """
        name1 = ''
        name1_key = '*empty*'

        # Start from the anonymous page so that ``data`` is always bound;
        # it used to be undefined when no HTTP_COOKIE header was sent.
        data = """
<html>
<body>

You're not Logged in.....<p>
<a href='/'>Home</a>
</body>
</html>
"""

        c = SimpleCookie(environ.get('HTTP_COOKIE', ''))
        if 'name1' in c:
            key = c.get('name1').value
            name1 = usernames.get(key, '')
            name1_key = key
            # NOTE(review): the cookie value is echoed into the page
            # unescaped -- consider HTML-escaping it.
            data = """
<html>
<body>

Your username is """
            data += name1
            data += " and your key is "
            data += name1_key
            data += "<p><a href='/'>Home</a></body></html>"

        start_response('200 OK', list(html_headers))

        return [data]
Esempio n. 22
0
 def get_cookie(self, name):
     """Return the value of the request cookie ``name``.

     Returns None when the request has no ``cookie`` header or the header
     does not contain that cookie.
     """
     cookie_str = self.request.headers.get('cookie')
     if not cookie_str:
         return None
     cookie = SimpleCookie()
     cookie.load(cookie_str)
     # A missing cookie is reported the same way as a missing header,
     # instead of leaking a KeyError to the caller.
     morsel = cookie.get(name)
     if morsel is None:
         return None
     return morsel.value
Esempio n. 23
0
 def __get_params(self):
     """Populate the controller's ``cog_*`` attributes from the request.

     Query parameters are read through ``self.__get_query_params()`` and
     kept in ``self._kwargs``; the session key is taken from the
     ``cog_session`` cookie when the request carries one.
     """
     self._kwargs = self.__get_query_params()
     self._cog_ajax = self._kwargs.get('cog_ajax')
     self.__cog_target = self._kwargs.get('cog_target')
     self._cog_raw = self._kwargs.get('cog_raw', None)
     self._cog_method = self._kwargs.get('cog_method', None)
     # cog_method must not contain non-word characters
     # NOTE(review): assert is stripped under ``python -O``; this check
     # then disappears.
     assert self._cog_method is None or \
         re.search(r'\W', self._cog_method) is None
     # startswith() also copes with an empty method name, where indexing
     # [0] raised IndexError.
     if self._cog_method is not None and self._cog_method.startswith('_'):
         # we never should receive a protected method...
         self._cog_method = "w3error"
         self._kwargs['cog_method'] = "w3error"
         self._kwargs['cog_error'] = "Can't call a protected method!"
     self._cog_ref_oid = self._kwargs.get('cog_ref_oid', None)
     self._cog_oid_ = self._kwargs.get('cog_oid_', None)
     self._session_key = None
     if 'HTTP_COOKIE' in self._environ:
         cookie_string = self._environ.get('HTTP_COOKIE')
         cookie = SimpleCookie()
         cookie.load(cookie_string)
         if 'cog_session' in cookie:
             self._session_key = cookie['cog_session'].value
     self.__cog_environment = self.__get_env()
     self._cog_fqtn_ = self._kwargs.get('cog_fqtn_', None)
     # An object referring to itself carries no extra information.
     if self._cog_ref_oid and self._cog_ref_oid == self._cog_oid_:
         self._cog_oid_ = None
     self._kwargs['cog_controller'] = self
     self._kwargs['cog_first_call'] = True
Esempio n. 24
0
 def __read_cookies(self):
     """Return the request cookies as a ``Table`` of name -> value.

     The table is built from the ``HTTP_COOKIE`` value and created with
     ``allow_duplicates = False`` and ``readonly = True``.
     """
     from Cookie import SimpleCookie
     parsed = SimpleCookie(self.get_env('HTTP_COOKIE'))
     values = dict((key, field.value) for key, field in parsed.iteritems())
     return Table(values, allow_duplicates = False, readonly = True)
Esempio n. 25
0
def application(environ, start_response):
    """Small WSGI demo of cookie-based sessions.

    * ``/`` shows the form.
    * ``/set`` stores the ``name`` query parameter in ``store`` and sends
      the resulting id back in the ``sessionId`` cookie.
    * any other path shows the name stored for the ``sessionId`` cookie.
    """
    # Function-scope import so the block works on Python 2 and 3 alike.
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    GET = parse_qs(environ['QUERY_STRING'])
    path = environ['PATH_INFO']
    cookies = SimpleCookie(environ.get('HTTP_COOKIE', ''))
    headers = {'Content-Type': 'text/html'}

    if path == '/':
        response = base%{'contenido': form}
    elif path == '/set':
        cookies['sessionId'] = store.add(GET.get('name', ['NULL McNULL',])[0])
        response = base%{'contenido': '<div style="background-color:green;color:white">Cookie establecida</div>'}
        headers['Set-Cookie'] = cookies['sessionId'].OutputString()
    else:
        cookie = cookies.get('sessionId',None)
        name = store.get(cookie.value, None) if cookie else None
        if name:
            # The stored name came from the query string: escape it before
            # putting it into the page.
            contenido = "<p>El valor de la sesión es: %s</p>" % escape('%s' % name)
        else:
            contenido = 'Ninguno'
        response = base%{'contenido': contenido}

    headers['Content-Length'] = str(len(response))

    # WSGI expects a real list of (name, value) tuples.
    start_response("200 OK", list(headers.items()))
    return [response]
Esempio n. 26
0
 def start(self, cookies, cookieopts=None):
     """Resume the session named in ``cookies`` or create a new one.

     This is a generator: it yields whatever ``self.get`` / ``self.create``
     yield, and delivers its result through ``self.apiroutine.retvalue`` as
     a ``(session handle, morsels)`` pair.  The morsel list is empty for a
     resumed session and holds the new session cookie otherwise.

     ``cookieopts`` may override the default cookie attributes
     (``path="/"``, ``httponly=True``); it is not modified.
     """
     c = SimpleCookie(cookies)
     sid = c.get(self.cookiename)
     create = True
     if sid is not None:
         for m in self.get(sid.value):
             yield m
         if self.apiroutine.retvalue is not None:
             self.apiroutine.retvalue = (self.SessionHandle(self.apiroutine.retvalue, self.apiroutine), [])
             create = False
     if create:
         for m in self.create():
             yield m
         sh = self.apiroutine.retvalue
         m = Morsel()
         m.key = self.cookiename
         m.value = sh.id
         m.coded_value = sh.id
         opts = {"path": "/", "httponly": True}
         if cookieopts:
             opts.update(cookieopts)
         # Remove a disabled flag from the options that are applied.  The
         # old code indexed and deleted from the caller's dict instead,
         # which raised KeyError when "httponly" was not supplied.
         if not opts.get("httponly"):
             opts.pop("httponly", None)
         m.update(opts)
         self.apiroutine.retvalue = (sh, [m])
Esempio n. 27
0
    def __read_cookie(self):
        """Reads the HTTP Cookie and loads the sid and data from it (if any).

        Every cookie whose key is accepted by ``is_mole_sessions_key`` takes
        part: the keys are sorted and the values concatenated.  The joined
        string is laid out as ``signature + sid + base64(payload)``, where the
        first two fields have the fixed lengths ``SIG_LEN`` and ``SID_LEN``.

        When the signature matches, the sid is adopted and a non-empty payload
        is decoded into ``self.data``; an expired session is terminated
        instead.  A signature mismatch is only logged.  A malformed cookie is
        logged and ends in ``self.terminate(False)``.
        """
        # NOTE(review): debugging output written to stdout on every call.
        print 'session: __read_cookie'
        try:
            if self.environ.get('HTTP_COOKIE') is None:
                return #no cookies

            #cookie = SimpleCookie(os.environ['HTTP_COOKIE'])
            cookie = SimpleCookie(self.environ.get('HTTP_COOKIE'))
            # NOTE(review): relies on filter() returning a list (Python 2),
            # since the result is sorted in place below.
            self.cookie_keys = filter(is_mole_sessions_key, cookie.keys())
            if not self.cookie_keys:
                return  # no session

            self.cookie_keys.sort()
            # Re-assemble the value that was spread over several cookies.
            data = ''.join(cookie[k].value for k in self.cookie_keys)
            i = SIG_LEN + SID_LEN
            sig, sid, b64pdump = data[:SIG_LEN], data[SIG_LEN:i], data[i:]
            pdump = b64decode(b64pdump)
            # Recompute the signature over sid and payload and compare it
            # with the one that arrived in the cookie.
            actual_sig = Session.__compute_hmac(self.base_key, sid, pdump)
            if sig == actual_sig:
                self.__set_sid(sid, False)
                # Terminate the session if an expiration is set and has passed.
                if self.get_expiration() != 0 and time.time() > self.get_expiration():
                    return self.terminate()

                if pdump:
                    self.data = self.__decode_data(pdump)
                else:
                    self.data = None
            else:
                # NOTE(review): the address is read from os.environ although
                # the cookie itself comes from self.environ -- confirm.
                logging.warn('cookie with invalid sig received from %s: %s' % (os.environ.get('REMOTE_ADDR'), b64pdump))
        except (CookieError, KeyError, IndexError, TypeError):
            # The cookie is invalid: report the traceback (both to stderr and
            # to the log) and drop the session.
            import traceback;traceback.print_exc()
            logging.error("session error:", exc_info=True)
            self.terminate(False)
Esempio n. 28
0
    def set_cookie(
        self, key, value, max_age=None, expires=None, path='/', domain=None
        ):
        """
        Queue a ``Set-Cookie`` header for the response.

        ``max_age`` may be a number of seconds or a ``timedelta``;
        ``expires`` may be a ``datetime``, a ``timedelta`` counted from now
        (UTC), or an already formatted value.  When only ``max_age`` is
        given, ``expires`` is derived from it.  Attributes left as None are
        omitted from the cookie.
        """
        jar = Cookie()
        jar[key] = value

        # Normalise max_age to a plain number of seconds
        if isinstance(max_age, timedelta):
            max_age = max_age.days*24*60*60 + max_age.seconds
        if expires is None and max_age is not None:
            expires = datetime.utcnow() + timedelta(seconds=max_age)

        # Normalise expires to a quoted cookie date string
        if isinstance(expires, timedelta):
            expires = datetime.utcnow() + expires
        if isinstance(expires, datetime):
            expires = '"'+datetime_utils._serialize_cookie_date(expires)+'"'

        attributes = (('max-age', max_age), ('path', path),
                      ('domain', domain), ('expires', expires))
        for var_name, var_value in attributes:
            if var_value is None:
                continue
            jar[key][var_name] = str(var_value)

        header_value = jar.output(header='').lstrip()
        if header_value:
            self.extra_headers.append(('Set-Cookie', header_value))
Esempio n. 29
0
    def getSession(self):
        """Return the session for this request, creating one if needed.

        The session id comes from the cookie named
        ``self.SESSION_COOKIE_NAME``; when none was sent a fresh id is
        produced with ``make_nonce_salt(16)``.  Sessions are kept in
        ``self.server.sessions`` and the selected one is cached on
        ``self.session``.
        """
        if self.session is not None:
            return self.session

        # Look for a session id in the Cookie header that was sent
        sid = None
        cookie_str = self.headers.get('Cookie')
        if cookie_str:
            jar = SimpleCookie(cookie_str)
            sid_morsel = jar.get(self.SESSION_COOKIE_NAME, None)
            if sid_morsel is not None:
                sid = sid_morsel.value

        if sid is not None:
            session = self.server.sessions.get(sid)
        else:
            # Pure pragmatism: Use function for nonce salt to generate session ID.
            sid = make_nonce_salt(16)
            session = None

        # Unknown id: start an empty session under it
        if session is None:
            session = {}
            self.server.sessions[sid] = session

        session['id'] = sid
        self.session = session
        return session
Esempio n. 30
0
    def set_cookie(self, cookie):
        """
        Set a cookie.

        The rendered cookie value is appended to ``self.headers``.

        'cookie' can be one of four things:

            * a string: the value is considered a cookie header value, i.e. the
              bit that would normally be added after 'Set-Cookie: '.
            * (name, value) tuple: a cookie is created with its path set to
              SCRIPT_NAME (or '/' when that is empty).
            * (name, None) tuple: the named cookie will be removed by sending
              it with an empty value and 'expires' / 'max-age' set to 0.
            * cookie instance: e.g. one of the cookie types in Python's Cookie
              module.
        """
        if isinstance(cookie, tuple):
            name, value = cookie
            jar = SimpleCookie()
            jar[name] = value or ''
            jar[name]['path'] = self.environ['SCRIPT_NAME'] or '/'
            if value is None:
                # Expire immediately so the browser drops the cookie
                jar[name]['expires'] = 0
                jar[name]['max-age'] = 0
            cookie = jar
        if not isinstance(cookie, str):
            # Cookie objects are rendered without the 'Set-Cookie:' prefix
            cookie = cookie.output(header='').strip()
        self.headers.append(cookie)
Esempio n. 31
0
 def _BaseCookie__set(self, key, real_value, coded_value):
     """Store a cookie, forcing the key to a byte string first.

     The method name is the name-mangled form of ``BaseCookie.__set``, so
     defining it here replaces that private hook; the actual work is
     delegated to ``SimpleCookie``'s implementation.
     """
     if not isinstance(key, bytes):
         key = key.encode('ascii')  # Python 2.x cannot handle unicode keys
     return SimpleCookie._BaseCookie__set(self, key, real_value,
                                          coded_value)
Esempio n. 32
0
 def clear_cookies(self):
     """Forget all stored cookies by starting over with an empty jar."""
     empty_jar = SimpleCookie()
     self._cookies = empty_jar
Esempio n. 33
0
class Retriever(object):
    'HTTP client.'
    # Wraps an httplib2.Http instance and keeps a SimpleCookie jar that is
    # sent along with requests.

    def __init__(self, user_agent=None, cache=Default, timeout=20, sleep=0):
        # Use cache=None to explicitly turn off caching.
        # If you don't provide cache, then it will cache in
        # settings.HTTP_CACHE, or '/tmp/eb_scraper_cache' if
        # the setting is undefined.
        # sleep should be the number of seconds to sleep between requests.
        from django.conf import settings
        if cache is Default:
            cache = getattr(settings, 'HTTP_CACHE', '/tmp/eb_scraper_cache')
        self.h = httplib2.Http(cache, timeout=timeout)
        self.h.force_exception_to_status_code = False
        # Redirects are not followed by httplib2 itself.
        self.h.follow_redirects = False
        self.user_agent = user_agent or 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)'
        self._cookies = SimpleCookie()
        self.logger = logging.getLogger('eb.retrieval.retriever')
        self.sleep = sleep

        # Keep track of whether we've downloaded any pages yet.
        # This makes sure we don't sleep before the very first requested page.
        self.page_downloaded = False

    def clear_cookies(self):
        """Drop all stored cookies by replacing the jar with an empty one."""
        self._cookies = SimpleCookie()

    def fetch_data_and_headers(self,
                               uri,
                               data=None,
                               headers=None,
                               send_cookies=True,
                               follow_redirects=True,
                               raise_on_error=True):
        "Retrieves the resource and returns a tuple of (content, header dictionary)."
        # NOTE(review): the visible part of this method ends inside the retry
        # loop: nothing is returned and follow_redirects / raise_on_error are
        # never used -- presumably the remainder is missing from this excerpt.

        # Sleep, if necessary, but only if a page has already been downloaded
        # with this retriever. (We don't want to sleep before the very first
        # request that a retriever makes, because that would be unnecessary.)
        if self.sleep and self.page_downloaded:
            self.logger.debug('Sleeping for %s seconds', self.sleep)
            time.sleep(self.sleep)
        self.page_downloaded = True

        # Prepare the request.
        if not headers:
            headers = {}
        # A caller-supplied user-agent wins over the retriever's default.
        headers['user-agent'] = headers.get('user-agent', self.user_agent)
        if send_cookies and self._cookies:
            # Some broken ASP.NET servers put "\r\n" in there, so we replace
            # that with semicolon to get proper behavior.
            headers['Cookie'] = self._cookies.output(
                attrs=[], header='').strip().replace('\r\n', ';')
        # Any truthy ``data`` turns the request into a form-encoded POST.
        method = data and "POST" or "GET"
        body = data and urlencode(data) or None
        if method == "POST" and body:
            headers.setdefault('Content-Type',
                               'application/x-www-form-urlencoded')

        # Get the response.
        resp_headers = None
        # Up to three attempts; 500 responses, timeouts and socket errors
        # trigger a retry.
        for attempt_number in range(3):
            self.logger.debug('Attempt %s: %s %s', attempt_number + 1, method,
                              uri)
            if data:
                self.logger.debug('%r', data)
            try:
                resp_headers, content = self.h.request(uri,
                                                       method,
                                                       body=body,
                                                       headers=headers)
                if resp_headers['status'] == '500':
                    self.logger.debug("Request got a 500 error: %s %s", method,
                                      uri)
                    continue  # Try again.
                break
            except socket.timeout:
                self.logger.debug("Request timed out after %s seconds: %s %s",
                                  self.h.timeout, method, uri)
                continue  # Try again.
            except socket.error, e:
                self.logger.debug("Got socket error: %s", e)
                continue  # Try again.
            except AttributeError, e:
                self.logger.debug("Got httplib bug where socket is None: %s",
                                  e)
                continue  # Try again
            except httplib2.ServerNotFoundError:
                # An unknown host is not retried.
                raise RetrievalError("Could not %s %r: server not found" %
                                     (method, uri))
Esempio n. 34
0
def parse_cookie(request):
    """Build a Cookie.SimpleCookie from the request's ``Cookie`` header.

    A request without that header yields an empty cookie object.
    """
    header_value = request.message.get('Cookie','')
    parsed = SimpleCookie(header_value)
    return parsed
Esempio n. 35
0
 def load(cls, cookie_data):
     """Parse ``cookie_data`` into a new ``SessionCookie`` and return it.

     The parsing is done with ``SimpleCookie.load`` applied to the new
     object.
     """
     parsed = SessionCookie()
     SimpleCookie.load(parsed, cookie_data)
     return parsed
Esempio n. 36
0
    def convert_cookie(self, cookie):
        """Copy a mapping of cookie names to values into a new SimpleCookie.

        :param cookie: Mapping of cookie name -> value
        :return: A ``SimpleCookie`` holding one morsel per entry
        """
        sc = SimpleCookie()
        # items() instead of the Python-2-only iteritems(); it yields the
        # same pairs on both Python versions.
        for key, value in cookie.items():
            sc[key] = value

        return sc
Esempio n. 37
0
def error_reporter(request):
    """Collect details of a client-side error report sent via GET and/or POST.

    Requests with neither GET nor POST data, and requests whose ``url``
    parameter is an http(s) URL that does not start with the current Site's
    domain, are answered with an empty HttpResponse.

    Otherwise the request's cookies, GET parameters and META are
    pretty-printed into in-memory buffers, and the reporting user, user
    agent and message are derived (falling back to data inside a
    JSON-encoded POST body when the request itself does not provide them).

    NOTE(review): the code visible here ends after filling the ``post``
    buffer; nothing consumes the buffers and there is no final return, so
    the remainder of the view is presumably missing -- confirm against the
    full source.
    """
    if not request.GET and not request.POST:
        return HttpResponse(
            '')  ## If someone just hits this page at random, ignore it

    url = request.GET.get('url', "")
    domain = Site.objects.get_current().domain
    # Offsets 7 and 8 skip over "http://" and "https://" respectively, so the
    # check passes only when the domain directly follows the scheme.
    if url[:4] == 'http' and (domain not in (url[7:(7 + len(domain))],
                                             url[8:(8 + len(domain))])):
        ## Punt responses not from us
        return HttpResponse(
            '')  ## Return something, so we don't trigger an error

    # One text buffer per section of the report.
    cookies = StringIO()
    get = StringIO()
    meta = StringIO()
    post = StringIO()

    pprint(dict(request.COOKIES), cookies)
    pprint(dict(request.GET), get)
    pprint(dict(request.META), meta)

    # Defaults; the placeholder strings below are also used as sentinels when
    # deciding whether to take a value from the JSON report instead.
    user_str = request.user.username if hasattr(
        request,
        'user') and request.user.is_authenticated() else "(not authenticated)"
    user_agent_str = request.META.get('HTTP_USER_AGENT', "(not specified)")

    msg = request.GET.get('msg', "(no message)")

    json_flag = ""

    if request.POST:
        if request.raw_post_data.strip()[0] == '[':
            ## Probably a JSON error report
            ## Let's try to decode it
            try:
                err = json.loads(request.raw_post_data)

                ## Deal with messages that we don't want to deal with
                if is_quirk_should_be_ignored(err):
                    return HttpResponse('')

                json_flag = " (JSON-encoded)"

                for e in err:
                    # Replace the raw cookie string with a plain dict of its
                    # parsed entries.
                    try:
                        c = SimpleCookie()
                        c.load(str(e['data']['cookie']))
                        e['data']['cookie'] = dict(
                            (str(x), str(y)) for x, y in c.iteritems())
                    except:  ## Whoops, don't have cookie data after all
                        pass

                    ## Also pull out some data, if we can
                    ## 'err' is an array, and we don't need to do this more than once;
                    ## but it should typically be an array of either 0 or 1 elements,
                    ## and we don't want to do it for a 0-length array,
                    ## so just do it in the loop
                    # Each lookup is best-effort: a missing key leaves the
                    # default untouched.
                    try:
                        if user_str == "(not authenticated)":
                            user_str = "%s %s" % (user_str,
                                                  e['data']['cookie'])
                    except:
                        pass

                    try:
                        if user_agent_str == "(not specified)":
                            user_agent_str = e['env']['user_agent']
                    except:
                        pass

                    try:
                        if msg == "(no message)":
                            msg = e['exception']['message']
                    except:
                        pass
            # NOTE: this handler rebinds ``e``, the same name used as the
            # loop variable above.
            except Exception, e:
                print "*** Exception!", e
                print json.__dict__
                # Fall back to reporting the undecoded body.
                err = request.raw_post_data

            pprint(err, post)

        else:
            # Not JSON: dump the parsed form data instead.
            pprint(dict(request.POST), post)
Esempio n. 38
0
class Session(object):
    """File-backed session for a CGI script, identified by a cookie.

    The session id (sid) is taken from the ``S_ID`` cookie found in
    ``HTTP_COOKIE`` or generated afresh.  Session variables live in
    ``self.data`` and are persisted with the module-level ``dump``/``load``
    to the file ``S_DIR/<sid><S_EXT>``.  A ``FileLock`` on that path is
    acquired by ``start()`` and released by ``destroy()``.

    Once started, the object supports mapping-style access
    (``session[name]``, ``name in session``, iteration over names); using
    it before ``start()`` raises ``NotStarted``.
    """

    # A sid is the hex digest of a 160-bit hash: 40 lowercase hex digits.
    _SID_LENGTH = 40
    _SID_CHARS = frozenset("0123456789abcdef")

    def __init__(self):
        """Resolve the sid and session file path without starting the session.

        Raises:
            NoCookiesError: if the environment has no ``HTTP_COOKIE`` at all.
        """
        self.data = {}
        self.started = False
        self._flock = None
        self.expires = 0  # delete right away

        self.__sid = sid = self.__getsid()
        self.path = os.path.join(S_DIR, sid + S_EXT)

    def isset(self, name):
        """Is the variable set in the session?"""
        if not self.started:
            raise NotStarted("Session must be started")

        return name in self

    def unset(self, name):
        """Unset the name from the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        del self[name]

    @classmethod
    def _is_valid_sid(cls, sid):
        """Return True if ``sid`` has the exact shape of a generated sid."""
        return len(sid) == cls._SID_LENGTH and cls._SID_CHARS.issuperset(sid)

    @staticmethod
    def __newsid():
        """Create a new session ID (40 lowercase hex characters)."""
        # Session ids act as credentials, so derive them from OS randomness
        # instead of guessable inputs such as the clock and the process id.
        return hashlib.sha1(os.urandom(32)).hexdigest()

    def __getsid(self):
        """Get the current session ID or return a new one.

        Also (re)creates ``self.cookie`` and loads the request cookies into
        it.

        Raises:
            NoCookiesError: if the environment has no ``HTTP_COOKIE``.
        """
        self.cookie = SimpleCookie()
        if 'HTTP_COOKIE' not in os.environ:
            raise NoCookiesError("Could not find any cookies")

        self.cookie.load(os.environ['HTTP_COOKIE'])
        if S_ID in self.cookie:
            sid = self.cookie[S_ID].value
            # The cookie is client-controlled and the sid becomes part of a
            # file path, so accept only well-formed ids (no path separators
            # or "..") and otherwise fall through to a fresh one.
            if self._is_valid_sid(sid):
                return sid

        # if all else fails, return a new sid
        return self.__newsid()

    def getsid(self):
        """
        Return the name and value that the sid needs to have in a GET or POST
        request
        """
        if not self.started:
            raise NotStarted("Session must be started")
        return (S_ID, self.__sid)

    def start(self):
        """Start the session.

        Acquires the file lock, loads the stored data (or creates the
        session file), and stores the sid in the outgoing cookie.  Calling
        it again on a started session is a no-op.  Always returns True.
        """
        if self.started:
            return True  # session cannot be started more than once per script

        self._flock = FileLock(self.path)
        self._flock.acquire()
        try:
            if os.path.exists(self.path):
                # load the session if it exists
                with open(self.path, "rb") as f:
                    self.data = dict(load(f))
                self.data["__date_loaded__"] = TODAY
            else:
                # create a session; write the initial data so the file is
                # loadable even if commit() is never reached
                self.data = {"__date_loaded__": TODAY}
                with open(self.path, "wb") as f:
                    dump(self.data, f, HIGHEST_PROTOCOL)
        except BaseException:
            # Do not leave the lock held by a session that failed to start.
            self._flock.release()
            self._flock = None
            raise

        # the session is officially started!
        self.started = True

        # store the sid in the cookie
        self.cookie[S_ID] = self.__sid
        self.cookie[S_ID]["expires"] = str(self.expires)
        self.cookie[S_ID]["version"] = "1"

        return True

    def commit(self):
        """Commit the changes to the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        with open(self.path, "wb") as f:
            dump(self.data, f, HIGHEST_PROTOCOL)

    def destroy(self):
        """Destroy the session: delete its file and release the lock."""
        if not self.started:
            raise NotStarted("Session must be started")
        try:
            os.remove(self.path)
        finally:
            # Release the lock even if the file could not be removed.
            if self._flock:
                self._flock.release()
                self._flock = None
            self.started = False

    def output(self):
        """Commit changes and return the cookie header text."""
        if not self.started:
            raise NotStarted("Session must be started")
        self.commit()
        return self.cookie.output()

    def setdefault(self, item, default=None):
        """Return the value of ``item``, first setting it to ``default`` if unset."""
        if not self.started:
            raise NotStarted("Session must be started")
        if not self.isset(item):
            self[item] = default

        return self[item]

    def set_expires(self, days):
        """Sets the expiration of the cookie to ``days`` days from today.

        May be called before ``start()``: the value is remembered in
        ``self.expires`` and applied to the cookie when the session starts.
        """
        date = datetime.date.today() + datetime.timedelta(days=days)
        # NOTE(review): a date has no time part, so this always renders
        # 00:00:00, and the zone label is hard-coded -- confirm intended.
        self.expires = date.strftime("%a, %d-%b-%Y %H:%M:%S PST")
        if S_ID in self.cookie:
            self.cookie[S_ID]["expires"] = str(self.expires)

    def __getitem__(self, item):
        """Get the item from the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        return self.data.__getitem__(item)

    def __setitem__(self, item, value):
        """set the item into the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        self.data.__setitem__(item, value)

    def __delitem__(self, item):
        """Remove the item from the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        self.data.__delitem__(item)

    def __contains__(self, item):
        """Return if item in the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        return self.data.__contains__(item)

    def __iter__(self):
        """Go through the names of all the session variables"""
        if not self.started:
            raise NotStarted("Session must be started")
        return self.data.__iter__()
Esempio n. 39
0
class Response(object):
    """An HTTP Response, including status, headers, and body.
    
    Application developers should use Response.headers (a dict) to
    set or modify HTTP response headers. When the response is finalized,
    Response.headers is transformed into Response.header_list as
    (key, value) tuples.
    """
    
    __metaclass__ = cherrypy._AttributeDocstrings
    
    # Class attributes for dev-time introspection.
    status = ""
    status__doc = """The HTTP Status-Code and Reason-Phrase."""
    
    header_list = []
    header_list__doc = """
    A list of the HTTP response headers as (name, value) tuples.
    In general, you should use response.headers (a dict) instead."""
    
    headers = httputil.HeaderMap()
    headers__doc = """
    A dict-like object containing the response headers. Keys are header
    names (in Title-Case format); however, you may get and set them in
    a case-insensitive manner. That is, headers['Content-Type'] and
    headers['content-type'] refer to the same value. Values are header
    values (decoded according to RFC 2047 if necessary). See also:
    httputil.HeaderMap, httputil.HeaderElement."""
    
    cookie = SimpleCookie()
    cookie__doc = """See help(Cookie)."""
    
    body = ResponseBody()
    body__doc = """The body (entity) of the HTTP response."""
    
    time = None
    time__doc = """The value of time.time() when created. Use in HTTP dates."""
    
    timeout = 300
    timeout__doc = """Seconds after which the response will be aborted."""
    
    timed_out = False
    timed_out__doc = """
    Flag to indicate the response should be aborted, because it has
    exceeded its timeout."""
    
    stream = False
    stream__doc = """If False, buffer the response body."""
    
    def __init__(self):
        self.status = None
        self.header_list = None
        self._body = []
        self.time = time.time()
        
        self.headers = httputil.HeaderMap()
        # Since we know all our keys are titled strings, we can
        # bypass HeaderMap.update and get a big speed boost.
        dict.update(self.headers, {
            "Content-Type": 'text/html',
            "Server": "CherryPy/" + cherrypy.__version__,
            "Date": httputil.HTTPDate(self.time),
        })
        self.cookie = SimpleCookie()
    
    def collapse_body(self):
        """Collapse self.body to a single string; replace it and return it."""
        if isinstance(self.body, basestring):
            return self.body

        newbody = ''.join([chunk for chunk in self.body])
        self.body = newbody
        return newbody
    
    def finalize(self):
        """Transform headers (and cookies) into self.header_list. (Core)"""
        try:
            code, reason, _ = httputil.valid_status(self.status)
        except ValueError, x:
            raise cherrypy.HTTPError(500, x.args[0])
        
        headers = self.headers
        
        self.output_status = str(code) + " " + headers.encode(reason)
        
        if self.stream:
            # The upshot: wsgiserver will chunk the response if
            # you pop Content-Length (or set it explicitly to None).
            # Note that lib.static sets C-L to the file's st_size.
            if dict.get(headers, 'Content-Length') is None:
                dict.pop(headers, 'Content-Length', None)
        elif code < 200 or code in (204, 205, 304):
            # "All 1xx (informational), 204 (no content),
            # and 304 (not modified) responses MUST NOT
            # include a message-body."
            dict.pop(headers, 'Content-Length', None)
            self.body = ""
        else:
            # Responses which are not streamed should have a Content-Length,
            # but allow user code to set Content-Length if desired.
            if dict.get(headers, 'Content-Length') is None:
                content = self.collapse_body()
                dict.__setitem__(headers, 'Content-Length', len(content))
        
        # Transform our header dict into a list of tuples.
        self.header_list = h = headers.output()
        
        cookie = self.cookie.output()
        if cookie:
            for line in cookie.split("\n"):
                if line.endswith("\r"):
                    # Python 2.4 emits cookies joined by LF but 2.5+ by CRLF.
                    line = line[:-1]
                name, value = line.split(": ", 1)
                if isinstance(name, unicode):
                    name = name.encode("ISO-8859-1")
                if isinstance(value, unicode):
                    value = headers.encode(value)
                h.append((name, value))
Esempio n. 40
0
class WebClient:
    """Minimal webservice client to do POST request with multipart encoded FORM data.

    Instances are callable: ``client(**vars)`` sends ``vars`` to
    ``location`` using ``self.method`` and ``self.enctype``, stores the
    response in ``self.response`` / ``self.content``, keeps cookies sent by
    the server for subsequent calls, and returns the response content.
    """

    def __init__(
        self,
        location,
        enctype="multipart/form-data",
        trace=False,
        cacert=None,
    ):
        """Create the client.

        :param location: URL the requests are sent to
        :param enctype: "multipart/form-data" or
            "application/x-www-form-urlencoded"
        :param trace: if true, print requests and responses
        :param cacert: CA certificates file passed to httplib2 as
            ``ca_certs`` (httplib2 0.7.0 or later only)
        """
        kwargs = {}
        # Compare versions numerically: as plain strings '0.10.0' sorts
        # before '0.7.0', which silently dropped the SSL options on newer
        # httplib2 releases.
        if self._version_tuple(httplib2.__version__) >= (0, 7, 0):
            # NOTE(review): with no cacert, certificate validation is turned
            # off entirely -- an insecure default; consider validating
            # against the default CA bundle instead.
            kwargs['disable_ssl_certificate_validation'] = cacert is None
            kwargs['ca_certs'] = cacert
        self.http = httplib2.Http('.cache', **kwargs)
        self.trace = trace
        self.location = location
        self.enctype = enctype
        self.cookies = None
        self.method = "POST"
        self.referer = None

    @staticmethod
    def _version_tuple(version):
        """Parse the leading dotted digits of a version string into a tuple.

        Parsing stops at the first component that does not start with a
        digit, and trailing non-digits of a component are ignored, e.g.
        '0.9.2b1' -> (0, 9, 2) and '$Rev$' -> ().
        """
        numbers = []
        for component in version.split('.'):
            digits = ''
            for char in component:
                if not char.isdigit():
                    break
                digits += char
            if not digits:
                break
            numbers.append(int(digits))
        return tuple(numbers)

    def multipart_encode(self, vars):
        """Encode form data (vars dict) as a multipart/form-data body.

        ``file`` values are sent as file uploads (read from the start of the
        file, content type guessed from the file name); every other value
        must be a string.

        :return: a (boundary, body) tuple
        """
        boundary = mimetools.choose_boundary()
        buf = StringIO()
        for key, value in vars.items():
            if not isinstance(value, file):
                buf.write('--%s\r\n' % boundary)
                buf.write('Content-Disposition: form-data; name="%s"' % key)
                buf.write('\r\n\r\n' + value + '\r\n')
            else:
                fd = value
                filename = fd.name.split('/')[-1]
                contenttype = mimetypes.guess_type(
                    filename)[0] or 'application/octet-stream'
                buf.write('--%s\r\n' % boundary)
                buf.write(
                    'Content-Disposition: form-data; name="%s"; filename="%s"\r\n'
                    % (key, filename))
                buf.write('Content-Type: %s\r\n' % contenttype)
                fd.seek(0)
                buf.write('\r\n' + fd.read() + '\r\n')
        buf.write('--' + boundary + '--\r\n\r\n')
        return boundary, buf.getvalue()

    def __call__(self, **vars):
        """Perform a GET/POST request and return the response content.

        :raises ValueError: if ``self.enctype`` is not a supported encoding
        """
        location = self.location
        if self.method == "GET":
            location += "?%s" % urlencode(vars)

        # prepare the request content suitable to be sent to the server:
        if self.enctype == "multipart/form-data":
            boundary, body = self.multipart_encode(vars)
            content_type = '%s; boundary=%s' % (self.enctype, boundary)
        elif self.enctype == "application/x-www-form-urlencoded":
            body = urlencode(vars)
            content_type = self.enctype
        else:
            # Fail with a clear message instead of a NameError on ``body``.
            raise ValueError("Unsupported enctype: %r" % (self.enctype,))

        # add headers according method, cookies, etc.:
        headers = {}
        if self.method == "POST":
            headers.update({
                'Content-type': content_type,
                'Content-length': str(len(body)),
            })
        if self.cookies:
            headers['Cookie'] = self.cookies.output(attrs=(),
                                                    header="",
                                                    sep=";")
        if self.referer:
            headers['Referer'] = self.referer

        if self.trace:
            print("-" * 80)
            print("%s %s" % (self.method, location))
            print('\n'.join(["%s: %s" % (k, v) for k, v in headers.items()]))
            print("\n%s" % body)

        # send the request to the server and store the result:
        response, content = self.http.request(location,
                                              self.method,
                                              body=body,
                                              headers=headers)
        self.response = response
        self.content = content

        if self.trace:
            print("")
            print('\n'.join(["%s: %s" % (k, v) for k, v in response.items()]))
            print(content)
            print("=" * 80)

        # Parse and store the cookies (if any)
        if "set-cookie" in self.response:
            if not self.cookies:
                self.cookies = SimpleCookie()
            self.cookies.load(self.response["set-cookie"])

        return content
Esempio n. 41
0
class Request(object):
    """An HTTP request.
    
    This object represents the metadata of an HTTP request message;
    that is, it contains attributes which describe the environment
    in which the request URL, headers, and body were sent (if you
    want tools to interpret the headers and body, those are elsewhere,
    mostly in Tools). This 'metadata' consists of socket data,
    transport characteristics, and the Request-Line. This object
    also contains data regarding the configuration in effect for
    the given URL, and the execution plan for generating a response.
    """
    
    __metaclass__ = cherrypy._AttributeDocstrings
    
    prev = None
    prev__doc = """
    The previous Request object (if any). This should be None
    unless we are processing an InternalRedirect."""
    
    # Conversation/connection attributes
    local = httputil.Host("127.0.0.1", 80)
    local__doc = \
        "An httputil.Host(ip, port, hostname) object for the server socket."
    
    remote = httputil.Host("127.0.0.1", 1111)
    remote__doc = \
        "An httputil.Host(ip, port, hostname) object for the client socket."
    
    scheme = "http"
    scheme__doc = """
    The protocol used between client and server. In most cases,
    this will be either 'http' or 'https'."""
    
    server_protocol = "HTTP/1.1"
    server_protocol__doc = """
    The HTTP version for which the HTTP server is at least
    conditionally compliant."""
    
    base = ""
    base__doc = """The (scheme://host) portion of the requested URL.
    In some cases (e.g. when proxying via mod_rewrite), this may contain
    path segments which cherrypy.url uses when constructing url's, but
    which otherwise are ignored by CherryPy. Regardless, this value
    MUST NOT end in a slash."""
    
    # Request-Line attributes
    request_line = ""
    request_line__doc = """
    The complete Request-Line received from the client. This is a
    single string consisting of the request method, URI, and protocol
    version (joined by spaces). Any final CRLF is removed."""
    
    method = "GET"
    method__doc = """
    Indicates the HTTP method to be performed on the resource identified
    by the Request-URI. Common methods include GET, HEAD, POST, PUT, and
    DELETE. CherryPy allows any extension method; however, various HTTP
    servers and gateways may restrict the set of allowable methods.
    CherryPy applications SHOULD restrict the set (on a per-URI basis)."""
    
    query_string = ""
    query_string__doc = """
    The query component of the Request-URI, a string of information to be
    interpreted by the resource. The query portion of a URI follows the
    path component, and is separated by a '?'. For example, the URI
    'http://www.cherrypy.org/wiki?a=3&b=4' has the query component,
    'a=3&b=4'."""
    
    query_string_encoding = 'utf8'
    query_string_encoding__doc = """
    The encoding expected for query string arguments after % HEX HEX decoding).
    If a query string is provided that cannot be decoded with this encoding,
    404 is raised (since technically it's a different URI). If you want
    arbitrary encodings to not error, set this to 'Latin-1'; you can then
    encode back to bytes and re-decode to whatever encoding you like later.
    """
    
    protocol = (1, 1)
    protocol__doc = """The HTTP protocol version corresponding to the set
        of features which should be allowed in the response. If BOTH
        the client's request message AND the server's level of HTTP
        compliance is HTTP/1.1, this attribute will be the tuple (1, 1).
        If either is 1.0, this attribute will be the tuple (1, 0).
        Lower HTTP protocol versions are not explicitly supported."""
    
    params = {}
    params__doc = """
    A dict which combines query string (GET) and request entity (POST)
    variables. This is populated in two stages: GET params are added
    before the 'on_start_resource' hook, and POST params are added
    between the 'before_request_body' and 'before_handler' hooks."""
    
    # Message attributes
    header_list = []
    header_list__doc = """
    A list of the HTTP request headers as (name, value) tuples.
    In general, you should use request.headers (a dict) instead."""
    
    headers = httputil.HeaderMap()
    headers__doc = """
    A dict-like object containing the request headers. Keys are header
    names (in Title-Case format); however, you may get and set them in
    a case-insensitive manner. That is, headers['Content-Type'] and
    headers['content-type'] refer to the same value. Values are header
    values (decoded according to RFC 2047 if necessary). See also:
    httputil.HeaderMap, httputil.HeaderElement."""
    
    cookie = SimpleCookie()
    cookie__doc = """See help(Cookie)."""
    
    body = None
    body__doc = """See help(cherrypy.request.body)"""
    
    rfile = None
    rfile__doc = """
    If the request included an entity (body), it will be available
    as a stream in this attribute. However, the rfile will normally
    be read for you between the 'before_request_body' hook and the
    'before_handler' hook, and the resulting string is placed into
    either request.params or the request.body attribute.
    
    You may disable the automatic consumption of the rfile by setting
    request.process_request_body to False, either in config for the desired
    path, or in an 'on_start_resource' or 'before_request_body' hook.
    
    WARNING: In almost every case, you should not attempt to read from the
    rfile stream after CherryPy's automatic mechanism has read it. If you
    turn off the automatic parsing of rfile, you should read exactly the
    number of bytes specified in request.headers['Content-Length'].
    Ignoring either of these warnings may result in a hung request thread
    or in corruption of the next (pipelined) request.
    """
    
    process_request_body = True
    process_request_body__doc = """
    If True, the rfile (if any) is automatically read and parsed,
    and the result placed into request.params or request.body."""
    
    methods_with_bodies = ("POST", "PUT")
    methods_with_bodies__doc = """
    A sequence of HTTP methods for which CherryPy will automatically
    attempt to read a body from the rfile."""
    
    body = None
    body__doc = """
    If the request Content-Type is 'application/x-www-form-urlencoded'
    or multipart, this will be None. Otherwise, this will contain the
    request entity body as an open file object (which you can .read());
    this value is set between the 'before_request_body' and 'before_handler'
    hooks (assuming that process_request_body is True)."""
    
    body_params = None
    body_params__doc = """
    If the request Content-Type is 'application/x-www-form-urlencoded' or
    multipart, this will be a dict of the params pulled from the entity
    body; that is, it will be the portion of request.params that come
    from the message body (sometimes called "POST params", although they
    can be sent with various HTTP method verbs). This value is set between
    the 'before_request_body' and 'before_handler' hooks (assuming that
    process_request_body is True)."""
    
    # Dispatch attributes
    dispatch = cherrypy.dispatch.Dispatcher()
    dispatch__doc = """
    The object which looks up the 'page handler' callable and collects
    config for the current request based on the path_info, other
    request attributes, and the application architecture. The core
    calls the dispatcher as early as possible, passing it a 'path_info'
    argument.
    
    The default dispatcher discovers the page handler by matching path_info
    to a hierarchical arrangement of objects, starting at request.app.root.
    See help(cherrypy.dispatch) for more information."""
    
    script_name = ""
    script_name__doc = """
    The 'mount point' of the application which is handling this request.
    
    This attribute MUST NOT end in a slash. If the script_name refers to
    the root of the URI, it MUST be an empty string (not "/").
    """
    
    path_info = "/"
    path_info__doc = """
    The 'relative path' portion of the Request-URI. This is relative
    to the script_name ('mount point') of the application which is
    handling this request."""

    login = None
    login__doc = """
    When authentication is used during the request processing this is
    set to 'False' if it failed and to the 'username' value if it succeeded.
    The default 'None' implies that no authentication happened."""
    
    # Note that cherrypy.url uses "if request.app:" to determine whether
    # the call is during a real HTTP request or not. So leave this None.
    app = None
    app__doc = \
        """The cherrypy.Application object which is handling this request."""
    
    handler = None
    handler__doc = """
    The function, method, or other callable which CherryPy will call to
    produce the response. The discovery of the handler and the arguments
    it will receive are determined by the request.dispatch object.
    By default, the handler is discovered by walking a tree of objects
    starting at request.app.root, and is then passed all HTTP params
    (from the query string and POST body) as keyword arguments."""
    
    toolmaps = {}
    toolmaps__doc = """
    A nested dict of all Toolboxes and Tools in effect for this request,
    of the form: {Toolbox.namespace: {Tool.name: config dict}}."""
    
    config = None
    config__doc = """
    A flat dict of all configuration entries which apply to the
    current request. These entries are collected from global config,
    application config (based on request.path_info), and from handler
    config (exactly how is governed by the request.dispatch object in
    effect for this request; by default, handler config can be attached
    anywhere in the tree between request.app.root and the final handler,
    and inherits downward)."""
    
    is_index = None
    is_index__doc = """
    This will be True if the current request is mapped to an 'index'
    resource handler (also, a 'default' handler if path_info ends with
    a slash). The value may be used to automatically redirect the
    user-agent to a 'more canonical' URL which either adds or removes
    the trailing slash. See cherrypy.tools.trailing_slash."""
    
    hooks = HookMap(hookpoints)
    hooks__doc = """
    A HookMap (dict-like object) of the form: {hookpoint: [hook, ...]}.
    Each key is a str naming the hook point, and each value is a list
    of hooks which will be called at that hook point during this request.
    The list of hooks is generally populated as early as possible (mostly
    from Tools specified in config), but may be extended at any time.
    See also: _cprequest.Hook, _cprequest.HookMap, and cherrypy.tools."""
    
    error_response = cherrypy.HTTPError(500).set_response
    error_response__doc = """
    The no-arg callable which will handle unexpected, untrapped errors
    during request processing. This is not used for expected exceptions
    (like NotFound, HTTPError, or HTTPRedirect) which are raised in
    response to expected conditions (those should be customized either
    via request.error_page or by overriding HTTPError.set_response).
    By default, error_response uses HTTPError(500) to return a generic
    error response to the user-agent."""
    
    error_page = {}
    error_page__doc = """
    A dict of {error code: response filename or callable} pairs.
    
    The error code must be an int representing a given HTTP error code,
    or the string 'default', which will be used if no matching entry
    is found for a given numeric code.
    
    If a filename is provided, the file should contain a Python string-
    formatting template, and can expect by default to receive format 
    values with the mapping keys %(status)s, %(message)s, %(traceback)s,
    and %(version)s. The set of format mappings can be extended by
    overriding HTTPError.set_response.
    
    If a callable is provided, it will be called by default with keyword
    arguments 'status', 'message', 'traceback', and 'version', as for a
    string-formatting template. The callable must return a string or iterable of
    strings which will be set to response.body. It may also override headers or
    perform any other processing.
    
    If no entry is given for an error code, and no 'default' entry exists,
    a default template will be used.
    """
    
    show_tracebacks = True
    show_tracebacks__doc = """
    If True, unexpected errors encountered during request processing will
    include a traceback in the response body."""

    show_mismatched_params = True
    show_mismatched_params__doc = """
    If True, mismatched parameters encountered during PageHandler invocation
    processing will be included in the response body."""
    
    throws = (KeyboardInterrupt, SystemExit, cherrypy.InternalRedirect)
    throws__doc = \
        """The sequence of exceptions which Request.run does not trap."""
    
    throw_errors = False
    throw_errors__doc = """
    If True, Request.run will not trap any errors (except HTTPRedirect and
    HTTPError, which are more properly called 'exceptions', not errors)."""
    
    closed = False
    closed__doc = """
    True once the close method has been called, False otherwise."""
    
    stage = None
    stage__doc = """
    A string containing the stage reached in the request-handling process.
    This is useful when debugging a live server with hung requests."""
    
    namespaces = _cpconfig.NamespaceSet(
        **{"hooks": hooks_namespace,
           "request": request_namespace,
           "response": response_namespace,
           "error_page": error_page_namespace,
           "tools": cherrypy.tools,
           })
    
    def __init__(self, local_host, remote_host, scheme="http",
                 server_protocol="HTTP/1.1"):
        """Populate a new Request object.
        
        local_host should be an httputil.Host object with the server info.
        remote_host should be an httputil.Host object with the client info.
        scheme should be a string, either "http" or "https".
        """
        self.local = local_host
        self.remote = remote_host
        self.scheme = scheme
        self.server_protocol = server_protocol
        
        self.closed = False
        
        # Put a *copy* of the class error_page into self.
        self.error_page = self.error_page.copy()
        
        # Put a *copy* of the class namespaces into self.
        self.namespaces = self.namespaces.copy()
        
        self.stage = None
    
    def close(self):
        """Run cleanup code. (Core)"""
        if not self.closed:
            self.closed = True
            self.stage = 'on_end_request'
            self.hooks.run('on_end_request')
            self.stage = 'close'
    
    def run(self, method, path, query_string, req_protocol, headers, rfile):
        """Process the Request. (Core)

        method, path, query_string, and req_protocol should be pulled directly
            from the Request-Line (e.g. "GET /path?key=val HTTP/1.0").
        path should be %XX-unquoted, but query_string should not be.
            They both MUST be byte strings, not unicode strings.
        headers should be a list of (name, value) tuples.
        rfile should be a file-like object containing the HTTP request entity.

        When run() is done, the returned object should have 3 attributes:
          status, e.g. "200 OK"
          header_list, a list of (name, value) tuples
          body, an iterable yielding strings

        Consumer code (HTTP servers) should then access these response
        attributes to build the outbound stream.

        Exceptions listed in self.throws are always re-raised. Any other
        exception is re-raised only when self.throw_errors is true; otherwise
        it is logged and replaced by a bare error response. Raises
        cherrypy.TimeoutError when response.timed_out is set.
        """
        response = cherrypy.serving.response
        self.stage = 'run'
        try:
            # Default error responder: the set_response method of a 500 error.
            self.error_response = cherrypy.HTTPError(500).set_response

            self.method = method
            path = path or "/"
            self.query_string = query_string or ''
            self.params = {}

            # Compare request and server HTTP protocol versions, in case our
            # server does not support the requested protocol. Limit our output
            # to min(req, server). We want the following output:
            #     request    server     actual written   supported response
            #     protocol   protocol  response protocol    feature set
            # a     1.0        1.0           1.0                1.0
            # b     1.0        1.1           1.1                1.0
            # c     1.1        1.0           1.0                1.0
            # d     1.1        1.1           1.1                1.1
            # Notice that, in (b), the response will be "HTTP/1.1" even though
            # the client only understands 1.0. RFC 2616 10.5.6 says we should
            # only return 505 if the _major_ version is different.
            # Characters 5 and 7 of "HTTP/x.y" are the single-digit major and
            # minor version numbers.
            rp = int(req_protocol[5]), int(req_protocol[7])
            sp = int(self.server_protocol[5]), int(self.server_protocol[7])
            self.protocol = min(rp, sp)
            response.headers.protocol = self.protocol

            # Rebuild first line of the request (e.g. "GET /path HTTP/1.0").
            url = path
            if query_string:
                url += '?' + query_string
            self.request_line = '%s %s %s' % (method, url, req_protocol)

            # Keep the raw header pairs; self.headers starts out empty here.
            self.header_list = list(headers)
            self.headers = httputil.HeaderMap()

            self.rfile = rfile
            self.body = None

            self.cookie = SimpleCookie()
            self.handler = None

            # path_info should be the path from the
            # app root (script_name) to the handler.
            self.script_name = self.app.script_name
            self.path_info = pi = path[len(self.script_name):]

            self.stage = 'respond'
            self.respond(pi)

        except self.throws:
            # Exception types listed in self.throws always propagate.
            raise
        except:
            if self.throw_errors:
                raise
            else:
                # Failure in setup, error handler or finalize. Bypass them.
                # Can't use handle_error because we may not have hooks yet.
                cherrypy.log(traceback=True, severity=40)
                if self.show_tracebacks:
                    body = format_exc()
                else:
                    body = ""
                r = bare_error(body)
                response.output_status, response.header_list, response.body = r

        if self.method == "HEAD":
            # HEAD requests MUST NOT return a message-body in the response.
            response.body = []

        # A failure while writing the access log must not break the response;
        # it is recorded in the error log instead.
        try:
            cherrypy.log.access()
        except:
            cherrypy.log.error(traceback=True)

        if response.timed_out:
            raise cherrypy.TimeoutError()

        return response
    
    # Uncomment for stage debugging
    # stage = property(lambda self: self._stage, lambda self, v: print(v))
    
    def respond(self, path_info):
        """Generate a response for the resource at self.path_info. (Core)

        path_info is passed to get_resource() to locate the handler. The
        steps run in this order, each one recorded in self.stage:
        process_headers, get_resource, 'on_start_resource' hooks,
        process_query_string, 'before_request_body' hooks, optional body
        processing, 'before_handler' hooks, the handler itself,
        'before_finalize' hooks and response.finalize().

        An HTTPRedirect or HTTPError raised along the way is rendered through
        its set_response() method. The 'on_end_resource' hooks run in a
        finally clause. Exceptions listed in self.throws are re-raised; any
        other exception goes to handle_error() unless self.throw_errors is
        true, in which case it is re-raised.
        """
        response = cherrypy.serving.response
        # Outer try: unexpected errors. Middle try: guarantees the
        # 'on_end_resource' hooks. Inner try: HTTPRedirect / HTTPError.
        try:
            try:
                try:
                    if self.app is None:
                        raise cherrypy.NotFound()

                    # Get the 'Host' header, so we can HTTPRedirect properly.
                    self.stage = 'process_headers'
                    self.process_headers()

                    # Make a copy of the class hooks
                    self.hooks = self.__class__.hooks.copy()
                    self.toolmaps = {}

                    self.stage = 'get_resource'
                    self.get_resource(path_info)

                    # The body object shares self.params with this request.
                    self.body = _cpreqbody.RequestBody(
                        self.rfile, self.headers, request_params=self.params)

                    # Apply the config found for this resource to the
                    # registered namespaces.
                    self.namespaces(self.config)

                    self.stage = 'on_start_resource'
                    self.hooks.run('on_start_resource')

                    # Parse the querystring
                    self.stage = 'process_query_string'
                    self.process_query_string()

                    # Process the body
                    # Body processing is switched off for any method that is
                    # not listed in methods_with_bodies.
                    if self.process_request_body:
                        if self.method not in self.methods_with_bodies:
                            self.process_request_body = False
                    self.stage = 'before_request_body'
                    self.hooks.run('before_request_body')
                    # Checked again: the flag may have changed while the
                    # 'before_request_body' hooks ran.
                    if self.process_request_body:
                        self.body.process()

                    # Run the handler
                    self.stage = 'before_handler'
                    self.hooks.run('before_handler')
                    if self.handler:
                        self.stage = 'handler'
                        response.body = self.handler()

                    # Finalize
                    self.stage = 'before_finalize'
                    self.hooks.run('before_finalize')
                    response.finalize()
                except (cherrypy.HTTPRedirect, cherrypy.HTTPError), inst:
                    # Let the exception fill in the response, then finalize
                    # it exactly like a normal response.
                    inst.set_response()
                    self.stage = 'before_finalize (HTTPError)'
                    self.hooks.run('before_finalize')
                    response.finalize()
            finally:
                self.stage = 'on_end_resource'
                self.hooks.run('on_end_resource')
        except self.throws:
            raise
        except:
            if self.throw_errors:
                raise
            self.handle_error()
    
    def process_query_string(self):
        """Decode self.query_string and merge it into self.params. (Core)

        Raises cherrypy.HTTPError(404) when the query string cannot be
        decoded with self.query_string_encoding.
        """
        try:
            parsed = httputil.parse_query_string(
                self.query_string, encoding=self.query_string_encoding)
        except UnicodeDecodeError:
            message = ("The given query string could not be processed. Query "
                       "strings for this resource must be encoded with %r." %
                       self.query_string_encoding)
            raise cherrypy.HTTPError(404, message)

        # Python 2 only: keyword arguments must be byte strings (type 'str'),
        # so every unicode key is replaced by its encoded form.
        unicode_keys = [name for name in parsed.keys()
                        if isinstance(name, unicode)]
        for name in unicode_keys:
            parsed[name.encode(self.query_string_encoding)] = parsed.pop(name)

        self.params.update(parsed)
    
    def process_headers(self):
        """Fill self.headers and self.cookie from self.header_list. (Core)

        Also sets self.base to "scheme://host", taking the host from the
        'Host' header and falling back to the local server name or IP.

        Raises cherrypy.HTTPError(400) when a Cookie header cannot be loaded,
        or when the request protocol is at least (1, 1) and there is no
        'Host' header.
        """
        header_map = self.headers
        for raw_name, raw_value in self.header_list:
            # Title-case the name once here and store it through
            # dict.__setitem__ so title() doesn't have to be called twice.
            name = raw_name.title()
            value = raw_value.strip()

            # Values containing "=?" go through httputil.decode_TEXT before
            # they are stored; all others are stored as they are.
            stored = value
            if "=?" in value:
                stored = httputil.decode_TEXT(value)
            dict.__setitem__(header_map, name, stored)

            if name != 'Cookie':
                continue

            # Warning: if there is more than one Cookie header line (AFAIK,
            # only Konqueror does that), only the last one remains in the
            # header map, but every line is loaded into self.cookie.
            try:
                self.cookie.load(value)
            except CookieError:
                msg = "Illegal cookie name %s" % value.split('=')[0]
                raise cherrypy.HTTPError(400, msg)

        # All Internet-based HTTP/1.1 servers MUST respond with a 400
        # (Bad Request) status code to any HTTP/1.1 request message
        # which lacks a Host header field.
        if (not dict.__contains__(header_map, 'Host')
                and self.protocol >= (1, 1)):
            msg = "HTTP/1.1 requires a 'Host' request header."
            raise cherrypy.HTTPError(400, msg)

        host = dict.get(header_map, 'Host')
        if not host:
            host = self.local.name or self.local.ip
        self.base = "%s://%s" % (self.scheme, host)
    
    def get_resource(self, path):
        """Find the dispatcher for path and call it. (Core)

        The dispatcher is expected to set self.handler and self.config.
        """
        # A custom dispatcher may be configured for this URI under the
        # "request.dispatch" key; self.dispatch is the fallback. Custom
        # dispatchers can only be specified in app.config, not in _cp_config
        # (since custom dispatchers may not even have an app.root).
        self.app.find_config(path, "request.dispatch", self.dispatch)(path)
    
    def handle_error(self):
        """Handle the last unanticipated exception. (Core)

        Runs the 'before_error_response' hooks, calls self.error_response
        when one is set, runs the 'after_error_response' hooks and finalizes
        the response. An HTTPRedirect raised during any of these steps is
        rendered through its set_response() method instead.
        """
        try:
            self.hooks.run("before_error_response")
            if self.error_response:
                self.error_response()
            self.hooks.run("after_error_response")
            cherrypy.serving.response.finalize()
        except cherrypy.HTTPRedirect, inst:
            # The redirect replaces the error response; finalize it as usual.
            inst.set_response()
            cherrypy.serving.response.finalize()
Esempio n. 42
0
class Request(object):
    """Hold the information that describes a single HTTP request.

    :param sock: The socket object of the request.
    :type  sock: socket.socket

    :param method: The requested method.
    :type  method: str

    :param scheme: The requested scheme.
    :type  scheme: str

    :param path: The requested path.
    :type  path: str

    :param protocol: The requested protocol.
    :type  protocol: str

    :param qs: The query string of the request.
    :type  qs: str
    """

    # A reference to the underlying server.
    server = None

    # Class-level defaults; instances override them as data arrives.
    scheme = "http"
    protocol = (1, 1)
    server_protocol = (1, 1)
    host = ""
    local = Host("127.0.0.1", 80)
    remote = Host("127.0.0.1", 1111)

    xhr = False

    index = None
    script_name = ""

    login = None
    handled = False

    def __init__(self, sock, method, scheme, path, protocol, qs):
        """Store the request-line data and work out the remote host.

        A false scheme falls back to the class default. When sock is given,
        self.remote is built from its peer name, or from its own socket name
        when no peer name is available.
        """
        self.sock = sock
        self.method = method
        self.scheme = scheme or Request.scheme
        self.path = path
        self.protocol = protocol
        self.qs = qs
        self.cookie = SimpleCookie()

        # Filled in later through the 'headers' property.
        self._headers = None

        if sock:
            peer = sock.getpeername()
            if peer:
                self.remote = Host(*peer)
            else:
                own_name = sock.getsockname()
                self.remote = Host(own_name, "", own_name)

        self.body = StringIO()

    def _getHeaders(self):
        """Return the header mapping stored by _setHeaders (or None)."""
        return self._headers

    def _setHeaders(self, headers):
        """Store headers and refresh the values derived from them.

        Updates self.cookie, self.base and self.xhr.
        """
        self._headers = headers
        current = self.headers

        if "Cookie" in current:
            self.cookie.load(current["Cookie"])

        # Prefer the Host header; otherwise use the local name, then the IP.
        host = current.get("Host") or self.local.name or self.local.ip
        self.base = "%s://%s" % (self.scheme, host)

        requested_with = current.get("X-Requested-With", "")
        self.xhr = requested_with.lower() == "xmlhttprequest"

    headers = property(_getHeaders, _setHeaders)

    def __repr__(self):
        """Return "<Request METHOD PATH HTTP/x.y>"."""
        version = "HTTP/%d.%d" % self.protocol
        return "<Request %s %s %s>" % (self.method, self.path, version)

    def url(self):
        """Return the result of the module-level url() helper for this request."""
        return url(self)
Esempio n. 43
0
def track_page_view(request):
    """Build the Google Analytics tracking-gif URL(s) for a page view.

    Side effects on ``request.environ``: the parsed cookies are stored under
    ``'COOKIES'`` and the parsed query string under ``'GET'`` (one value per
    key; a repeated key keeps its last value).

    A hit URL is built and logged for the configured account and for the
    optional ``x_utmac`` query parameter; empty IDs are skipped. The request
    to Google Analytics itself is currently disabled.

    :param request: object exposing ``environ`` (a WSGI-style dict) and
        ``url``.
    :return: the last hit URL that was built, or ``None`` when every
        account ID was empty.
    """
    environ = request.environ
    time_tup = time.localtime(time.time() + COOKIE_USER_PERSISTENCE)

    # Set some useful items in environ.
    environ['COOKIES'] = parse_cookie(environ.get('HTTP_COOKIE', ''))
    environ['GET'] = {}
    for key, value in parse_qsl(environ.get('QUERY_STRING', ''), True):
        environ['GET'][key] = value
    x_utmac = environ['GET'].get('x_utmac', None)

    domain = environ.get('HTTP_HOST', '')

    # The referrer is that of the page containing the tracking pixel, not
    # the referrer of the tracking pixel itself.
    document_referer = environ.get("HTTP_REFERER", "")
    if not document_referer or document_referer == "0":
        document_referer = "-"
    else:
        document_referer = unquote(document_referer)

    document_path = request.url
    if document_path:
        document_path = unquote(document_path)

    account = environ.get('UTMAC', 'UA-29152694-1')
    user_agent = environ.get("HTTP_USER_AGENT", '')

    # Try and get the visitor cookie from the request.
    cookie = environ['COOKIES'].get(COOKIE_NAME)

    visitor_id = get_visitor_id(environ.get("HTTP_X_DCMGUID", ''), account,
                                user_agent, cookie)

    # Prepare the visitor cookie.
    # NOTE(review): nothing in this function attaches the cookie to a
    # response; it is built and then dropped.
    cookie = SimpleCookie()
    cookie[COOKIE_NAME] = visitor_id
    morsel = cookie[COOKIE_NAME]
    morsel['expires'] = time.strftime('%a, %d-%b-%Y %H:%M:%S %Z', time_tup)
    morsel['path'] = COOKIE_PATH

    utm_gif_location = "http://www.google-analytics.com/__utm.gif"

    # Stays None when every account ID is empty, so the final return cannot
    # fail with UnboundLocalError.
    utm_url = None
    for utmac in [account, x_utmac]:
        if not utmac:
            continue  # ignore empty utmacs
        # Construct the gif hit url.
        utm_url = "".join(
            (utm_gif_location, "?", "utmwv=", VERSION, "&utmn=",
             get_random_number(), "&utmhn=", quote(domain), "&utmsr=",
             environ['GET'].get('utmsr', ''),
             "&utme=", environ['GET'].get('utme', ''),
             "&utmr=", quote(document_referer), "&utmp=", quote(document_path),
             "&utmac=", utmac, "&utmcc=__utma%3D999.999.999.999.999.1%3B",
             "&utmvid=", visitor_id, "&utmip=",
             get_ip(environ.get("REMOTE_ADDR", ''))))
        logger.debug("utm_url: " + utm_url)
        # send_request_to_google_analytics(utm_url, environ)
    return utm_url
Esempio n. 44
0
 def run(self, method, path, query_string, req_protocol, headers, rfile):
     """Process the Request. (Core)

     method, path, query_string, and req_protocol should be pulled directly
         from the Request-Line (e.g. "GET /path?key=val HTTP/1.0").
     path should be %XX-unquoted, but query_string should not be.
         They both MUST be byte strings, not unicode strings.
     headers should be a list of (name, value) tuples.
     rfile should be a file-like object containing the HTTP request entity.

     When run() is done, the returned object should have 3 attributes:
       status, e.g. "200 OK"
       header_list, a list of (name, value) tuples
       body, an iterable yielding strings

     Consumer code (HTTP servers) should then access these response
     attributes to build the outbound stream.

     Exceptions listed in self.throws are always re-raised. Any other
     exception is re-raised only when self.throw_errors is true; otherwise
     it is logged and replaced by a bare error response. Raises
     cherrypy.TimeoutError when response.timed_out is set.
     """
     response = cherrypy.serving.response
     self.stage = 'run'
     try:
         # Default error responder: the set_response method of a 500 error.
         self.error_response = cherrypy.HTTPError(500).set_response

         self.method = method
         path = path or "/"
         self.query_string = query_string or ''
         self.params = {}

         # Compare request and server HTTP protocol versions, in case our
         # server does not support the requested protocol. Limit our output
         # to min(req, server). We want the following output:
         #     request    server     actual written   supported response
         #     protocol   protocol  response protocol    feature set
         # a     1.0        1.0           1.0                1.0
         # b     1.0        1.1           1.1                1.0
         # c     1.1        1.0           1.0                1.0
         # d     1.1        1.1           1.1                1.1
         # Notice that, in (b), the response will be "HTTP/1.1" even though
         # the client only understands 1.0. RFC 2616 10.5.6 says we should
         # only return 505 if the _major_ version is different.
         # Characters 5 and 7 of "HTTP/x.y" are the single-digit major and
         # minor version numbers.
         rp = int(req_protocol[5]), int(req_protocol[7])
         sp = int(self.server_protocol[5]), int(self.server_protocol[7])
         self.protocol = min(rp, sp)
         response.headers.protocol = self.protocol

         # Rebuild first line of the request (e.g. "GET /path HTTP/1.0").
         url = path
         if query_string:
             url += '?' + query_string
         self.request_line = '%s %s %s' % (method, url, req_protocol)

         # Keep the raw header pairs; self.headers starts out empty here.
         self.header_list = list(headers)
         self.headers = httputil.HeaderMap()

         self.rfile = rfile
         self.body = None

         self.cookie = SimpleCookie()
         self.handler = None

         # path_info should be the path from the
         # app root (script_name) to the handler.
         self.script_name = self.app.script_name
         self.path_info = pi = path[len(self.script_name):]

         self.stage = 'respond'
         self.respond(pi)

     except self.throws:
         # Exception types listed in self.throws always propagate.
         raise
     except:
         if self.throw_errors:
             raise
         else:
             # Failure in setup, error handler or finalize. Bypass them.
             # Can't use handle_error because we may not have hooks yet.
             cherrypy.log(traceback=True, severity=40)
             if self.show_tracebacks:
                 body = format_exc()
             else:
                 body = ""
             r = bare_error(body)
             response.output_status, response.header_list, response.body = r

     if self.method == "HEAD":
         # HEAD requests MUST NOT return a message-body in the response.
         response.body = []

     # A failure while writing the access log must not break the response;
     # it is recorded in the error log instead.
     try:
         cherrypy.log.access()
     except:
         cherrypy.log.error(traceback=True)

     if response.timed_out:
         raise cherrypy.TimeoutError()

     return response
Esempio n. 45
0
 def COOKIES(self):
     """Return the cookie jar, creating an empty SimpleCookie if needed.

     A new jar is created whenever the stored one is false, which includes
     a jar that exists but holds no cookies.
     """
     if self._COOKIES:
         return self._COOKIES
     self._COOKIES = SimpleCookie()
     return self._COOKIES
Esempio n. 46
0
 def __init__(self, input=None):
     """Initialise the cookie by passing input straight to SimpleCookie."""
     SimpleCookie.__init__(self, input)
Esempio n. 47
0
 def __init__(self):
     """Start with an empty SimpleCookie stored in self._cookies."""
     self._cookies = SimpleCookie()
Esempio n. 48
0
class Retriever(object):
    """HTTP client built on httplib2.

    Sends a configurable User-Agent and the cookies held in self._cookies,
    can pause between requests, and tries each request up to three times.
    """
    def __init__(self, user_agent=None, timeout=20, sleep=0, disable_ssl_certificate_validation=True):
        """Create the underlying httplib2.Http object and reset state.

        user_agent: User-Agent header value; a fixed MSIE string is used
            when none is given.
        timeout: passed to httplib2.Http.
        sleep: the number of seconds to sleep between requests.
        disable_ssl_certificate_validation: passed to httplib2.Http.
        """
        # sleep should be the number of seconds to sleep between requests.
        self.h = httplib2.Http(timeout=timeout, disable_ssl_certificate_validation=disable_ssl_certificate_validation)
        self.h.force_exception_to_status_code = False
        self.h.follow_redirects = False
        self.user_agent = user_agent or 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)'
        self._cookies = SimpleCookie()
        self.logger = logging.getLogger('eb.retrieval.retriever')
        self.sleep = sleep

        # Keep track of whether we've downloaded any pages yet.
        # This makes sure we don't sleep before the very first requested page.
        self.page_downloaded = False

    def clear_cookies(self):
        """Discard all stored cookies by installing a fresh SimpleCookie."""
        self._cookies = SimpleCookie()

    def get_html_and_headers(self, uri, data=None, headers=None, send_cookies=True, follow_redirects=True, raise_on_error=True, basic_auth=None):
        """Retrieves the resource and returns a tuple of (content, header dictionary).

        uri: the address to request.
        data: request payload; a dict is URL-encoded. Any true value turns
            the request into a POST, otherwise a GET is sent.
        headers: optional dict of request headers. Note that a dict passed
            in by the caller is modified in place.
        send_cookies: when true and cookies are stored, a Cookie header is
            added.

        Raises RetrievalError when the server cannot be found.

        NOTE(review): the code visible here ends after the retry loop, so
        nothing is returned, and follow_redirects, raise_on_error and
        basic_auth are never used. The method looks truncated; confirm
        against the full source.
        """
        # Sleep, if necessary, but only if a page has already been downloaded
        # with this retriever. (We don't want to sleep before the very first
        # request that a retriever makes, because that would be unnecessary.)
        if self.sleep and self.page_downloaded:
            self.logger.debug('Sleeping for %s seconds', self.sleep)
            time.sleep(self.sleep)
        self.page_downloaded = True

        # Prepare the request.
        if not headers:
            headers = {}
        # A 'user-agent' supplied by the caller wins over the default.
        headers['user-agent'] = headers.get('user-agent', self.user_agent)

        # Take care of cookie header, if necessary.
        if send_cookies and self._cookies:
            # Some broken ASP.NET servers put "\r\n" in there, so we replace
            # that with semicolon to get proper behavior.
            headers['Cookie'] = self._cookies.output(attrs=[], header='').strip().replace('\r\n', ';')

        method = data and "POST" or "GET"
        body = urlencode(data) if isinstance(data, dict) else data
        # Form encoding is the default content type for a POST with a body.
        if method == "POST" and body and 'Content-Type' not in headers:
            headers.setdefault('Content-Type', 'application/x-www-form-urlencoded')

        # Get the response.
        resp_headers = None
        # Up to three attempts: a 500 status, a timeout or a socket error
        # leads to another try; any other outcome ends the loop.
        for attempt_number in range(3):
            self.logger.debug('Attempt %s: %s', attempt_number + 1, method)
            try:
                resp_headers, content = self.h.request(uri, method, body=body, headers=headers)
                if resp_headers['status'] == '500':
                    self.logger.debug("Request got a 500 error: %s", method)
                    continue # Try again.
                break
            except socket.timeout:
                self.logger.debug("Request timed out after %s seconds: %s ", self.h.timeout, method)
                continue # Try again.
            except socket.error, e:
                self.logger.debug("Got socket error: %s", e)
                continue # Try again.
            except httplib2.ServerNotFoundError:
                # Retrying cannot help when the host does not resolve.
                raise RetrievalError("Could not %s : server not found" % (method))
Esempio n. 49
0
    def convertAsync(self, config, connectionSettings=None):
        """Start an asynchronous conversion and return its document ID.

        The configuration is sent as JSON in a POST to
        ``<self.url>/convert/async.json``.

        :param config: conversion configuration dict, or None. When given,
            its 'clientName' and 'clientVersion' entries are set in place.
        :param connectionSettings: optional dict. Its 'headers' entries are
            sent with the request, except Content-Type, Range and
            User-Agent, which are always replaced by the client's own
            values. Its 'cookies' entries are sent as a Cookie header, and
            cookies set by the server are written back into it. The
            'headers' dict itself is never modified.
        :return: the text after the last "/" of the Location response
            header, or None when the response has no such header.
        :raises Exception: with a descriptive message when the service
            reports an error or cannot be reached.
        """
        # Function-scope imports keep the method usable on Python 2 and 3.
        if sys.version_info[0] == 2:
            from Cookie import SimpleCookie
            from urllib2 import HTTPError, Request, urlopen
        else:
            from http.cookies import SimpleCookie
            from urllib.error import HTTPError
            from urllib.request import Request, urlopen

        if config is not None:
            config['clientName'] = "PYTHON"
            config['clientVersion'] = PDFreactor.VERSION

        url = self.url + "/convert/async.json"
        if self.apiKey is not None:
            url += '?apiKey=' + self.apiKey

        # Always work on a fresh dict so the caller's header dict is left
        # untouched, even when it is empty.
        headers = {}
        if connectionSettings is not None and 'headers' in connectionSettings:
            for (key, value) in connectionSettings['headers'].items():
                if key.lower() not in ("content-type", "range", "user-agent"):
                    headers[key] = value
        headers['Content-Type'] = 'application/json'
        if connectionSettings is not None and 'cookies' in connectionSettings:
            headers['Cookie'] = '; '.join([
                '%s=%s' % (key, value)
                for (key, value) in connectionSettings['cookies'].items()
            ])
        headers['User-Agent'] = 'PDFreactor Python API v7'
        headers['X-RO-User-Agent'] = 'PDFreactor Python API v7'

        # Stays None when an HTTPError is swallowed below, so the code after
        # the try block never touches an unbound name.
        response = None
        try:
            payload = json.dumps(config)
            if sys.version_info[0] != 2:
                # urllib.request needs the request body as bytes.
                payload = payload.encode()
            response = urlopen(Request(url, payload, headers))
        except HTTPError as e:
            def server_error():
                # Error details arrive as a JSON body with an 'error' member.
                return json.loads(e.read().decode('utf-8'))['error']

            if e.code == 422:
                raise Exception(server_error())
            elif e.code == 400:
                raise Exception('Invalid client data. ' + server_error())
            elif e.code == 401:
                raise Exception('Unauthorized. ' + server_error())
            elif e.code == 413:
                raise Exception('The configuration is too large to process.')
            elif e.code == 500:
                raise Exception(server_error())
            elif e.code == 503:
                raise Exception('Asynchronous conversions are unavailable.')
            elif e.code > 400:
                raise Exception('PDFreactor Web Service error (status: ' +
                                str(e.code) + ').')
        except Exception as e:
            # Prefer the 'reason' attribute when the exception has one.
            msg = getattr(e, 'reason', e)
            raise Exception(
                'Error connecting to PDFreactor Web Service at ' + self.url +
                '. Please make sure the PDFreactor Web Service is installed and running (Error: '
                + str(msg) + ')')

        documentId = None
        if response is not None and response.info() is not None:
            info = response.info()
            location = info.get("Location")
            if location is not None:
                documentId = location[location.rfind("/") + 1:]
            cookieHeader = info.get("Set-Cookie")
            if cookieHeader is not None and connectionSettings is not None:
                if 'cookies' not in connectionSettings:
                    connectionSettings['cookies'] = {}
                # Remember the server's cookies for follow-up requests.
                cookiesObj = SimpleCookie()
                cookiesObj.load(cookieHeader)
                for name in cookiesObj:
                    connectionSettings['cookies'][name] = cookiesObj[
                        name].value
        return documentId
Esempio n. 50
0
 def __init__(self, host, port=None):
     """Remember host and port; start with no response and no cookies."""
     self.host = host
     self.port = port
     self.response = None
     self.cookies = SimpleCookie()
Esempio n. 51
0
    def get_cookie(self):
        """Return the request cookies as a SimpleCookie.

        Returns None when the environ has no 'HTTP_COOKIE' entry.
        """
        if 'HTTP_COOKIE' not in self.environ:
            return None

        return SimpleCookie(self.environ['HTTP_COOKIE'])
Esempio n. 52
0
 def test_signed_out_user_is_anonymous(self):
     """A user must report ANON as true after signing out."""
     self.make_participant('alice')
     alice = User.from_username('alice')
     # Precondition: a freshly loaded user is not anonymous.
     assert not alice.ANON
     # sign_out is given an empty cookie jar.
     alice.sign_out(SimpleCookie())
     assert alice.ANON
Esempio n. 53
0
 def cookies(self):
     """ Return the request cookies as a :class:`FormsDict` of name/value
         pairs. Signed cookies are NOT decoded; use :meth:`get_cookie` if
         you expect signed cookies. """
     jar = SimpleCookie(self.environ.get('HTTP_COOKIE', ''))
     pairs = ((morsel.key, morsel.value) for morsel in jar.values())
     return FormsDict(pairs)
Esempio n. 54
0
 def __init__(self):
     """Initialise an empty cookie with blank token, userid and passport."""
     SimpleCookie.__init__(self)
     self.token = ''
     self.userid = ''
     self.passport = ''
     # Starts out False; nothing in this method sets it to True.
     self._loaded = False
Esempio n. 55
0
 def __init__(self, *args, **kwargs):
     """Forward all arguments to the parent, then add an empty cookie jar."""
     super(RpcHttpResponse, self).__init__(*args, **kwargs)
     self.cookies = SimpleCookie()
Esempio n. 56
0
class HTTPBase(object):
    """Small HTTP client that keeps cookies between requests.

    Requests are sent with ``requests.request``. Cookies received in
    Set-Cookie headers are stored in a cookie jar and sent again with
    later requests.
    """

    def __init__(self,
                 verify=True,
                 ca_bundle=None,
                 key_file=None,
                 cert_file=None):
        """Prepare the default request arguments and an empty cookie jar.

        :param verify: value of the "verify" request argument.
        :param ca_bundle: when given, used as "verify" instead.
        :param key_file: when given, (cert_file, key_file) is sent as the
            "cert" request argument.
        :param cert_file: certificate file paired with key_file.
        """
        self.request_args = {
            "allow_redirects": False,
        }
        self.cookies = {}
        self.cookiejar = cookielib.CookieJar()

        self.request_args["verify"] = verify
        if ca_bundle:
            self.request_args["verify"] = ca_bundle
        if key_file:
            self.request_args["cert"] = (cert_file, key_file)

        self.sec = None

    def _cookies(self):
        """Return every cookie in the jar as a flat {name: value} dict.

        The jar is organised in three levels of nested mappings; when the
        same cookie name occurs more than once, the last one visited wins.
        """
        cookie_dict = {}

        for by_path in list(self.cookiejar._cookies.values()):
            for by_name in list(by_path.values()):
                for cookie in list(by_name.values()):
                    cookie_dict[cookie.name] = cookie.value

        return cookie_dict

    def set_cookie(self, kaka, request):
        """Store the cookies of a parsed Set-Cookie header in the jar.

        A cookie whose max-age is zero is removed from the jar instead of
        being stored.

        :param kaka: mapping of cookie name to morsel (e.g. a SimpleCookie).
        :param request: not used by this method.
        """
        # default rfc2109=False
        # max-age, httponly
        for cookie_name, morsel in kaka.items():
            std_attr = ATTRS.copy()
            std_attr["name"] = cookie_name

            # Strip one pair of surrounding double quotes from the value.
            coded = morsel.coded_value
            if coded.startswith('"') and coded.endswith('"'):
                std_attr["value"] = coded[1:-1]
            else:
                std_attr["value"] = coded

            std_attr["version"] = 0
            # copy attributes that have values
            for attr in morsel.keys():
                if attr in ATTRS:
                    if morsel[attr]:
                        if attr == "expires":
                            std_attr[attr] = _since_epoch(morsel[attr])
                        else:
                            std_attr[attr] = morsel[attr]
                elif attr == "max-age":
                    if morsel["max-age"]:
                        std_attr["expires"] = _since_epoch(morsel["max-age"])

            # Set the companion flag named by PAIRS for each attribute that
            # has a value.
            for att, flag in PAIRS.items():
                if std_attr[att]:
                    std_attr[flag] = True

            if std_attr["domain"] and std_attr["domain"].startswith("."):
                std_attr["domain_initial_dot"] = True

            # Compare by value: a parsed header yields the string "0",
            # which an identity test against the integer 0 never matches.
            if str(morsel["max-age"]) == "0":
                try:
                    self.cookiejar.clear(domain=std_attr["domain"],
                                         path=std_attr["path"],
                                         name=std_attr["name"])
                except (KeyError, ValueError):
                    # clear() raises KeyError when the cookie is not stored;
                    # there is nothing to remove in that case.
                    pass
            else:
                new_cookie = cookielib.Cookie(**std_attr)

                self.cookiejar.set_cookie(new_cookie)

    def send(self, url, method="GET", **kwargs):
        """Send a request and remember any cookies in the response.

        :param url: address to request.
        :param method: HTTP method name.
        :param kwargs: extra arguments for ``requests.request``; they
            override the defaults held in self.request_args.
        :return: the response object from ``requests.request``.
        :raises ConnectionError: when the connection cannot be made.
        """
        _kwargs = copy.copy(self.request_args)
        if kwargs:
            _kwargs.update(kwargs)

        if self.cookiejar:
            _kwargs["cookies"] = self._cookies()
            #logger.info("SENT COOKIEs: %s" % (_kwargs["cookies"],))
        try:
            r = requests.request(method, url, **_kwargs)
        except requests.ConnectionError as exc:
            raise ConnectionError("%s" % exc)

        try:
            #logger.info("RECEIVED COOKIEs: %s" % (r.headers["set-cookie"],))
            raw_cookies = r.headers["set-cookie"]
        except (AttributeError, KeyError):
            # No Set-Cookie header in this response: nothing to store.
            return r

        try:
            self.set_cookie(SimpleCookie(raw_cookies), r)
        except AttributeError:
            # Storing cookies stays best-effort, as before.
            pass

        return r
Esempio n. 57
0
 def __init__(self):
     """Log the local variables at debug level and create an empty cookie jar."""
     logger.debug(locals())
     self._cookies = SimpleCookie()
Esempio n. 58
0
class Scraper(object):
    primary_key = UNDEFINED  # Set to None if the scraper should never look for existing records.
    schema = None
    sleep = 0
    timeout = 20
    update = True  # Whether to update old records.

    # If a record is found with an item_date more than 14 days old, the
    # pub_date will be set to the item_date. Set to None to bypass this
    # behavior for an individual scraper (i.e., always set pub_date to
    # the time the scraper runs).
    fresh_days = 14

    # Set this to False for data sets where we don't have a natural item_date.
    # In this case, the scraper will set it to datetime.date.today() the first
    # time it sees a record, and it will stay that way for subsequent updates
    # to the same record.
    item_date_available = True

    # Hook for setting NewsItem.title from values in datadict. By default, just
    # use the dictionary key 'title'.
    # Other examples:
    #   "{secondary_type.name}" (for a lookup object)
    #   "News at {street_number} {street_name} {street_suffix}"
    title_format = u'{title}'

    def __init__(self, retriever=None):
        """
        Set up logging, caches, the HTTP client and the cookie store.

        `retriever` is stored on the instance unchanged.
        """
        self.retriever = retriever
        self.logger = logging.getLogger('eb.retrieval.%s' % self.schema)
        self.clear_cache()
        # Use the class-level `timeout` so that subclasses overriding it
        # actually change the HTTP client's timeout.
        # NOTE(review): SSL certificate validation is switched off here.
        self.h = httplib2.Http(timeout=self.timeout,
                               disable_ssl_certificate_validation=True)
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)'
        self._cookies = SimpleCookie()
        # get_html_and_headers() reads this flag before the first download,
        # so it has to exist from the start.
        self.page_downloaded = False
        # Raw string: the pattern contains backslash escapes meant for `re`.
        module_name = re.match(r'^everyblock\.(.*)\.scrape$', self.__module__)
        self.scraper_name = module_name.group(1) if module_name else None
        self.is_dry_run = False
        self.num_geocode_attempted = 0
        self.num_geocode_succeeded = 0

    def clear_cache(self):
        """
        Reset the cached objects, delete every tracked temporary file and
        then call the cleanup() hook.
        """
        for cache_attr in ('_metro_object_cache', '_schema_fields_cache',
                           '_schema_object_cache'):
            setattr(self, cache_attr, None)
        self.created_newsitem_ids = []
        # _temp_files may not exist yet (e.g. when called from __init__).
        for stale_path in getattr(self, '_temp_files', ()):
            os.unlink(stale_path)
        self._temp_files = []
        self.cleanup()

    def cleanup(self):
        """
        Hook that lets a scraper tidy up after itself; the base version
        does nothing.

        clear_cache() calls it, which happens in __init__ and at the end of
        run() and dry_run_iter().
        """
        return None

    def dry_run(self, prepare=False, save_ungeocoded_addresses=False):
        """
        Drive dry_run_iter() to completion and throw away what it yields.

        Both arguments are forwarded unchanged to dry_run_iter(): with
        `prepare` set, each data dictionary goes through prepare_data(); a
        truthy `save_ungeocoded_addresses` turns on the logging of failed
        geocodes and the geocoding report.
        """
        items = self.dry_run_iter(prepare, save_ungeocoded_addresses)
        for _ignored in items:
            continue

    def dry_run_iter(self, prepare=False, save_ungeocoded_addresses=False):
        """
        Generator version of dry_run(): pretty-prints and yields every data
        dictionary produced by data(), optionally passed through
        prepare_data() first.

        clear_cache() runs when the iteration ends, whether normally or
        through an error. The geocoding summary (and the geocoding report,
        if requested) is produced only after data() has been exhausted.
        """
        from pprint import pprint as show
        self.is_dry_run = True
        self.cache_retriever = CacheRetriever(self)
        started = datetime.datetime.now()
        self.start_time = started
        self.start_date = started.date()
        if save_ungeocoded_addresses:
            # Swap in the logging geocoder so that failures get collected.
            self.ungeocoded_addresses = {}
            self.geocode = self.geocode_and_log

        try:
            for record in self.data():
                shown = self.prepare_data(record) if prepare else record
                show(shown)
                yield shown
        finally:
            self.clear_cache()

        summary = 'Geocoding succeeded/attempted: {0}/{1}'.format(
            self.num_geocode_succeeded, self.num_geocode_attempted)
        self.logger.info(summary)
        if save_ungeocoded_addresses:
            self.create_geocoding_report()

    def run(self, raise_errors=True):
        """
        Scrape everything data() yields, save each item and write the saved
        items as JSON to '<schema>-data.txt'.

        A Results object describing the run is built in all cases. It is
        returned when the run succeeds, or when it fails and `raise_errors`
        is false. With `raise_errors` true (the default) the original
        exception propagates after the bookkeeping is done.
        """
        self.logger.info("run() started")
        self.num_added = self.num_changed = 0
        self.start_time = datetime.datetime.now()
        self.start_date = self.start_time.date()
        results = None

        filename = self.schema + '-data.txt'
        ni = []
        got_error = True
        exc_info = (None, None, None)
        try:
            for datadict in self.data():
                ni.append(self.save(datadict))
            with open(filename, 'w+') as outfile:
                json.dump(ni, outfile)
            # Only a run that also managed to write its output is clean.
            got_error = False
        except Exception:
            # Capture the failure while it is still being handled; by the
            # time the finally block runs it may no longer be available.
            exc_info = sys.exc_info()
            if raise_errors:
                raise
        finally:
            if got_error and exc_info[0] is None:
                # Something that is not an Exception (e.g. KeyboardInterrupt)
                # is passing through; still record it.
                exc_info = sys.exc_info()
            finish_time = datetime.datetime.now()
            self.clear_cache()

            results = Results(
                schema=self.schema,
                update_start=self.start_time,
                update_finish=finish_time,
                num_added=self.num_added,
                num_changed=self.num_changed,
                num_skipped=0,
                num_hidden=0,
                got_error=got_error,
                traceback=''.join(traceback.format_exception(*exc_info)),
                num_geocode_succeeded=self.num_geocode_succeeded,
                num_geocode_attempted=self.num_geocode_attempted,
            )

            self.logger.info('Records added: %s', self.num_added)
            self.logger.info('Records changed: %s', self.num_changed)
            self.logger.info('Geocoding succeeded/attempted: {0}/{1}'.format(
                self.num_geocode_succeeded, self.num_geocode_attempted))
        return results

    def prepare_data(self, datadict):
        """
        Fill in the defaults a news item needs and return `datadict`
        (the dictionary is modified in place).

        - 'description' and 'url' default to ''.
        - 'pub_date' defaults to self.start_time; if an 'item_date' is
          present and at least `fresh_days` days older than
          self.start_date, 'pub_date' becomes midnight of that item_date.
        - 'location' is always reset to None.
        - 'title' is rendered from `title_format` and shortened with a
          trailing '...' when it would exceed 255 characters.
        """
        datadict.setdefault('description', '')
        datadict.setdefault('url', '')
        datadict.setdefault('pub_date', self.start_time)

        # item_date may legitimately be missing at this point: scrapers with
        # item_date_available = False only get it assigned later, in save().
        item_date = datadict.get('item_date')
        if self.fresh_days is not None and item_date is not None:
            if (self.start_date - item_date).days >= self.fresh_days:
                datadict['pub_date'] = datetime.datetime.combine(
                    item_date, datetime.time(0, 0))

        datadict['location'] = None

        TITLE_MAXLENGTH = 255
        title = self.title_format.format(**datadict)
        # Only shorten titles that really do not fit; a title of exactly
        # TITLE_MAXLENGTH characters is kept untouched.
        if len(title) > TITLE_MAXLENGTH:
            title = title[:TITLE_MAXLENGTH - 3] + u'...'
        datadict['title'] = title

        return datadict

    def save(self, datadict):
        """
        Prepare `datadict`, turn it into a news item via create_newsitem()
        and bump self.num_added. Returns whatever create_newsitem() returns.
        """
        prepared = self.prepare_data(datadict)
        # No lookup of an existing record happens here, so there is never a
        # previous item to take the date from.
        previous = None

        # item_date_available == False is handled here rather than in
        # prepare_data() because the previous item is not known there.
        if not self.item_date_available:
            prepared['item_date'] = (datetime.date.today()
                                     if previous is None
                                     else previous.item_date)

        created = self.create_newsitem(prepared)
        self.num_added += 1
        return created

    def create_newsitem(self, datadict):
        """
        Serialize the news-item fields of `datadict` to a JSON string.

        `datadict` must contain 'title', 'description', 'url', 'pub_date',
        'item_date', 'location' and 'location_name'. The two dates are
        stored as their str() form; 'location_id' is always null.
        """
        # Each value is stored as-is. The earlier statement-per-key version
        # ended every line with a comma, which wrapped each value in a
        # 1-tuple and therefore produced one-element JSON lists.
        ni = {
            'title': datadict['title'],
            'description': datadict['description'],
            'url': datadict['url'],
            'pub_date': str(datadict['pub_date']),
            'item_date': str(datadict['item_date']),
            'location': datadict['location'],
            'location_name': datadict['location_name'],
            # Scrapers shouldn't post to locations. In theory.
            'location_id': None,
        }
        return json.dumps(ni)

    def data(self):
        """
        Abstract: subclasses override this to yield one dictionary per
        scraped item. The base version always raises NotImplementedError.

        Required keys in every dictionary:
            title -- string
            location_name -- string
            item_date -- datetime.date (may be left out only when the
                scraper sets item_date_available=False)
        Optional keys:
            url
            location_name_geocoder
        Any other key corresponds to a SchemaField.name. List or dictionary
        values are converted to JSON automatically before being stored in
        the db_attribute table as a string. For a lookup=True SchemaField,
        the value should be the Lookup.code value, not the Lookup ID or
        Lookup object.
        """
        raise NotImplementedError

    def broken(self, message):
        """Abort by raising ScraperBroken carrying `message`."""
        failure = ScraperBroken(message)
        raise failure

    def get(self, uri, *args, **kwargs):
        """
        Fetch `uri`: 'ftp' URLs go to ftp_get() (which receives the parsed
        URL), everything else goes to get_html() together with any extra
        arguments.
        """
        parts = urlparse.urlparse(uri)
        if parts.scheme != 'ftp':
            return self.get_html(uri, *args, **kwargs)
        return self.ftp_get(parts)

    def get_to_file(self, *args, **kwargs):
        """
        Have the retriever download to a local file and return that file's
        name. The name is recorded in self._temp_files, which clear_cache()
        uses to delete the file afterwards.

        NOTE(review): this class defines get_to_file() a second time
        further down; the later definition is the one Python keeps, so this
        version is never the one that gets called.
        """
        local_path = self.retriever.get_to_file(*args, **kwargs)
        # Tracked so the file can be removed once scraping is done.
        self._temp_files.append(local_path)
        return local_path

    def cache_get(self, prefix, suffix, url, make_pretty=False, **kwargs):
        """
        Return the contents found at `url`.

        Outside a dry run this is simply self.get(url, **kwargs); `prefix`,
        `suffix` and `make_pretty` are not used. During a dry run the call
        is handed to the cache retriever with all arguments.
        """
        if not self.is_dry_run:
            return self.get(url, **kwargs)
        retriever = self.cache_retriever
        return retriever.get(prefix, suffix, url,
                             make_pretty=make_pretty, **kwargs)

    def cache_get_to_file(self, prefix, suffix, url, **kwargs):
        """
        Download `url` to a file and return the file's name.

        Outside a dry run this is self.get_to_file(url, **kwargs); during a
        dry run the cache retriever does the download and receives
        `prefix` and `suffix` as well.
        """
        if not self.is_dry_run:
            return self.get_to_file(url, **kwargs)
        retriever = self.cache_retriever
        return retriever.get_to_file(prefix, suffix, url, **kwargs)

    def get_to_file(self, *args, **kwargs):
        """
        Download via get_html() and save the result to a temporary file.

        Returns the full filename of the temporary file. The name is
        recorded in self._temp_files so that clear_cache() deletes the
        file later.
        """
        import os
        from tempfile import mkstemp
        # Download first: if the request fails, no temp file is left behind.
        content = self.get_html(*args, **kwargs)
        fd, name = mkstemp()
        # Register before writing so the file is cleaned up even if the
        # write below fails.
        self._temp_files.append(name)
        with os.fdopen(fd, 'wb') as fp:
            fp.write(content)
        return name

    def get_html(self,
                 uri,
                 data=None,
                 headers=None,
                 send_cookies=True,
                 follow_redirects=True,
                 raise_on_error=True,
                 basic_auth=None):
        """
        Call get_html_and_headers() with the same arguments (passed
        positionally) and return only the first element of its result.
        """
        fetched = self.get_html_and_headers(
            uri, data, headers, send_cookies, follow_redirects,
            raise_on_error, basic_auth)
        return fetched[0]

    def get_html_and_headers(self,
                             uri,
                             data=None,
                             headers=None,
                             send_cookies=True,
                             follow_redirects=True,
                             raise_on_error=True,
                             basic_auth=None):
        """
        Request `uri` and return a (content, response_headers) tuple.

        The request is a POST when `data` is truthy (a dict is URL-encoded)
        and a GET otherwise. It is tried up to three times; timeouts,
        socket errors and HTTP 500 responses trigger a retry. When
        self.sleep is set, every download after the first one is preceded
        by a pause of that many seconds.

        `headers` is copied, so the caller's dictionary is not modified.
        `basic_auth` is an optional (user, password) pair.

        Raises RetrievalError if the server cannot be found or if no
        attempt produced a response.

        NOTE(review): `follow_redirects` and `raise_on_error` are accepted
        but not acted upon by this implementation.
        """
        # The flag may not exist yet on the first download, so default it.
        if self.sleep and getattr(self, 'page_downloaded', False):
            self.logger.debug('Sleeping for %s seconds', self.sleep)
            time.sleep(self.sleep)
        self.page_downloaded = True

        # Prepare the request on a private copy of the headers.
        headers = dict(headers) if headers else {}
        headers.setdefault('user-agent', self.user_agent)

        # Take care of basic auth header, if necessary.
        # See http://en.wikipedia.org/wiki/Basic_access_authentication
        if basic_auth is not None:
            import base64
            credentials = '%s:%s' % (basic_auth[0], basic_auth[1])
            if not isinstance(credentials, bytes):
                credentials = credentials.encode('utf-8')
            # b64encode never inserts line breaks, unlike encodestring,
            # which wraps long input and would corrupt the header.
            token = base64.b64encode(credentials)
            if not isinstance(token, str):
                token = token.decode('ascii')
            headers['Authorization'] = 'Basic %s' % token

        # Take care of cookie header, if necessary.
        if send_cookies and self._cookies:
            # Some broken ASP.NET servers put "\r\n" in there, so we replace
            # that with semicolon to get proper behavior.
            headers['Cookie'] = self._cookies.output(
                attrs=[], header='').strip().replace('\r\n', ';')

        method = "POST" if data else "GET"
        body = urlencode(data) if isinstance(data, dict) else data
        if method == "POST" and body:
            headers.setdefault('Content-Type',
                               'application/x-www-form-urlencoded')

        # Get the response.
        resp_headers = None
        content = None
        for attempt_number in range(3):
            self.logger.debug('Attempt %s: %s %s', attempt_number + 1, method,
                              uri)
            if data:
                self.logger.debug('Data: %r', data)
            if headers:
                self.logger.debug('Headers: %r', headers)
            try:
                resp_headers, content = self.h.request(uri,
                                                       method,
                                                       body=body,
                                                       headers=headers)
                if resp_headers['status'] == '500':
                    self.logger.debug("Request got a 500 error: %s %s", method,
                                      uri)
                    continue  # Try again.
                break
            except socket.timeout:
                self.logger.debug("Request timed out after %s seconds: %s %s",
                                  self.h.timeout, method, uri)
                continue  # Try again.
            except socket.error as e:
                self.logger.debug("Got socket error: %s", e)
                continue  # Try again.
            except httplib2.ServerNotFoundError:
                raise RetrievalError("Could not %s %r: server not found" %
                                     (method, uri))

        # Every attempt failed before a response arrived.
        if resp_headers is None:
            raise RetrievalError("Could not %s %r: no response after 3 attempts" %
                                 (method, uri))
        return content, resp_headers
Esempio n. 59
0
def allow_access(environ, host):
    """
    Decide from the request cookies in `environ` whether access is allowed.

    Returns True when the 'secretkey' cookie equals settings.SECRET_KEY, or
    when the session cookie names a logged-in session that has not expired
    yet; returns False otherwise. Expiry dates of sessions found in the
    database are remembered in allow_access.validSessions, which is emptied
    whenever allow_access.nextCheck has passed. `host` is not used.
    """
    try:
        cookie = SimpleCookie(environ['HTTP_COOKIE'])
    except KeyError:
        # No cookie means no permission.
        return False

    # Special override for INTERNAL services, NOT FOR WEB SERVICES!
    # There should be no way for them to have the secret key.
    secret_morsel = cookie.get('secretkey')
    if secret_morsel is not None and secret_morsel.value == settings.SECRET_KEY:
        # TODO: add a trusted-IPs env var and make sure it's one of those
        return True

    session_morsel = cookie.get(settings.SESSION_COOKIE_NAME)
    if session_morsel is None:
        # No session id -> not logged in -> access denied right away.
        return False
    sessionId = session_morsel.value
    now = datetime.datetime.now(tz=pytz.utc)

    if now > allow_access.nextCheck:
        # checkFrequency has passed: start over with an empty cache.
        allow_access.validSessions = {}
        allow_access.nextCheck = now + allow_access.checkFrequency

    known_sessions = allow_access.validSessions
    if sessionId in known_sessions:
        expireTime = known_sessions[sessionId]
    else:
        try:
            db.reset_queries()
            session = Session.objects.get(pk=sessionId)
            if SESSION_KEY not in session.get_decoded():
                # Without SESSION_KEY in its data this is not a logged-in
                # session.
                return False
        except Session.DoesNotExist:
            # Unknown to the session database.
            return False
        finally:
            db.connection.close()

        # Found in the database: remember its expiry date.
        expireTime = session.expire_date
        known_sessions[sessionId] = expireTime

    # The session is valid only while its expiry date lies in the future.
    return expireTime > now
Esempio n. 60
0
def track_page_view(environ):
    """
    Track a page view: report it to Google Analytics for each account id
    in the query string ('utmac' and 'x_utmac') and return the response
    produced by write_gif_data(), extended with a Set-Cookie header for the
    visitor cookie.

    As a side effect, `environ` gains 'COOKIES' (parsed request cookies)
    and 'GET' (query parameters, last value per name wins). When the
    'utmdebug' parameter is set and at least one hit was sent, the last
    hit URL is added as an 'X-GA-MOBILE-URL' header.
    """
    time_tup = time.localtime(time.time() + COOKIE_USER_PERSISTENCE)
    # Set some useful items in environ.
    environ['COOKIES'] = parse_cookie(environ.get('HTTP_COOKIE', ''))
    environ['GET'] = {}
    for key, value in parse_qsl(environ.get('QUERY_STRING', ''), True):
        environ['GET'][key] = value  # one value per key name
    x_utmac = environ['GET'].get('x_utmac', None)

    domain = environ.get('HTTP_X_FORWARDED_HOST',
                         environ.get('HTTP_HOST', ''))

    # Get the referrer from the utmr parameter; this is the referrer to the
    # page that contains the tracking pixel, not the referrer for the
    # tracking pixel itself.
    document_referer = environ['GET'].get("utmr", "")
    if not document_referer or document_referer == "0":
        document_referer = "-"
    else:
        document_referer = unquote(document_referer)

    document_path = environ['GET'].get('utmp', "")
    if document_path:
        document_path = unquote(document_path)

    account = environ['GET'].get('utmac', '')
    user_agent = environ.get("HTTP_USER_AGENT", '')

    # Try and get the visitor cookie from the request.
    cookie = environ['COOKIES'].get(COOKIE_NAME)

    visitor_id = get_visitor_id(environ.get("HTTP_X_DCMGUID", ''), account,
                                user_agent, cookie)

    # Always try and add the cookie to the response.
    cookie = SimpleCookie()
    cookie[COOKIE_NAME] = visitor_id
    morsel = cookie[COOKIE_NAME]
    morsel['expires'] = time.strftime('%a, %d-%b-%Y %H:%M:%S %Z', time_tup)
    morsel['path'] = COOKIE_PATH

    utm_gif_location = "http://www.google-analytics.com/__utm.gif"
    client_ip = environ.get("HTTP_X_FORWARDED_FOR",
                            environ.get("REMOTE_ADDR", ""))
    # Stays None when no account id was supplied, i.e. when no hit is sent.
    utm_url = None
    for utmac in [account, x_utmac]:
        if not utmac:
            continue  # ignore empty utmacs
        # Construct the gif hit url.
        utm_url = utm_gif_location + "?" + \
                "utmwv=" + VERSION + \
                "&utmn=" + get_random_number() + \
                "&utmhn=" + quote(domain) + \
                "&utmsr=" + environ['GET'].get('utmsr', '') + \
                "&utme=" + environ['GET'].get('utme', '') + \
                "&utmr=" + quote(document_referer) + \
                "&utmp=" + quote(document_path) + \
                "&utmac=" + utmac + \
                "&utmcc=__utma%3D999.999.999.999.999.1%3B" + \
                "&utmvid=" + visitor_id + \
                "&utmip=" + get_ip(client_ip)
        dbgMsg("utm_url: " + utm_url)
        send_request_to_google_analytics(utm_url, environ)

    # OutputString() is the header value itself, so there is no need to
    # cut it out of the "Set-Cookie: ..." text form.
    headers = [('Set-Cookie', morsel.OutputString())]
    # If the debug parameter is on, add a header to the response that
    # contains the url that was used to contact Google Analytics. Without
    # an account id no url exists, so there is nothing to report.
    if environ['GET'].get('utmdebug', False) and utm_url is not None:
        headers.append(('X-GA-MOBILE-URL', utm_url))

    # Finally write the gif data to the response.
    response = write_gif_data()
    response_headers = response['response_headers']
    response_headers.extend(headers)
    return response