def process(self, req_type):
    print self.requestline
    if req_type == 2 and self.path.startswith("/uid_idx"):
        query = urllib.splitquery(self.path)[1].split("&")
        print "---", query
        uid = urllib.splitvalue(query[1])[1]
        f_url = urllib.splitvalue(query[0])[1]
        print "---url=", f_url
        print "---uid=", uid
        resp_uid = "9bd2936001794d0381d76a37b74a6dff"
        uid_idx = 1
        content = {
            # "uid": int(2),
            "uid": str(resp_uid),
            "uid_idx": int(uid_idx)
        }
        print "***resp", content
        # url, uid = query.split("&")
        # print url, uid
        # time.sleep(1)
        content = json.dumps(content)
        f = io.BytesIO()
        f.write(content)
        f.seek(0)
        self.send_response(400)
        # self.send_header("Content-type", "application/json")
        self.send_header("Content-Length", str(len(content)))
        # self.end_headers()
        shutil.copyfileobj(f, self.wfile)
def __init__(self, soup, parent=False):
    """
    Parse the form attributes and fields from the soup. Make sure to get
    the action right. When parent is set, the parent element is used as
    the anchor for the search for form elements.
    """
    self._extra_args = {}
    self.soup = soup
    # Make sure to use base strings, not unicode
    for attr, value in soup.attrMap.iteritems():
        setattr(self, str(attr), str(value))
    # Set the right anchor point for the harvest
    if parent:
        self.soup = soup.parent
    # Harvest input elements. Initialize missing values to '' so that
    # None does not appear during submit.
    self._args = {}
    for item in self.soup.findAll("input"):
        self._args[str(item.get("name"))] = item.get("value") or ""
    # Harvest url
    self.scheme, self.host, self.action = urlsplit(self.action)
    self.action, args = urllib.splitquery(self.action)
    if args:
        args = args.split("&")
        for arg in args:
            attr, value = urllib.splitvalue(arg)
            self._extra_args[str(attr)] = value or ""
def do_GET(self):
    log = open("/tmp/log.txt", "w")
    log.write("Calling Handler %s\n" % self.path)
    try:
        _, query_args = urllib.splitquery(self.path)
        arguments = dict([
            urllib.splitvalue(query_arg)
            for query_arg in query_args.split('&')
        ])
        session_id = arguments.get('sessionid')
        log.write("Session id: %s\n" % session_id)
        if session_id is None:
            self.send_error(400)
            self.end_headers()
            self.wfile.write("fail: sessionid argument is required")
        else:
            try:
                log.write("Changing password...\n")
                change_password(session_id)
                log.write("Password changed\n")
            except Exception, e:
                log.write("There was an error %s\n" % e)
                traceback.print_exc(file=log)
                traceback.print_exc()
                self.send_error(500)
                self.end_headers()
                self.wfile.write("Internal error: %s" % str(e))
            else:
                log.write("Sending 'ok'...\n")
def extract_query(query_str):
    querys = {}
    for i in query_str.split('&'):
        key_value = urllib.splitvalue(i)
        querys[key_value[0]] = key_value[1]
    return querys
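A minimal usage sketch for the helper above (the query string and values are made up for illustration; urllib.splitvalue is the Python 2 API, removed in Python 3):

import urllib

# Hypothetical query string, purely for illustration.
query = "uid=42&name=alice"
params = extract_query(query)
print params["uid"]     # '42'
print params["name"]    # 'alice'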
def create(self, method, input, component_name):
    """
    Accept a resource creation form for a specified component.
    """
    if not input:
        return self.error_return(component_name, "Need form input!")
    elems = input.split("&")
    d = dict()
    for e in elems:
        name, value = urllib.splitvalue(e)
        value = urllib.unquote_plus(value)
        path_elems = name.split("__")
        d2 = d
        for i, pe in enumerate(path_elems):
            if i < len(path_elems) - 1:
                # More elements to come later? We must create a dict
                d2 = d2.setdefault(pe, dict())
            else:
                if value:
                    d2[pe] = value
    try:
        ret_msg = makeResource(component_name, d)
    except RestxException, e:
        return self.error_return(component_name, e.msg)
def do_GET(self):
    if re.match("/images/[bcdWDF][123456789eswnrgw]\.png", self.path):
        self.send_response(200)
        self.send_header('Content-Type', 'image/png')
        self.send_header('Cache-Control', 'max-age=86400, must-revalidate')
        self.end_headers()
        from os import curdir, sep
        filename = curdir + sep + self.path
        print filename
        f = open(curdir + sep + self.path, 'rb')
        self.wfile.write(f.read())
        f.close()
    else:
        self.printCustomTextHTTPResponse(200)
        query_string = urllib.unquote_plus(self.path)
        path, query = urllib.splitquery(query_string)
        print query_string, path, query
        parameters = dict(urllib.splitvalue(v) for v in query.split("&")) if query else {}
        print parameters
        if 'sit' in parameters:
            situation = parameters['sit']
        else:
            situation = None
        # else:
        #     situation = query_string[1:]
        print "situation:", situation
        page = get_page(situation)
        self.wfile.write('<html>')
        self.wfile.write(page)
        self.wfile.write('</html>')
def makeReqHeaders(self):
    range_format = self.url.range_format
    Range = (self.progress.begin + self.progress.go_inc, self.progress.end)
    req_path = self.target.path
    req_headers = dict(self.url.headers.items())
    if range_format[0] == '&':
        path, query = splitquery(self.target.path)
        query_dict = extract_query(query)
        range_format = range_format % Range
        for i in range_format[1:].split('&'):
            param_key, param_value = splitvalue(i)
            query_dict[param_key] = param_value
        new_query = urlencode(query_dict)
        req_path = '%s?%s' % (path, new_query)
    else:
        range_field = range_format % Range
        key_value = [i.strip() for i in range_field.split(':')]
        key = key_value[0]
        value = key_value[1]
        add_headers = {key: value, 'Accept-Ranges': 'bytes'}
        req_headers.update(add_headers)
    return req_path, req_headers
def handleTraceback(object):
    context = object.context
    entry_url = object.entry_url
    if entry_url is None:
        return
    LOGGER.info("handle traceback [%s]" % entry_url)
    try:
        cleanup_lock.acquire()
        # we don't want to produce any errors here, thus, we'll be nice and
        # die silently if an error occurs here
        try:
            transaction.begin()
            # get our logbook view to use the api
            logbook = context.unrestrictedTraverse('@@logbook')
            # get the generated error url from Products.SiteErrorLog
            err_id = urllib.splitvalue(entry_url)[1]
            # save error
            logbook.save_error(err_id, context=aq_parent(context))
            transaction.get().note('collective.logbook traceback [%s]' % entry_url)
            transaction.commit()
        finally:
            cleanup_lock.release()
    # only warning
    except Exception, e:
        LOGGER.warning("An error occurred while handling the traceback")
        LOGGER.warning("%s" % e)
        LOGGER.exception(e)
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]
def open_ftp(self, url):
    host, path = urllib.splithost(url)
    if not host:
        raise IOError, ('ftp error', 'no host given')
    host, port = urllib.splitport(host)
    user, host = urllib.splituser(host)
    # if user: user, passwd = splitpasswd(user)
    if user:
        passwd = getpass.getpass()
    else:
        passwd = None
    host = urllib.unquote(host)
    user = urllib.unquote(user or '')
    passwd = urllib.unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = urllib.splitattr(path)
    path = urllib.unquote(path)
    dirs = string.splitfields(path, '/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    key = (user, host, port, string.joinfields(dirs, '/'))
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily
        for k in self.ftpcache.keys():
            if k != key:
                v = self.ftpcache[k]
                del self.ftpcache[k]
                v.close()
    try:
        if not self.ftpcache.has_key(key):
            print 'Creating ftpwrapper: ', user, host, port, dirs
            self.ftpcache[key] = \
                urllib.ftpwrapper(user, passwd, host, port, dirs)
        if not file:
            type = 'D'
        else:
            type = 'I'
        for attr in attrs:
            attr, value = urllib.splitvalue(attr)
            if string.lower(attr) == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = string.upper(value)
        (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
        if retrlen is not None and retrlen >= 0:
            import mimetools, StringIO
            headers = mimetools.Message(StringIO.StringIO(
                'Content-Length: %d\n' % retrlen))
        else:
            headers = noheaders()
        return urllib.addinfourl(fp, headers, "ftp:" + url)
    except urllib.ftperrors(), msg:
        raise IOError, ('ftp error', msg), sys.exc_info()[2]
def _get_youtube_url(self):
    url = self.surl.get_long_url()
    # split 'http://www.youtube.com/v=VIDEOID#tag' -> 'v=VIDEOID#tag'
    query = urllib.splitquery(os.path.split(url)[1])[1]
    # split 'v=VIDEOID#tag' -> 'VIDEOID#tag'
    value = urllib.splitvalue(query)[1]
    # split 'VIDEOID#tag' -> ('VIDEOID', 'tag')
    vid, tag = urllib.splittag(value)
    if tag:
        # split 't=5m2s' -> ('t', '5m2s')
        tag = urllib.splitvalue(tag)[1]
        # convert '5m2s' -> int(302)
        time_offset = Forwarder._friendly_to_seconds(tag)
    else:
        time_offset = 0
    if time_offset > 0:
        return 'http://www.youtube.com/v/%s?start=%s' % (vid, time_offset)
    else:
        return 'http://www.youtube.com/v/%s' % (vid)
def ftp_open(self, req):
    import ftplib
    import mimetypes
    host = req.get_host()
    if not host:
        raise URLError('ftp error: no host given')
    host, port = splitport(host)
    if port is None:
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = user or ''
    passwd = passwd or ''
    try:
        host = socket.gethostbyname(host)
    except socket.error as msg:
        raise URLError(msg)
    path, attrs = splitattr(req.get_selector())
    dirs = path.split('/')
    dirs = map(unquote, dirs)
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    try:
        fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
        type = file and 'I' or 'D'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = value.upper()
        fp, retrlen = fw.retrfile(file, type)
        headers = ''
        mtype = mimetypes.guess_type(req.get_full_url())[0]
        if mtype:
            headers += 'Content-type: %s\n' % mtype
        if retrlen is not None and retrlen >= 0:
            headers += 'Content-length: %d\n' % retrlen
        sf = StringIO(headers)
        headers = mimetools.Message(sf)
        return addinfourl(fp, headers, req.get_full_url())
    except ftplib.all_errors as msg:
        raise URLError, 'ftp error: %s' % msg, sys.exc_info()[2]
    return
def __extract_video_id_from_uri(self, uri):
    """
    GET uri like '/watch?v=AsXf9v&param=1&p=3#junk'
    RETURNS value for 'v' parameter --> 'AsXf9v'
    """
    uri = uri.replace('&', ';')
    uri = uri.replace('?', ';')
    req, params = urllib.splitattr(uri)
    for item in params:
        k, v = urllib.splitvalue(item)
        if k == 'v':
            return v
    raise ValueError("Can't find parameter 'v' from '%s'" % uri)
def get_url(self, id, quality):
    skey, timestamp = self.genkey()
    self.browser.readurl('http://online.nolife-tv.com/_nlfplayer/api/api_player.php',
                         'quality=%s&a=EML&skey=%s&id%%5Fnlshow=%s&timestamp=%s'
                         % (quality, skey, id, timestamp))
    skey, timestamp = self.genkey()
    data = self.browser.readurl('http://online.nolife-tv.com/_nlfplayer/api/api_player.php',
                                'quality=%s&a=UEM%%7CSEM%%7CMEM%%7CCH%%7CSWQ&skey=%s&id%%5Fnlshow=%s&timestamp=%s'
                                % (quality, skey, id, timestamp))
    values = dict([urllib.splitvalue(s) for s in data.split('&')])
    if 'url' not in values:
        return None
    return unicode(values['url'])
def __init__(self, url):
    """
    @param url: AGI request url. See L{FastAGIProtocol.env} for detail.
    """
    scheme, rest = urllib.splittype(url)
    host_and_port, path_and_query = urllib.splithost(rest)
    path, query = urllib.splitquery(path_and_query)
    path = tuple(part for part in path.split('/') if part)
    pairs = query and query.split('&') or ()
    params = dict(urllib.splitvalue(p) for p in pairs)
    self.path = path
    self.params = params
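For reference, a standalone sketch of the same splittype/splithost/splitquery/splitvalue chain used in the constructor above, run against a made-up AGI-style URL:

import urllib

url = 'agi://pbx.example.com:4573/answer/play?file=greeting&loop=1'  # illustrative only
scheme, rest = urllib.splittype(url)              # 'agi', '//pbx.example.com:4573/...'
host_and_port, path_and_query = urllib.splithost(rest)
path, query = urllib.splitquery(path_and_query)   # '/answer/play', 'file=greeting&loop=1'
parts = tuple(p for p in path.split('/') if p)    # ('answer', 'play')
params = dict(urllib.splitvalue(p) for p in (query.split('&') if query else ()))
print parts, params                               # {'file': 'greeting', 'loop': '1'}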
def output(self):
    ret = []
    out_array = self.stdout.split("###")
    # loop through return output
    for out in out_array:
        if out == "":
            continue
        ret.append({})
        attr_list = out.split("&")
        # loop through attribute fields
        for attr in attr_list:
            key, value = urllib.splitvalue(attr)
            # clean up key
            if key == None:
                key = ""
            else:
                key = urllib.unquote(key).replace("+", " ")
            # clean up value
            if value == None:
                value = ""
            else:
                value = urllib.unquote(value).replace("+", " ")
            # determine if it is a list, and if so, treat it as one
            pos = key.find("[")
            if pos != -1 and key[-1:] == "]":
                key = key[:pos]
                # create the list if not created already
                if not ret[-1].has_key(key):
                    ret[-1][key] = []
            # try to cast as an integer, otherwise make it a string
            try:
                value = int(value)
            except ValueError:
                pass
            # store it as a list if necessary, otherwise just plain store it
            try:
                ret[-1][key].append(value)
            except (KeyError, AttributeError):
                ret[-1][key] = value
    # return and quit
    return ret
def makeSocketPacket(self):
    range_format = self.url.range_format
    Range = (self.progress.begin + self.progress.go_inc, self.progress.end)
    add_headers = {
        'Host': self.target.host,
        'Connection': 'keep-alive',
    }
    if range_format[0] == '&':
        path, query = splitquery(self.target.path)
        query_dict = extract_query(query)
        range_format = range_format % Range
        for i in range_format[1:].split('&'):
            param_key, param_value = splitvalue(i)
            query_dict[param_key] = param_value
        new_query = urlencode(query_dict)
        http_head_top = 'GET %s HTTP/1.1\r\n' % ('%s?%s' % (path, new_query))
        packet = http_head_top + '%s\r\n\r\n'
        add_headers = {
            'Host': self.target.host,
            'Connection': 'keep-alive'
        }
    else:
        http_head_top = 'GET %s HTTP/1.1\r\n' % self.target.path
        packet = http_head_top + '%s\r\n\r\n'
        range_field = range_format % Range
        key_value = [i.strip() for i in range_field.split(':')]
        key = key_value[0]
        value = key_value[1]
        add_headers[key] = value
        add_headers['Accept-Ranges'] = 'bytes'
    request_headers = dict(self.url.headers.items())
    request_headers.update(add_headers)
    request_headers_str = []
    for i in request_headers.items():
        request_headers_str.append(': '.join(i))
    packet = packet % '\r\n'.join(request_headers_str)
    return str.encode(str(packet))
def addSubscriber(self, channel, session):
    """
    Subscribe to a channel.

    A subscribe request of /foo?a=b will listen to foo messages if
    attribute a of the message has value "b".
    """
    channel, attrs = urllib.splitquery(channel)
    pattern = {}
    if attrs:
        attr, rest = urllib.splitattr(attrs)
        for attr in [attr] + rest:
            key, value = urllib.splitvalue(attr)
            pattern[key] = value
    self._subscribers.setdefault(channel, []).append((session, pattern))
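A standalone sketch of how the docstring's example channel is decomposed by the calls above (channel and attribute are illustrative; multiple attributes would be ';'-separated, since urllib.splitattr splits on ';'):

import urllib

channel, attrs = urllib.splitquery('/foo?a=b')   # ('/foo', 'a=b')
pattern = {}
if attrs:
    first, rest = urllib.splitattr(attrs)        # ('a=b', [])
    for attr in [first] + rest:
        key, value = urllib.splitvalue(attr)
        pattern[key] = value
print channel, pattern                           # '/foo' {'a': 'b'}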
def testRequestWithParameters(self):
    url = 'http://test'
    params = {'a': 1, 'b': 'b&b'}
    dummy_http_client = DummyHttpClient(0, 404)
    status_code, content = dummy_http_client.Get(url, params=params)
    self.assertEquals(200, status_code)
    self.assertEquals('success - GET', content)
    self.assertEquals(1, dummy_http_client.request_count)
    target_url, query_str = urllib.splitquery(
        dummy_http_client.requests[0]['url'])
    self.assertEquals(url, target_url)
    for query in query_str.split('&'):
        name, value = urllib.splitvalue(query)
        self.assertIn(name, params)
        self.assertEqual(urllib.quote(str(params[name])), value)
def get(self):
    DEFAULT_QUERY_URL = 'http://www.google.com/search?q='
    QUERY_DELIMITER = ' '
    # WORKAROUND (API bug): self.request.get(QUERY_DELIMITER) was broken
    # and stripped all special signs in Firefox
    request_query = urllib.unquote_plus(
        urllib.splitvalue(self.request.query)[1]).split(QUERY_DELIMITER)
    cmd = request_query[0]
    # FIX: if cmd contains special signs, the UnicodeDecodeError exception is issued
    user = users.get_current_user()
    if user is None:
        web_cmd_from_mc = memcache.get(WEB_CMD_OBJS_MEMCACHE_KEY_PREFIX + IS_PUBLIC_CMD + cmd)
    else:
        web_cmd_from_mc = memcache.get(WEB_CMD_OBJS_MEMCACHE_KEY_PREFIX + IS_USER_CMD + str(user) + '_' + cmd)
    if web_cmd_from_mc is None:
        # Web Command is not in the cache
        if user is None:
            web_cmd_from_db = WebCmd().get_public_web_cmd(cmd)
        else:
            web_cmd_from_db = WebCmd().get_user_web_cmd(cmd)
        if web_cmd_from_db is None:
            # default fallback Web Command
            query_url = DEFAULT_QUERY_URL
            cmd_query = request_query
        else:
            # if Web Command exists but is not in the cache
            if user is None:
                web_cmds = WebCmd().get_public_web_cmds()
                Caching().reset_public_web_cmd_cache(web_cmds)
            else:
                web_cmds = WebCmd().get_user_web_cmds()
                Caching().reset_user_web_cmd_cache(web_cmds)
            query_url = web_cmd_from_db.url
            cmd_query = request_query[1:]
    else:
        query_url = web_cmd_from_mc.url
        cmd_query = request_query[1:]
    # self.redirect(query_url + str.join(QUERY_DELIMITER, cmd_query))
    self.redirect(query_url + urllib.quote(str.join(QUERY_DELIMITER, cmd_query)))
def do_GET(self):
    _, query_args = urllib.splitquery(self.path)
    arguments = dict([
        urllib.splitvalue(query_arg)
        for query_arg in query_args.split('&')
    ])
    session_id = arguments.get('sessionid')
    if session_id is None:
        self.send_error(400)
        self.end_headers()
        self.wfile.write("fail: sessionid argument is required")
    else:
        try:
            change_password(session_id)
        except Exception, e:
            traceback.print_exc()
            self.send_error(500)
            self.end_headers()
            self.wfile.write("Internal error: %s" % str(e))
        else:
def getoauth(
        consumer_key="5wEwFCF0rbiHXYZQQeQnNetuwZMmIyrUxIePLqUMcZlheVXwc4",
        consumer_secret="GCLMI2LnMZqO2b5QheRvUSYY51Ujk7nWG2sYroqozW06x4hWch"):
    # oauth_consumer_secret
    request_tokenURL = 'http://www.tumblr.com/oauth/request_token'
    oauth_parameters = {
        'oauth_consumer_key': consumer_key,
        'oauth_nonce': makenonce(),
        'oauth_timestamp': str(int(time.time())),
        'oauth_signature_method': "HMAC-SHA1",
        'oauth_version': "1.0"
    }
    normalized_parameters = encodeparams('&'.join([
        '%s=%s' % (encodeparams(str(k)), encodeparams(str(oauth_parameters[k])))
        for k in sorted(oauth_parameters)
    ]))
    normalized_http_method = 'GET'
    normalized_http_url = encodeparams(request_tokenURL)
    signature_base_string = '&'.join(
        [normalized_http_method, normalized_http_url, normalized_parameters])
    oauth_key = consumer_secret + '&'
    hashed = hmac.new(oauth_key, signature_base_string, hashlib.sha1)
    oauth_parameters['oauth_signature'] = base64.b64encode(hashed.digest())
    oauth_header = ('OAuth realm="http://www.tumblr.com",'
                    + 'oauth_nonce="' + oauth_parameters['oauth_nonce'] + '",'
                    + 'oauth_timestamp="' + oauth_parameters['oauth_timestamp'] + '",'
                    + 'oauth_consumer_key="' + oauth_parameters['oauth_consumer_key'] + '",'
                    + 'oauth_signature_method="HMAC-SHA1",oauth_version="1.0",oauth_signature="'
                    + oauth_parameters['oauth_signature'] + '"')
    req = urllib2.Request(request_tokenURL)
    req.add_header('Authorization', oauth_header)
    tokenstr = urllib2.urlopen(req).read()
    tokens = {}
    for token in tokenstr.split('&'):
        tname, tval = urllib.splitvalue(token)
        tokens.update({tname: tval})
    TUMBLRAUTH.update({
        'oauth_token': tokens.get('oauth_token', ''),
        'oauth_secret': tokens.get('oauth_token_secret', '')
    })
    return TUMBLRAUTH
def gotHeaders(self, headers):
    """ The downloader will feed headers via this function """
    debug(str(self) + ' gotHeaders')
    self.isGzipped = headers.get('content-encoding', [None])[0] == 'gzip'
    # Grab the file name of the NZB via content-disposition header
    keys = headers.keys()
    found = None
    for key in keys:
        if key.lower() == 'content-disposition':
            found = key
            break
    if found is None:
        return
    type, attrs = splitattr(headers[found][0])
    key, val = splitvalue(attrs[0].strip())
    val = val.strip().strip('"')
    if val:
        debug(str(self) + ' gotHeaders: found filename: %s' % val)
        self.nzbFilename = val
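A small standalone sketch of the Content-Disposition parsing used above, with a made-up header value; urllib.splitattr peels off the ';'-separated attributes and urllib.splitvalue splits the filename pair:

import urllib

header_value = 'attachment; filename="latest-post.nzb"'   # illustrative only
disposition, attrs = urllib.splitattr(header_value)
key, val = urllib.splitvalue(attrs[0].strip())
print disposition             # attachment
print key, val.strip('"')     # filename latest-post.nzb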
def extractBugTrackerAndBug(self, url):
    """See `IBugWatchSet`."""
    for trackertype, parse_func in (
            self.bugtracker_parse_functions.items()):
        scheme, host, path, query_string, frag = urlsplit(url)
        query = {}
        for query_part in query_string.split('&'):
            key, value = urllib.splitvalue(query_part)
            query[key] = value
        bugtracker_data = parse_func(scheme, host, path, query)
        if not bugtracker_data:
            continue
        base_url, remote_bug = bugtracker_data
        # Check whether we have a registered bug tracker already.
        bugtracker = getUtility(IBugTrackerSet).queryByBaseURL(base_url)
        if bugtracker is not None:
            return bugtracker, remote_bug
        else:
            raise NoBugTrackerFound(base_url, remote_bug, trackertype)
    raise UnrecognizedBugTrackerURL(url)
def get_repository(self, repos_type, repos_dir, authname):
    assert repos_type == 'perforce'
    import urllib
    urltype, url = urllib.splittype(repos_dir)
    assert urltype == 'p4' or url == 'p4'
    options = dict(self.config.options('perforce'))
    if urltype != None:
        machine, path_query = urllib.splithost(url)
        user_passwd, host_port = urllib.splituser(machine)
        user, password = urllib.splitpasswd(user_passwd)
        self._update_option(options, 'port', host_port)
        self._update_option(options, 'password', password)
        self._update_option(options, 'user', user)
        path, query = urllib.splitquery(path_query)
        if path and path != '':
            for attr in self._splitattributes(query):
                key, val = urllib.splitvalue(attr)
                self._update_option(options, key, val)
            self._update_option(options, 'path', path)
    self.log.debug("get_repository options : %s" % (options))
    if 'port' not in options:
        raise TracError(
            message="Missing 'port' value in [perforce] config section.",
            title="TracPerforce configuration error",
        )
    # Try to connect to the Perforce server
    from perforce import Connection, ConnectionFailed
    p4 = Connection(port=options['port'],
                    api='58',  # Limit to 2005.2 behaviour
                    )
    try:
        from trac import __version__ as tracVersion
        p4.connect(prog='Trac', version=tracVersion)
    except ConnectionFailed:
        raise TracError(
            message="Could not connect to Perforce repository.",
            title="Perforce connection error",
        )
    if 'user' not in options:
        raise TracError(
            message="Missing 'user' value in [perforce] config section.",
            title="Perforce configuration error",
        )
    p4.user = options['user']
    if 'password' in options:
        p4.password = options['password']
    else:
        p4.password = ''
    if 'unicode' in options:
        if options['unicode'] == '1':
            p4.charset = 'utf8'
        elif options['unicode'] == '0':
            p4.charset = 'none'
        else:
            raise TracError(
                message="Invalid 'unicode' value in [perforce] config "
                        "section.",
                title="Perforce configuration error",
            )
    else:
        p4.charset = 'none'
    if 'language' in options:
        p4.language = options['language']
    else:
        p4.language = ''
    jobPrefixLength = 3  # default value because by default prefix is 'job'
    if 'job_prefix' in options:
        jobPrefixLength = len(options['job_prefix'])
    p4.client = ''
    repos = PerforceRepository(p4, self.log, jobPrefixLength)
    from trac.versioncontrol.cache import CachedRepository
    return PerforceCachedRepository(self.env.get_db_cnx(), repos,
                                    None, self.log)
def getRemoteProductFromSourceForge(self, sf_project):
    """Return the remote product of a SourceForge project.

    :return: The group_id and atid of the SourceForge project's bug
        tracker as an ampersand-separated string in the form
        'group_id&atid'.
    """
    # First, fetch the project page.
    try:
        soup = BeautifulSoup(self._getPage("projects/%s" % sf_project))
    except requests.HTTPError as error:
        self.logger.error(
            "Error fetching project %s: %s" % (sf_project, error))
        return None
    # Find the Tracker link and fetch that.
    tracker_link = soup.find('a', text='Tracker')
    if tracker_link is None:
        self.logger.error("No tracker link for project '%s'" % sf_project)
        return None
    tracker_url = tracker_link.findParent()['href']
    # Clean any leading '/' from tracker_url so that urlappend
    # doesn't choke on it.
    tracker_url = tracker_url.lstrip('/')
    try:
        soup = BeautifulSoup(self._getPage(tracker_url))
    except requests.HTTPError as error:
        self.logger.error(
            "Error fetching project %s: %s" % (sf_project, error))
        return None
    # Extract the group_id and atid from the bug tracker link.
    bugtracker_link = soup.find('a', text='Bugs')
    if bugtracker_link is None:
        self.logger.error(
            "No bug tracker link for project '%s'" % sf_project)
        return None
    bugtracker_url = bugtracker_link.findParent()['href']
    # We need to replace encoded ampersands in the URL since
    # SourceForge usually encodes them.
    bugtracker_url = bugtracker_url.replace('&amp;', '&')
    schema, host, path, query, fragment = urlsplit(bugtracker_url)
    query_dict = {}
    query_bits = query.split('&')
    for bit in query_bits:
        key, value = urllib.splitvalue(bit)
        query_dict[key] = value
    try:
        atid = int(query_dict.get('atid', None))
        group_id = int(query_dict.get('group_id', None))
    except ValueError:
        # If anything goes wrong when int()ing the IDs, just return
        # None.
        return None
    return u'%s&%s' % (group_id, atid)
def parse_string(string):
    return dict(urllib.splitvalue(s) for s in string.split('&'))
def splitvalue(string):
    return urllib.splitvalue(to_utf8(string))
def get_video(self, video=None):
    _id = to_unicode(self.group_dict['id'])
    if video is None:
        video = NolifeTVVideo(_id)

    # Check if video is external.
    try:
        div = self.parser.select(self.document.getroot(), 'div#message_lien_ext', 1)
    except BrokenPageError:
        pass
    else:
        link = div.find('a').attrib['href']
        raise ForbiddenVideo('Video is only available here: %s' % link)

    meta = self.parser.select(self.document.getroot(), 'meta[property="og:title"]', 1)
    try:
        video.title = unicode(meta.attrib['content'])
    except BrokenPageError:
        video.title = NotAvailable
    meta = self.parser.select(self.document.getroot(), 'meta[property="og:description"]', 1)
    try:
        video.description = unicode(meta.attrib['content'])
    except BrokenPageError:
        video.description = NotAvailable
    meta = self.parser.select(self.document.getroot(), 'meta[property="og:image"]', 1)
    try:
        video.thumbnail = Thumbnail(unicode(meta.attrib['content']))
    except BrokenPageError:
        video.thumbnail = NotAvailable
    try:
        video.date = parse_dt(self.parser.select(div, 'div#infos_complementaires', 1).find('p').text.strip())
    except Exception:
        video.date = NotAvailable

    video.author = NotAvailable
    video.duration = NotAvailable
    video.rating = NotAvailable
    video.rating_max = NotAvailable

    if not video.url:
        skey, timestamp = self.genkey()
        self.browser.readurl('http://online.nolife-tv.com/_nlfplayer/api/api_player.php',
                             'skey=%s&a=MD5&timestamp=%s' % (skey, timestamp))
        skey, timestamp = self.genkey()
        self.browser.readurl('http://online.nolife-tv.com/_nlfplayer/api/api_player.php',
                             'a=EML&skey=%s&id%%5Fnlshow=%s&timestamp=%s' % (skey, _id, timestamp))
        skey, timestamp = self.genkey()
        data = self.browser.readurl('http://online.nolife-tv.com/_nlfplayer/api/api_player.php',
                                    'quality=0&a=UEM%%7CSEM%%7CMEM%%7CCH%%7CSWQ&skey=%s&id%%5Fnlshow=%s&timestamp=%s'
                                    % (skey, _id, timestamp))
        values = dict([urllib.splitvalue(s) for s in data.split('&')])
        if 'url' not in values:
            raise ForbiddenVideo(values.get('message', 'Not available').decode('iso-8859-15'))
        video.url = unicode(values['url'])

    video.set_empty_fields(NotAvailable)
    return video
    generated_url = encode(url, policy, options.expires_epoch,
                           options.key_pair_id, options.private_key_filename)
    if options.stream:
        print "Encoded stream (for use within a swf):\n%s" % generated_url
        print "Encoded and escaped stream (for use on a webpage):\n%s" % urllib.quote(generated_url)
    else:
        print "Encoded URL:\n%s" % generated_url
else:
    base_url, params = urllib.splitquery(url)
    unparsed_params = params.split('&')
    params = {}
    for param in unparsed_params:
        key, value = urllib.splitvalue(param)
        params[key] = value
    try:
        encoded_signature = params['Signature']
    except KeyError:
        print "Missing Signature URL parameters"
        sys.exit()
    try:
        encoded_policy = params['Policy']
    except KeyError:
        # no policy, so make canned one
        try:
            expires = params['Expires']
        except KeyError:
def get_url_params(url):
    "Return the params from the url as a dict. Does not support multi-valued parameters"
    base, query = urllib.splitquery(url)
    params = query.split("&")
    return dict([urllib.splitvalue(p) for p in params])
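A quick usage sketch for the helper above; the URL is made up, and note that urllib.splitquery returns None for the query part when the URL has no '?', which this helper does not guard against:

import urllib

params = get_url_params("http://example.com/search?q=books&page=2")  # illustrative URL
print params['q'], params['page']   # books 2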
import urllib
import sys

with open(sys.argv[1], 'r') as f:
    for line in f.readlines():
        line = line.strip()
        host, query = urllib.splitquery(line)
        key = host
        querylist = query.split('&')
        querylist.sort()
        for attr in querylist:
            key += "+" + urllib.splitvalue(attr)[0]
        print key
def __init__(self, url, method, params):
    Assert(method == 'GET')
    netloc, path = splithost(url)
    if not netloc:
        raise IOError, ('ftp error', 'no host given')
    host, port = splitport(netloc)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = socket.gethostbyname(host)
    if port:
        try:
            port = string.atoi(port)
        except string.atoi_error:
            raise IOError, ('ftp error', 'bad port')
    else:
        port = ftplib.FTP_PORT
    path, attrs = splitattr(path)
    self.url = "ftp://%s%s" % (netloc, path)
    dirs = string.splitfields(path, '/')
    dirs, file = dirs[:-1], dirs[-1]
    self.content_length = None
    if not file:
        self.content_type, self.content_encoding = None, None
        type = 'd'
    else:
        self.content_type, self.content_encoding = app.guess_type(file)
        if self.content_encoding:
            type = 'i'
        elif self.content_type and self.content_type[:5] == 'text/':
            type = 'a'
        elif file[-1] == '/':
            type = 'd'
        else:
            type = 'i'
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    key = (user, host, port, string.joinfields(dirs, '/'))
    self.debuglevel = None
    try:
        if not ftpcache.has_key(key):
            ftpcache[key] = []
        for attr in attrs:
            [attr, value] = map(string.lower, splitvalue(attr))
            if attr == 'type' and value in ('a', 'i', 'd'):
                type = value
            elif attr == 'debug':
                try:
                    self.debuglevel = string.atoi(value)
                except string.atoi_error:
                    pass
        candidates = ftpcache[key]
        for cand in candidates:
            if not cand.busy():
                break
        else:
            cand = ftpwrapper(user, passwd, host, port, dirs,
                              self.debuglevel)
            candidates.append(cand)
        # XXX Ought to clean the cache every once in a while
        self.cand = cand
        self.sock, self.isdir = cand.retrfile(file, type)
        self.content_length = cand.content_length
    except ftplib.all_errors, msg:
        raise IOError, ('ftp error', msg)
def initializeRemoteBugDB(self, bug_ids):
    """See `ExternalBugTracker`.

    We override this method because SourceForge does not provide a
    nice way for us to export bug statuses en masse. Instead, we
    resort to screen-scraping on a per-bug basis. Therefore the
    usual choice of batch vs. single export does not apply here and
    we only perform single exports.
    """
    self.bugs = {}
    for bug_id in bug_ids:
        query_url = self.export_url % bug_id
        page_data = self._getPage(query_url)
        soup = BeautifulSoup(page_data)
        status_tag = soup.find(text=re.compile('Status:'))
        status = None
        private = False
        if status_tag:
            # We can extract the status by finding the grandparent tag.
            # Happily, BeautifulSoup will turn the contents of this tag
            # into a newline-delimited list from which we can then
            # extract the requisite data.
            status_row = status_tag.findParent().findParent()
            status = status_row.contents[-1]
            status = status.strip()
        else:
            error_message = self._extractErrorMessage(page_data)
            # If the error message suggests that the bug is private,
            # set the bug's private field to True.
            # XXX 2008-05-01 gmb bug=225354:
            #     We should know more about possible errors and deal
            #     with them accordingly.
            if error_message and 'private' in error_message.lower():
                private = True
            else:
                # If we can't find a status line in the output from
                # SourceForge there's little point in continuing.
                raise UnparsableBugData(
                    'Remote bug %s does not define a status.' % bug_id)
        # We need to do the same for Resolution, though if we can't
        # find it it's not critical.
        resolution_tag = soup.find(text=re.compile('Resolution:'))
        if resolution_tag:
            resolution_row = resolution_tag.findParent().findParent()
            resolution = resolution_row.contents[-1]
            resolution = resolution.strip()
        else:
            resolution = None
        # We save the group_id and atid parameters from the
        # query_url. They'll be returned by getRemoteProduct().
        query_dict = {}
        bugtracker_link = soup.find('a', text='Bugs')
        if bugtracker_link:
            href = bugtracker_link.findParent()['href']
            # We need to replace encoded ampersands in the URL since
            # SourceForge occasionally encodes them.
            href = href.replace('&amp;', '&')
            schema, host, path, query, fragment = urlsplit(href)
            query_bits = query.split('&')
            for bit in query_bits:
                key, value = urllib.splitvalue(bit)
                query_dict[key] = value
            try:
                atid = int(query_dict.get('atid', None))
                group_id = int(query_dict.get('group_id', None))
            except ValueError:
                atid = None
                group_id = None
        else:
            group_id = None
            atid = None
        self.bugs[int(bug_id)] = {
            'id': int(bug_id),
            'private': private,
            'status': status,
            'resolution': resolution,
            'group_id': group_id,
            'atid': atid,
        }
def open_ftp(self, url):
    """Use FTP protocol."""
    if not isinstance(url, str):
        raise IOError(('ftp error',
                       'proxy support for ftp protocol currently not implemented'))
    import mimetypes
    import mimetools
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    host, path = urllib.splithost(url)
    if not host:
        raise IOError(('ftp error', 'no host given'))
    host, port = urllib.splitport(host)
    user, host = urllib.splituser(host)
    if user:
        user, passwd = urllib.splitpasswd(user)
    else:
        passwd = None
    host = urllib.unquote(host)
    user = urllib.unquote(user or '')
    passwd = urllib.unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib  # noqa
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = urllib.splitattr(path)
    path = urllib.unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    if dirs and not dirs[0]:
        dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > urllib.MAXFTPCACHE:
        # Prune the cache, rather arbitrarily
        for k in self.ftpcache.keys():
            if k != key:
                v = self.ftpcache[k]
                del self.ftpcache[k]
                v.close()
    try:
        if not key in self.ftpcache:
            self.ftpcache[key] = \
                Myftpwrapper(user, passwd, host, port, dirs)
        if not file:
            type = 'D'
        else:
            type = 'I'
        for attr in attrs:
            attr, value = urllib.splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(
            file, type, rest=os.environ.get("REST"))
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = mimetools.Message(StringIO(headers))
        return urllib.addinfourl(fp, headers, "ftp:" + url)
    except urllib.ftperrors() as msg:
        raise IOError(('ftp error', msg), sys.exc_info()[2])
def _parse_digest(self):
    return urllib.splitvalue(urlparse.urldefrag(self.path)[1])[1]
def parse(self, response):
    # print "#################"
    base_url = get_base_url(response)
    page_url = response.url
    host = urlparse.urlparse(page_url)[1]
    # print "request url:" + page_url
    # Check whether this is a Mark Six lottery site
    if self.myre(response, self.lvhecai_keys):
        # if self.myre(response, self.lvhecai_keys) and Selector(response).re('<xml>|<wml>'):
        # if Selector(response).re('<xml>|<wml>'):
        # Check whether this is a login site
        if self.myre(response, self.login_keys):
            # Check whether the host is already recorded as a login domain
            # print page_url + ": lottery site already recorded"
            if host in self.loginhost:
                pass
            else:
                log1 = open('login.txt', 'a')
                log1.write(page_url + "\n")
                log1.close()
                self.loginhost.append(host)
            # if host in self.lvhecaihost:
            #     self.lvhecaihost.remove(host)
        # A lottery site, but not a login site
        else:
            # Check whether the host is already recorded as a login or
            # lottery site; if so, do not record it again
            # print page_url + ": clearly a lottery site"
            if host not in self.lvhecaihost and host not in self.loginhost:
                # print host
                # print self.lvhecaihost
                log2 = open('lvhecai.txt', 'a')
                log2.write(page_url + "\n")
                log2.close()
                self.lvhecaihost.append(host)
        # Analyse all <a> tags
        link = response.xpath('//a')
        items = []
        if link:
            for a in link:
                # Extract the link
                links = a.select('@href').extract()
                if len(links) > 0:
                    relative_url = links[0]
                    url = urlparse.urljoin(base_url, relative_url)
                    # print "url:" + url
                    query = urlparse.urlparse(url)[4]
                    path = url.replace("?" + urlparse.urlparse(url)[4], '')
                    # path = urllib.splitquery(url)[0]
                    sorturl = ""
                    # Record the url as the base path plus the query keys, to
                    # avoid crawling too many pages of the same template
                    if query:
                        querylist = query.split('&')
                        querylist.sort()
                        sorturl = path + "?"
                        for key_value in querylist:
                            sorturl += urllib.splitvalue(key_value)[0]
                    elif re.match(".*[0-9]+\.htm$|.*[0-9]+\.asp$|.*[0-9]+\.wml$", url):
                        sorturl = url.rsplit('/', 1)[0]
                    else:
                        sorturl = url
                    # if sorturl not in self.spideredurl and urllib.splittype(url)[0] == 'http':
                    if sorturl not in self.spideredurl:
                        self.spideredurl.append(sorturl)
                        self.spideredurl.sort()
                        print sorturl
                        # print "spider:" + url
                        yield scrapy.Request(url, callback=self.parse)
                        if urlparse.urlparse(url)[1] in self.allowed_domains:
                            url_filter(url)
                    # print a.select('text()').extract()[0]
                    # else:
                    #     print "not spider:" + url
                    #     print self.spideredurl
        # print link
        # print items
        # print "*********************"
        # print self.spideredurl
    else:
        log3 = open('no.txt', 'a')
        log3.write(page_url + "\n")
        log3.close()