# client-side methods
def csqueryConents(self, csdompagination):
    return csdompagination

def csqueryPagination(self, csdom, pagesPath):
    pages = []
    for index, item in enumerate(pagesPath):
        csquery = item
        cspage = csdom.Select(csquery)
        if cspage:
            # find all <a> tags
            children = cspage.Find("a")
            if children.Length > 0:
                for i in range(0, children.Length):
                    cshyper = CsQuery.CQ.Create(children[i])
                    href = cshyper.Attr("href")
                    text = cshyper.Text()
                    pagelamda = self.config.cfgContent.Options.PageLamda
                    if pagelamda:
                        str = pagelamda(cshyper, href, text)
                        print "this is type for url : %s" % (type(str))
                        if str:
                            href = str
                            print " this is true"
                        else:
                            print "this is false"
                            continue
                    if href and href[0:1] == "/":
                        proto, rest = urllib2.splittype(self.config.cfgUrl)
                        host, rest = urllib2.splithost(rest)
                        href = proto + "://" + host + href
                    elif href and href[0:1] == "?":
                        proto, rest = urllib2.splittype(self.config.cfgUrl)
                        host, rest = urllib2.splithost(rest)
                        p = rest.split("?")
                        p[1] = href[1:]
                        href = proto + "://" + host + "?".join(p)
                    elif href.find("http") == -1:
                        proto, rest = urllib2.splittype(self.config.cfgUrl)
                        host, rest = urllib2.splithost(rest)
                        p_rest = rest.split("/")
                        p_rest[len(p_rest) - 1] = href
                        href = proto + "://" + host + "/".join(p_rest)
                    scale = self.config.cfgContent.Options.PageSimilarity
                    rate = 0.0
                    simlilar = StringHelper.LevenshteinDistance(self.__url, href, rate)
                    print "this is simliar :%f " % simlilar[1]
                    if href and simlilar[1] > scale and simlilar[1] < 1:
                        pages.append(href)

def parse(uri):
    with closing(urllib2.urlopen(uri) if urllib2.splittype(uri)[0] else
                 codecs.open(uri, 'r')) as inf:
        # initialize playlist variables before reading file
        playlist = []
        song = Track(None, None, None)

        for line_no, line in enumerate(inf):
            try:
                line = line.strip(codecs.BOM_UTF8).strip()
                if line.startswith('#EXTINF:'):
                    # pull length and title from #EXTINF line
                    length, title = line.split('#EXTINF:')[1].split(',', 1)
                    song = Track(length, title, None)
                elif line.startswith('#'):
                    # comment, #EXTM3U
                    pass
                elif len(line) != 0:
                    # pull song path from all other, non-blank lines
                    song.path = line
                    playlist.append(song)
                    # reset the song variable so it doesn't use the same EXTINF more than once
                    song = Track(None, None, None)
            except Exception, ex:
                raise Exception("Can't parse line %d: %s" % (line_no, line), ex)

def url_splits(url):
    domain_splits = []
    path_splits = []
    try:
        if url:
            protocol, rest = urllib2.splittype(url)
            if not protocol:
                rest = '//' + rest
            host, rest = urllib2.splithost(rest)
            # parse the domain part
            if host:
                splits = host.split('.')
                if splits:
                    index_list = range(len(splits))
                    index_list.reverse()
                    for index in index_list:
                        if not splits[index]:
                            splits.remove('')
                    domain_splits += splits
            # parse the path part
            if rest:
                rest = urlparse.urlparse(rest)
                splits = rest.path.split('/')
                if splits:
                    index_list = range(len(splits))
                    index_list.reverse()
                    for index in index_list:
                        if not splits[index]:
                            splits.remove('')
                    path_splits += splits
    except Exception, e:
        log.msg(traceback.format_exc(), level=log.ERROR)

def loadUrl():
    args = request.args
    url = args.get("url")
    jobTemplateId = args.get("jobTemplateId")
    request.args = ImmutableMultiDict()
    data = {}
    if jobTemplateId != None and jobTemplateId != "":
        data = TestService.parseTemplate(jobTemplateId, flag=False)
        # url = data.get("renderUrl")
    global host, cookie, refer, headers
    if (url.startswith("http")):
        host = urllib2.splittype(url)
        refer = url
        if host != None:
            proto, rest = urllib.splittype(url)
            res, host = urllib.splithost(rest)
            host = proto + "://" + res
    else:
        url = host + url
    request.url = url
    send_headers = {
        # 'Host':'www.jb51.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive'
    }
    postRequest = urllib2.Request(url, headers=send_headers)
    response = None
    try:
        response = urllib2.urlopen(postRequest)
    except Exception, e:
        logging.error("error" + str(e))

def download_file(self, url):
    injectd_url = self.extract_url(urllib2.unquote(url))
    try:
        req = urllib2.Request(injectd_url)
        # Set User-Agent to look more credible
        req.add_unredirected_header('User-Agent', '-')
        # FIXME: We need a timeout on read here
        injected_file = urllib2.urlopen(req, timeout=4).read()
        # If the file is hosted on a SSL enabled host get the certificate
        if re.match('^https', injectd_url, re.IGNORECASE):
            proto, rest = urllib2.splittype(injectd_url)
            host, rest = urllib2.splithost(rest)
            host, port = urllib2.splitport(host)
            if port is None:
                port = 443
            cert_file = ssl.get_server_certificate((host, int(port)))
            cert_name = self.store_file(cert_file)
    except IOError as e:
        logger.exception("Failed to fetch injected file, I/O error: {0}".format(e))
        # TODO: We want to handle the case where we can't download
        # the injected file but pretend to be vulnerable.
        # Reset both values so the return below does not raise NameError.
        file_name = file_sha256 = None
    else:
        file_name, file_sha256 = self.store_file(injected_file)
    return file_name, file_sha256

def _add_proxies():
    if sickrage.app.config.proxy_setting:
        sickrage.app.log.debug("Using global proxy: " + sickrage.app.config.proxy_setting)
        scheme, address = urllib2.splittype(sickrage.app.config.proxy_setting)
        address = ('http://{}'.format(sickrage.app.config.proxy_setting), sickrage.app.config.proxy_setting)[scheme]
        return {"http": address, "https": address}

def _hook(self, request):
    host = request.get_host()
    if not host:
        raise urllib2.URLError('no host given')

    if request.has_data():  # POST
        data = request.get_data()
        if not request.has_header('Content-type'):
            request.add_unredirected_header(
                'Content-type', 'application/x-www-form-urlencoded')
        if not request.has_header('Content-length') and not conf.chunk:
            request.add_unredirected_header(
                'Content-length', '%d' % len(data))

    sel_host = host
    if request.has_proxy():
        scheme, sel = urllib2.splittype(request.get_selector())
        sel_host, sel_path = urllib2.splithost(sel)

    if not request.has_header('Host'):
        request.add_unredirected_header('Host', sel_host)
    for name, value in self.parent.addheaders:
        name = name.capitalize()
        if not request.has_header(name):
            request.add_unredirected_header(name, value)

    return request

def get_host_from_url(url):
    """Convert a url to its host name."""
    root_proto, root_rest = urllib2.splittype(url)
    root_host, root_rest = urllib2.splithost(root_rest)
    return root_host

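For reference, a minimal sketch (the URL is made up) of what the two Python 2 helpers used above return: splittype peels off the scheme, and splithost then separates the host from the rest.

import urllib2

# illustrative only; example.com is a placeholder
proto, rest = urllib2.splittype("http://example.com/a/b?x=1")
# proto == 'http', rest == '//example.com/a/b?x=1'
host, path = urllib2.splithost(rest)
# host == 'example.com', path == '/a/b?x=1'
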
def _expand_recipe(content, url=''):
    urls = []
    for line in content.splitlines():
        line = line.lstrip().rstrip()
        try:
            target_type, target = line.split(':', 1)
        except ValueError:
            continue  # blank line in recipe
        if target_type in ACCEPTED_RECIPE_TYPES:
            if isinstance(target, unicode):
                target = target.encode('utf-8')
            target = target.lstrip().rstrip()
            # translate well-known variables
            for name in COOK_VARIABLES:
                target = target.replace("$" + name, COOK_VARIABLES[name])
            # Check to see if the target is a URL (has a scheme)
            # if not we want to join it to the current url before
            # carrying on.
            scheme, _ = urllib2.splittype(target)
            if not scheme:
                if not '%' in target:
                    target = urllib.quote(target)
                target = urlparse.urljoin(url, target)
            if target_type == 'recipe':
                urls.extend(recipe_to_urls(target))
            else:
                urls.append(target)
    return urls

def doQuery(self, query, name):
    # urllib doesn't honor user Content-type, use urllib2
    garbage, path = urllib2.splittype(FieldVal(self.site, "url"))
    host, selector = urllib2.splithost(path)

    response = False
    try:
        errmsg = "** An ERROR occurred attempting HTTPS connection to"
        h = httplib.HTTPSConnection(host, timeout=5)

        errmsg = "** An ERROR occurred sending POST request to"
        p = h.request('POST', selector, query,
                      {"Content-type": "application/x-ofx",
                       "Accept": "*/*, application/x-ofx"})

        errmsg = "** An ERROR occurred retrieving POST response from"
        # allow up to 30 secs for the server response (it has to assemble the statement)
        h.sock.settimeout(30)
        response = h.getresponse().read()

        f = file(name, "w")
        f.write(response)
        f.close()
    except Exception as inst:
        self.status = False
        print errmsg, host
        print " Exception type:", type(inst)
        print " Exception Val :", inst
        if response:
            print " HTTPS ResponseCode :", response.status
            print " HTTPS ResponseReason:", response.reason
    if h:
        h.close()

def get_cookie(url):
    """
    Fetch the usable Chrome cookies for the given url.
    :param url:
    :return:
    """
    domain = urllib2.splithost(urllib2.splittype(url)[1])[0]
    domain_list = ['.' + domain, domain]
    if len(domain.split('.')) > 2:
        dot_index = domain.find('.')
        domain_list.append(domain[dot_index:])
        domain_list.append(domain[dot_index + 1:])
    print domain_list
    conn = None
    cookie_str = None
    try:
        conn = sqlite3.connect(r'%s\Google\Chrome\User Data\Default\Cookies' % os.getenv('LOCALAPPDATA'))
        cursor = conn.cursor()
        sql = 'select host_key, name, value, encrypted_value, path from cookies where host_key in (%s)' % \
              ','.join(['"%s"' % x for x in domain_list])
        row_list = cursor.execute(sql).fetchall()
        cookie_list = []
        for host_key, name, value, encrypted_value, path in row_list:
            decrypted_value = win32crypt.CryptUnprotectData(encrypted_value, None, None, None, 0)[1].decode(print_charset) or value
            cookie_list.append(name + '=' + decrypted_value)
        cookie_str = '; '.join(cookie_list)
    except Exception:
        raise CookieException()
    finally:
        conn.close()
    print cookie_str
    return cookie_str, domain

def file_or_url_context(resource_name):
    """Yield name of file from the given resource (i.e. file or url)."""
    if is_url(resource_name):
        _, ext = os.path.splitext(resource_name)
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as f:
                proto, rest = urllib2.splittype(resource_name)
                HOST, rest = urllib2.splithost(rest)
                HEADER['Host'] = HOST
                req = urllib2.Request(resource_name, headers=HEADER)
                u = urllib2.urlopen(req, timeout=4)
                f.write(u.read())
            # f must be closed before yielding
            yield f.name
        finally:
            os.remove(f.name)
    else:
        yield resource_name

# from skimage.io import imread
# fname = "https://imgsa.baidu.com/forum/w%3D580/sign=e960450646086e066aa83f4332097b5a/36844b59252dd42a79cdd89c093b5bb5c8eab874.jpg"
# with file_or_url_context(fname) as f:
#     img = imread(f)
#     print img.shape

def getRSSFeed(self, url, post_data=None, items=[]):
    handlers = []

    if self.provider.proxy.isEnabled():
        self.provider.headers.update({'Referer': self.provider.proxy.getProxyURL()})
    elif sickbeard.PROXY_SETTING:
        logger.log("Using proxy for url: " + url, logger.DEBUG)
        scheme, address = urllib2.splittype(sickbeard.PROXY_SETTING)
        address = sickbeard.PROXY_SETTING if scheme else 'http://' + sickbeard.PROXY_SETTING
        handlers = [urllib2.ProxyHandler({'http': address, 'https': address})]
        self.provider.headers.update({'Referer': address})
    elif 'Referer' in self.provider.headers:
        self.provider.headers.pop('Referer')

    return RSSFeeds(self.providerID).getFeed(
        self.provider.proxy._buildURL(url),
        post_data,
        self.provider.headers,
        items,
        handlers=handlers)

def request(self, method, url, headers=None, params=None, proxies=None, cache=True, verify=False, *args, **kwargs):
    if headers is None:
        headers = {}
    if params is None:
        params = {}
    if proxies is None:
        proxies = {}

    headers['Accept-Encoding'] = 'gzip, deflate'
    headers["User-Agent"] = sickrage.app.user_agent

    # request session ssl verify
    if sickrage.app.config.ssl_verify:
        try:
            verify = certifi.where()
        except:
            pass

    # request session proxies
    if 'Referer' not in headers and sickrage.app.config.proxy_setting:
        sickrage.app.log.debug("Using global proxy: " + sickrage.app.config.proxy_setting)
        scheme, address = urllib2.splittype(sickrage.app.config.proxy_setting)
        address = ('http://{}'.format(sickrage.app.config.proxy_setting), sickrage.app.config.proxy_setting)[scheme]
        proxies.update({"http": address, "https": address})
        headers.update({'Referer': address})

    # setup caching adapter
    if cache:
        adapter = CacheControlAdapter(DBCache(os.path.abspath(os.path.join(sickrage.app.data_dir, 'sessions.db'))))
        self.mount('http://', adapter)
        self.mount('https://', adapter)

    # get web response
    response = super(WebSession, self).request(
        method,
        url,
        headers=headers,
        params=params,
        verify=verify,
        proxies=proxies,
        hooks={'response': WebHooks.log_url},
        *args, **kwargs
    )

    try:
        # check web response for errors
        response.raise_for_status()
    except requests.exceptions.SSLError as e:
        if ssl.OPENSSL_VERSION_INFO < (1, 0, 1, 5):
            sickrage.app.log.info(
                "SSL Error requesting url: '{}' You have {}, try upgrading OpenSSL to 1.0.1e+".format(
                    e.request.url, ssl.OPENSSL_VERSION))
        if sickrage.app.config.ssl_verify:
            sickrage.app.log.info(
                "SSL Error requesting url: '{}', try disabling cert verification in advanced settings".format(
                    e.request.url))
    except Exception:
        pass

    return response

def request(self, method, url, headers=None, params=None, proxies=None, cache=True, verify=False, *args, **kwargs):
    if headers is None:
        headers = {}
    if params is None:
        params = {}
    if proxies is None:
        proxies = {}

    url = self.normalize_url(url)

    headers.update({'Accept-Encoding': 'gzip, deflate'})
    headers.update(random.choice(USER_AGENTS))

    # request session ssl verify
    if sickrage.srCore.srConfig.SSL_VERIFY:
        try:
            verify = certifi.where()
        except:
            pass

    # request session proxies
    if 'Referer' not in headers and sickrage.srCore.srConfig.PROXY_SETTING:
        sickrage.srCore.srLogger.debug("Using global proxy: " + sickrage.srCore.srConfig.PROXY_SETTING)
        scheme, address = urllib2.splittype(sickrage.srCore.srConfig.PROXY_SETTING)
        address = ('http://{}'.format(sickrage.srCore.srConfig.PROXY_SETTING), sickrage.srCore.srConfig.PROXY_SETTING)[scheme]
        proxies.update({"http": address, "https": address})
        headers.update({'Referer': address})

    # setup session caching
    if cache:
        cache_file = os.path.abspath(os.path.join(sickrage.DATA_DIR, 'sessions.db'))
        self.__class__ = cachecontrol.CacheControl(self,
                                                   cache=DBCache(cache_file),
                                                   heuristic=ExpiresAfter(days=7)).__class__

    # get web response
    response = super(srSession, self).request(method,
                                              url,
                                              headers=headers,
                                              params=params,
                                              verify=verify,
                                              proxies=proxies,
                                              *args, **kwargs)

    try:
        # check web response for errors
        response.raise_for_status()
    except requests.exceptions.SSLError as e:
        if ssl.OPENSSL_VERSION_INFO < (1, 0, 1, 5):
            sickrage.srCore.srLogger.info(
                "SSL Error requesting url: '{}' You have {}, try upgrading OpenSSL to 1.0.1e+".format(
                    e.request.url, ssl.OPENSSL_VERSION))
        if sickrage.srCore.srConfig.SSL_VERIFY:
            sickrage.srCore.srLogger.info(
                "SSL Error requesting url: '{}', try disabling cert verification in advanced settings".format(
                    e.request.url))
    except Exception:
        pass

    return response

def get_domain(url):
    try:
        return get_tld(url)
    except:
        base_url = "".join(url)  # strip all whitespace (\s+)
        protocol, rest = urllib2.splittype(base_url)
        host, rest = urllib2.splithost(rest)
        return host

def get_local_name(url):
    url = url.strip()
    url = re.sub('[\/]+$', '', url)
    rest = urllib2.splittype(url)[1]
    host, rest = urllib2.splithost(rest)
    if rest is None or rest == '':
        return host
    return os.path.basename(rest)

def decorator(*args, **kwargs):
    request = args[0]
    enabled_https = getattr(settings, 'SESSION_COOKIE_SECURE', False)
    if enabled_https and not request.is_secure():
        http_url = request.build_absolute_uri(request.get_full_path())
        https_url = 'https:' + urllib2.splittype(http_url)[1]
        return HttpResponseRedirect(https_url)
    return func(*args, **kwargs)

def download(self, url, insecure):
    """
    Tries to download a file from url. Returns the path to the local file.
    """
    scheme = urllib2.splittype(url)[0]
    DL = downloaders.get(scheme, Downloader)
    return DL(url, self, insecure).execute()

def request(self, method, url, headers=None, params=None, proxies=None, cache=True, verify=False, *args, **kwargs):
    if headers is None:
        headers = {}
    if params is None:
        params = {}
    if proxies is None:
        proxies = {}

    headers['Accept-Encoding'] = 'gzip, deflate'
    headers["User-Agent"] = sickrage.srCore.USER_AGENT

    # request session ssl verify
    if sickrage.srCore.srConfig.SSL_VERIFY:
        try:
            verify = certifi.where()
        except:
            pass

    # request session proxies
    if 'Referer' not in headers and sickrage.srCore.srConfig.PROXY_SETTING:
        sickrage.srCore.srLogger.debug("Using global proxy: " + sickrage.srCore.srConfig.PROXY_SETTING)
        scheme, address = urllib2.splittype(sickrage.srCore.srConfig.PROXY_SETTING)
        address = ('http://{}'.format(sickrage.srCore.srConfig.PROXY_SETTING), sickrage.srCore.srConfig.PROXY_SETTING)[scheme]
        proxies.update({"http": address, "https": address})
        headers.update({'Referer': address})

    # setup caching adapter
    if cache:
        adapter = CacheControlAdapter(DBCache(os.path.abspath(os.path.join(sickrage.DATA_DIR, 'sessions.db'))))
        self.mount('http://', adapter)
        self.mount('https://', adapter)

    # get web response
    response = super(srSession, self).request(
        method,
        url,
        headers=headers,
        params=params,
        verify=verify,
        proxies=proxies,
        *args, **kwargs
    )

    try:
        # check web response for errors
        response.raise_for_status()
    except requests.exceptions.SSLError as e:
        if ssl.OPENSSL_VERSION_INFO < (1, 0, 1, 5):
            sickrage.srCore.srLogger.info(
                "SSL Error requesting url: '{}' You have {}, try upgrading OpenSSL to 1.0.1e+".format(
                    e.request.url, ssl.OPENSSL_VERSION))
        if sickrage.srCore.srConfig.SSL_VERIFY:
            sickrage.srCore.srLogger.info(
                "SSL Error requesting url: '{}', try disabling cert verification in advanced settings".format(
                    e.request.url))
    except Exception:
        pass

    return response

def request(self, method, url, headers=None, params=None, cache=True, raise_exceptions=True, *args, **kwargs):
    url = self.normalize_url(url)

    kwargs.setdefault('params', {}).update(params or {})
    kwargs.setdefault('headers', {}).update(headers or {})

    # if method == 'POST':
    #     self.session.headers.update({"Content-type": "application/x-www-form-urlencoded"})

    kwargs.setdefault('headers', {}).update({'Accept-Encoding': 'gzip, deflate'})
    kwargs.setdefault('headers', {}).update(random.choice(USER_AGENTS))

    # request session ssl verify
    kwargs['verify'] = False
    if sickrage.srCore.srConfig.SSL_VERIFY:
        try:
            kwargs['verify'] = certifi.where()
        except:
            pass

    # request session proxies
    if 'Referer' not in kwargs.get('headers', {}) and sickrage.srCore.srConfig.PROXY_SETTING:
        sickrage.srCore.srLogger.debug("Using global proxy: " + sickrage.srCore.srConfig.PROXY_SETTING)
        scheme, address = urllib2.splittype(sickrage.srCore.srConfig.PROXY_SETTING)
        address = ('http://{}'.format(sickrage.srCore.srConfig.PROXY_SETTING),
                   sickrage.srCore.srConfig.PROXY_SETTING)[scheme]
        kwargs.setdefault('proxies', {}).update({"http": address, "https": address})
        kwargs.setdefault('headers', {}).update({'Referer': address})

    # setup session caching
    if cache:
        cache_file = os.path.abspath(os.path.join(sickrage.DATA_DIR, 'sessions.db'))
        cachecontrol.CacheControl(self,
                                  cache=DBCache(cache_file),
                                  heuristic=ExpiresAfter(days=7))

    # get result
    response = super(srSession, self).request(method, url, *args, **kwargs).result()

    if raise_exceptions:
        response.raise_for_status()

    return response

def _setUpSession(session=None, headers=None, params=None):
    """
    Returns a session initialized with default cache and parameter settings

    :param session: session object to (re)use
    :param headers: Headers to pass to session
    :return: session object
    """

    # request session
    if headers is None:
        headers = {}

    sessionCache = None
    FileCacheDir = sickrage.srConfig.CACHE_DIR or get_temp_dir()
    if FileCacheDir:
        sessionCache = FileCache(os.path.join(FileCacheDir, 'sessions'), use_dir_lock=True)

    session = cachecontrol.CacheControl(sess=session or requests.Session(),
                                        cache=sessionCache,
                                        cache_etags=False)

    # request session headers
    session.headers.update(headers)
    session.headers.update({'Accept-Encoding': 'gzip,deflate'})
    session.headers.update(random.choice(USER_AGENTS))

    # request session clear residual referer
    if 'Referer' in session.headers and 'Referer' not in headers:
        session.headers.pop('Referer')

    try:
        # request session ssl verify
        session.verify = False
        if sickrage.srConfig.SSL_VERIFY:
            session.verify = certifi.where()
    except:
        pass

    # request session proxies
    if 'Referer' not in session.headers and sickrage.srConfig.PROXY_SETTING:
        sickrage.srLogger.debug("Using global proxy: " + sickrage.srConfig.PROXY_SETTING)
        scheme, address = urllib2.splittype(sickrage.srConfig.PROXY_SETTING)
        address = ('http://{}'.format(sickrage.srConfig.PROXY_SETTING), sickrage.srConfig.PROXY_SETTING)[scheme]
        session.proxies = {
            "http": address,
            "https": address,
        }
        session.headers.update({'Referer': address})

    if 'Content-Type' in session.headers:
        session.headers.pop('Content-Type')

    if params and isinstance(params, (list, dict)):
        for param in params:
            if isinstance(params[param], unicode):
                params[param] = params[param].encode('utf-8')
        session.params = params

    return session

def getRSSFeed(self, url, params=None):
    handlers = []

    if sickrage.app.config.proxy_setting:
        sickrage.app.log.debug("Using global proxy for url: " + url)
        scheme, address = urllib2.splittype(sickrage.app.config.proxy_setting)
        address = sickrage.app.config.proxy_setting if scheme else 'http://' + sickrage.app.config.proxy_setting
        handlers = [urllib2.ProxyHandler({'http': address, 'https': address})]

    return getFeed(url, params=params, handlers=handlers)

def getRSSFeed(self, url):
    handlers = []

    if sickrage.srCore.srConfig.PROXY_SETTING:
        sickrage.srCore.srLogger.debug("Using global proxy for url: " + url)
        scheme, address = urllib2.splittype(sickrage.srCore.srConfig.PROXY_SETTING)
        address = sickrage.srCore.srConfig.PROXY_SETTING if scheme else 'http://' + sickrage.srCore.srConfig.PROXY_SETTING
        handlers = [urllib2.ProxyHandler({'http': address, 'https': address})]

    return getFeed(url, handlers=handlers)

def getRSSFeed(self, url, params=None):
    handlers = []

    if sickrage.srCore.srConfig.PROXY_SETTING:
        sickrage.srCore.srLogger.debug("Using global proxy for url: " + url)
        scheme, address = urllib2.splittype(sickrage.srCore.srConfig.PROXY_SETTING)
        address = sickrage.srCore.srConfig.PROXY_SETTING if scheme else 'http://' + sickrage.srCore.srConfig.PROXY_SETTING
        handlers = [urllib2.ProxyHandler({'http': address, 'https': address})]

    return getFeed(url, params=params, handlers=handlers)

def _setup_server(self, server=None):
    if server:
        host, path = urllib2.splithost(urllib2.splittype(server)[-1])
        if not path:
            path = '/'
        self.client_con = python_webdav.client.Client(host, webdav_path=path)
        self.client_con.set_connection('wibble', 'fish')
    else:
        print "I need a server!"
        self.client_con = None

def parse_protocols(ctx, base_uri=None):
    """
    Parse ``protocols`` from a root context.
    If protocols are not provided in root, use baseUri protocol.
    """
    protocols = ctx.get_property_with_schema('protocols', RamlRoot.protocols)
    if protocols is None and base_uri is not None:
        protocols = [urllib2.splittype(base_uri)[0]]
    if protocols:
        protocols = [p.upper() for p in protocols]
    return protocols

def url_size(url):
    import httplib, urllib2
    proto, url = urllib2.splittype(url)
    assert proto.lower() == 'http'
    host, path = urllib2.splithost(url)
    # http://stackoverflow.com/questions/107405/how-do-you-send-a-head-http-request-in-python
    conn = httplib.HTTPConnection(host)
    conn.request('HEAD', path)
    res = conn.getresponse()
    # FIXME: Follow any redirects
    return int(res.getheader('content-length'))

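A possible way to call the helper above; the URL is only illustrative and assumes a plain-HTTP host that answers HEAD requests with a Content-Length header.

# illustrative only; example.com stands in for a real host
if __name__ == '__main__':
    print url_size('http://example.com/index.html')
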
        echo(content)
    except Exception as err:
        pass
    finally:
        return content

def pageCsContentImage(self, cspage):
    """Replace relative <img> paths with fully qualified ones."""
    proto, rest = urllib2.splittype(self.config.cfgUrl)
    host, rest = urllib2.splithost(rest)
    csimgs = cspage.Find("img")

def lamda(self, csblock):
    href = csblock[0].Attr("href")
    if not href:
        href = csblock[1]
    if href:
        href = href.replace("\\", "").replace("\"", "")
    if href and href[0:1] == "/":
        proto, rest = urllib2.splittype(self.config.cfgUrl)
        host, rest = urllib2.splithost(rest)
        href = proto + "://" + host + href
    return href

def url_split(url):
    protocol = None
    domain = None
    rest = None
    try:
        protocol, rest = urllib2.splittype(url)
        if not protocol:
            protocol = 'https'
            rest = '//' + rest
        domain, rest = urllib2.splithost(rest)
    except Exception, e:
        log.msg(traceback.format_exc(), level=log.ERROR)

def _get_pingback_server(self, target):
    " Try to find the target's pingback xmlrpc server address "
    # first try to find the pingback server in the HTTP header
    try:
        host, path = urllib2.splithost(urllib2.splittype(target)[1])
        conn = httplib.HTTPConnection(host)
        conn.request('HEAD', path)
        res = conn.getresponse()
        server = dict(res.getheaders()).get('x-pingback')
    except Exception, e:
        raise PingbackClientError(e.message)

def RemoteAccess(url, *args, **kwargs):
    """Connect to a remote Subversion server

    :param url: URL to connect to
    :return: RemoteAccess object
    """
    if isinstance(url, bytes):
        url = url.decode("utf-8")
    (type, opaque) = splittype(url)
    if type not in url_handlers:
        raise SubversionException("Unknown URL type '%s'" % type, ERR_BAD_URL)
    return url_handlers[type](url, *args, **kwargs)

def getRSSFeed(self, url):
    handlers = []

    if sickbeard.PROXY_SETTING:
        logger.log(u"Using global proxy for url: " + url, logger.DEBUG)
        scheme, address = urllib2.splittype(sickbeard.PROXY_SETTING)
        address = sickbeard.PROXY_SETTING if scheme else "http://" + sickbeard.PROXY_SETTING
        handlers = [urllib2.ProxyHandler({"http": address, "https": address})]
        self.provider.headers.update({"Referer": address})
    elif "Referer" in self.provider.headers:
        self.provider.headers.pop("Referer")

    return getFeed(url, request_headers=self.provider.headers, handlers=handlers)

def get_host(url):
    '''
    Extract the host name from a url.
    :param url: the url to parse
    :return: the host
    '''
    proto, rest = urllib2.splittype(url)
    res, rest = urllib2.splithost(rest)
    if res:
        return res
    else:
        print "failed to get host from " + url
        sys.exit(0)

def format_and_filter_urls(base_url, url):
    # normalize URLs that are not fully qualified
    if url.startswith('/'):  # absolute url starting from the site root
        base_url = "".join(base_url.split())  # strip all whitespace (\s+)
        protocol, rest = urllib2.splittype(base_url)
        host, rest = urllib2.splithost(rest)
        url = (protocol + "://" + host).rstrip('/') + "/" + url.lstrip('/')
    if url.startswith('.') or not url.startswith('http'):  # relative url
        url = base_url.rstrip('/') + "/" + url.lstrip('./')
    # drop the fragment (anchor)
    return url.split('#')[0]

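As a side note, the standard library already covers this kind of absolute/relative resolution; a minimal sketch with made-up URLs, using Python 2's urlparse.urljoin, which several of the hand-rolled branches above approximate.

import urlparse

# made-up URLs for illustration; urljoin resolves root-relative and relative paths
print urlparse.urljoin('http://example.com/dir/page.html', '/root.html')   # http://example.com/root.html
print urlparse.urljoin('http://example.com/dir/page.html', 'other.html')   # http://example.com/dir/other.html
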
def go(url):
    protocol, address = urllib2.splittype(url)
    # print protocol, address
    if protocol == "http":
        global host
        host, path = urllib2.splithost(address)
        # print host, path
        content = getPageContent(url)
        soup = BeautifulSoup(content, 'html.parser')
        getAllImage(soup)
        getAllHyperlink(soup)
    else:
        print 'URL is not http'

def getRSSFeed(self, url):
    handlers = []

    if sickrage.PROXY_SETTING:
        sickrage.LOGGER.debug("Using global proxy for url: " + url)
        scheme, address = urllib2.splittype(sickrage.PROXY_SETTING)
        address = sickrage.PROXY_SETTING if scheme else 'http://' + sickrage.PROXY_SETTING
        handlers = [urllib2.ProxyHandler({'http': address, 'https': address})]
        self.provider.headers.update({'Referer': address})
    elif 'Referer' in self.provider.headers:
        self.provider.headers.pop('Referer')

    return getFeed(url, request_headers=self.provider.headers, handlers=handlers)

def lamda(self, csblock):
    href = csblock[0].Select("url").Text()
    type = csblock[0].Select("tag").Text()
    try:
        name = csblock[0].Find("name").Text().encode('utf8')
        time = csblock[0].Select("time").Text()
        host, rest = urllib2.splittype(href)
        filename = rest[rest.rindex("/") + 1:]
        id = filename[:filename.rindex(".")]
        url = "http://3g.ali213.net/gl/m/" + id + ".html?d=" + time + "&t=" + type + "&n=" + name
        return url
    except Exception as err:
        return href

def __init__(self, url):
    self.url = url
    self.schema, url = urllib2.splittype(url)
    host, path = urllib2.splithost(url)
    userpass, host = urllib2.splituser(host)
    if userpass:
        self.user, self.password = urllib2.splitpasswd(userpass)
    path, self.querystring = urllib.splitquery(path)
    self.query = self.querystring and self.querystring.split('&') or []
    # urllib.splitquery(url)
    self.host, self.port = urllib2.splitport(host)
    path, self.tag = urllib2.splittag(path)
    self.path = path.strip('/')

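A rough sketch, with a made-up URL, tracing the same Python 2 split helpers the constructor above chains together; the variable names mirror that snippet but the values shown are only what the standard library returns for this example.

import urllib, urllib2

u = 'http://user:secret@example.com:8080/a/b?x=1&y=2'   # made-up URL for illustration
schema, rest = urllib2.splittype(u)               # 'http'
host, path = urllib2.splithost(rest)              # 'user:secret@example.com:8080', '/a/b?x=1&y=2'
userpass, host = urllib2.splituser(host)          # 'user:secret', 'example.com:8080'
user, password = urllib2.splitpasswd(userpass)    # 'user', 'secret'
path, querystring = urllib.splitquery(path)       # '/a/b', 'x=1&y=2'
host, port = urllib2.splitport(host)              # 'example.com', '8080'
path, tag = urllib2.splittag(path)                # '/a/b', None
print schema, user, password, host, port, querystring, path.strip('/')
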
def load_json_from_url(*args, **kwargs):
    "Loads and returns a JSON object obtained from a URL specified by the " \
    "positional arguments with query-string generated from keyword arguments"
    url = '/'.join(args)
    scheme, url = urllib2.splittype(url)
    url = url.strip('/') + '/'
    while '//' in url:
        url = url.replace('//', '/')
    url_req = '://'.join((scheme, url))
    if kwargs:
        url_req += '?' + urllib.urlencode(kwargs)
    req = urllib2.Request(url_req)
    opener = urllib2.build_opener()
    return json.load(opener.open(req))

def gain_links(url='http://www.jianshu.com/p/05cfea46e4fd'):
    html_page = urllib2.urlopen(url)
    links = BeautifulSoup(html_page).findAll('a')
    links = [i.get('href') for i in links
             if i.get('href') and not i.get('href').startswith('javascript:')]
    proto, rest = urllib2.splittype(url)  # split the scheme off the url
    domain = urllib2.splithost(rest)[0]  # get the url's host
    # make every link absolute
    links = map(lambda i: proto + '://' + domain + i if i[0] == '/'
                else url + i if i[0] == '#'
                else i, links)
    with open('links_list.txt', 'w') as f:
        f.write('\n'.join(links))

def _path(uri):
    # gobject gives us a unicode URL, which is cool.
    # But python2 decodes this into a _unicode_ string with _utf8_
    # byte sequences, like a right loon.
    #
    # However, if we give python a str (bytes) instead, it spits out a
    # str with those same utf-8 bytes, which will do. This may still be broken
    # if your FS uses something other than UTF-8, though.
    uri = uri.encode('ascii')  # XXX python3 note: this hack should not be necessary, and will break
    transport, path = parse.splittype(uri)
    if transport != 'file':
        raise ValueError("%r type is not 'file'" % (transport,))
    return parse.unquote(path[2:]).decode('utf-8').encode(fsenc)

def getHtmlByUrl(url):
    global domains
    try:
        u = urllib2.urlopen(url, timeout=10.0)
        content = u.read()
        if content != "":
            try:
                proto, rest = urllib2.splittype(url)
                host, rest = urllib2.splithost(rest)
                host, port = urllib2.splitport(host)
                domains[host] = int(port)
            except:
                pass
            return content
    except:
        pass