def _set_query_parameter(self, url, param_name, param_value):
    scheme, netloc, path, query_string, fragment = urlparse.urlsplit(url)
    query_params = urlparse.parse_qs(query_string)
    query_params[param_name] = [param_value]
    new_query_string = urllib.urlencode(query_params, doseq=True)
    return urlparse.urlunsplit(
        (scheme, netloc, path, new_query_string, fragment))
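
# Illustrative usage sketch (not part of the original source): the method never
# touches `self`, so a quick check can pass any placeholder for it. Note that
# parse_qs returns a plain dict, so with several parameters the order of the
# rebuilt query string is not guaranteed.
#
#   >>> _set_query_parameter(None, 'http://example.com/search?page=1', 'page', '2')
#   'http://example.com/search?page=2'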
def _build_api_url(url, query):
    scheme, netloc, path, base_query, fragment = urlparse.urlsplit(url)
    if base_query:
        query = '%s&%s' % (base_query, query)
    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
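
# Illustrative usage sketch (not part of the original source): any query string
# already present on the base url is kept and the extra query is appended
# after an '&'.
#
#   >>> _build_api_url('https://api.example.com/v1/items?limit=10', 'page=2')
#   'https://api.example.com/v1/items?limit=10&page=2'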
def url_fix(s, charset='utf-8'):
    if isinstance(s, unicode):
        s = s.encode(charset, 'ignore')
    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
    path = urllib.quote(path, '/%')
    qs = urllib.quote_plus(qs, ':&=')
    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
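
# Illustrative usage sketch (not part of the original source, Python 2): spaces
# in the path are percent-encoded, while spaces in the query become '+' and the
# key/value delimiters are preserved.
#
#   >>> url_fix(u'http://example.com/some path?q=a b')
#   'http://example.com/some%20path?q=a+b'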
def get_user_details(self, response):
    """Generate username from identity url"""
    values = super(LiveJournalBackend, self).get_user_details(response)
    values['username'] = values.get('username') or \
        urlparse.urlsplit(response.identity_url).netloc.split('.', 1)[0]
    return values
def parse_uri(uri):
    """Parse absolute URI then return host, port and resource."""
    parsed = urlparse.urlsplit(uri)
    if parsed.scheme != 'wss' and parsed.scheme != 'ws':
        # |uri| must be a relative URI.
        # TODO(toyoshim): Should validate |uri|.
        return None, None, uri

    if parsed.hostname is None:
        return None, None, None

    port = None
    try:
        port = parsed.port
    except ValueError:
        # The port property raises ValueError on an invalid null port
        # description such as 'ws://host:/path'.
        return None, None, None

    if port is None:
        if parsed.scheme == 'ws':
            port = 80
        else:
            port = 443

    path = parsed.path
    if not path:
        path += '/'
    if parsed.query:
        path += '?' + parsed.query
    if parsed.fragment:
        path += '#' + parsed.fragment

    return parsed.hostname, port, path
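
# Illustrative usage sketch (not part of the original source): the default port
# is derived from the scheme, and the returned resource keeps the query and
# fragment; non-WebSocket URIs are treated as relative.
#
#   >>> parse_uri('ws://echo.example.org/chat?room=1')
#   ('echo.example.org', 80, '/chat?room=1')
#   >>> parse_uri('wss://echo.example.org:8443/chat')
#   ('echo.example.org', 8443, '/chat')
#   >>> parse_uri('/relative/path')
#   (None, None, '/relative/path')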
def YATSServer():
    if hasattr(settings, 'SSO_SERVER'):
        parts = list(urlparse.urlsplit(settings.SSO_SERVER))
        parts[2] = ''
        return urlparse.urlunsplit(parts)
    else:
        return ''
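
# Illustrative usage sketch (not part of the original source): with a
# hypothetical settings.SSO_SERVER of 'https://sso.example.com/accounts/login/',
# blanking the path component yields 'https://sso.example.com'; if the setting
# is absent the function returns ''.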
def url_fix(s, charset='utf-8'):
    # UTF-8 is one of the most commonly used encodings. UTF stands for
    # "Unicode Transformation Format", and the '8' means that 8-bit numbers
    # are used in the encoding.
    if isinstance(s, unicode):
        s = s.encode(charset, 'ignore')
    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
    path = urllib.quote(path, '/%')  # keep '/' and '%' as safe delimiter characters
    qs = urllib.quote_plus(qs, ':&=')  # keep the key-value pair delimiters in the query string
    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
def Shortining_Service(url):
    parts = urlparse.urlsplit(url)
    if not parts.scheme and not parts.hostname:
        # couldn't parse anything sensible, try again with a scheme.
        parts = urlparse.urlsplit("http://" + url)
    if bool(parts.hostname in services and parts.path):
        return 1
    else:
        return -1
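
# Illustrative usage sketch (not part of the original source): `services` is
# assumed to be a module-level collection of known URL-shortener hostnames,
# e.g. services = {'bit.ly', 'goo.gl', 'tinyurl.com'}. Under that assumption:
#
#   Shortining_Service('bit.ly/3abcXYZ')        -> 1   (shortener host with a path)
#   Shortining_Service('http://example.com/a')  -> -1  (host not in services)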
def referrer_path(meta, default=None):
    referrer = meta.get("HTTP_REFERER")
    if not referrer:
        return default
    parsed = urlparse.urlsplit(referrer)
    next_domain = drop_subdomain(parsed.netloc)
    cur_domain = drop_subdomain(meta.get("HTTP_HOST", ""))
    if next_domain != cur_domain:
        return default
    return urlparse.urlunsplit(('', '') + parsed[2:])
def redirect(request, prefix, tiny, converter=default_converter):
    """
    Redirect to a given object from a short URL.
    """
    # Resolve the prefix and encoded ID into a model object and decoded ID.
    # Many things here could go wrong -- bad prefix, bad value in
    # SHORTEN_MODELS, no such model, bad encoding -- so just return a 404 if
    # any of that stuff goes wrong.
    try:
        app_label, model_name = settings.SHORTEN_MODELS[prefix].split('.')
    except KeyError:
        raise Http404('Bad prefix.')
    try:
        model = models.get_model(app_label, model_name)
    except LookupError:
        model = False
    if not model:
        raise Http404('Bad model specified in SHORTEN_MODELS.')
    try:
        id = converter.to_decimal(tiny)
    except ValueError:
        raise Http404('Bad encoded ID.')

    # Try to look up the object. If it's not a valid object, or if it doesn't
    # have an absolute url, bail again.
    obj = get_object_or_404(model, pk=id)
    try:
        url = obj.get_absolute_url()
    except AttributeError:
        raise Http404("'%s' models don't have a get_absolute_url() method." % model.__name__)

    # We might have to translate the URL -- the badly-named get_absolute_url
    # actually returns a domain-relative URL -- into a fully qualified one.

    # If we got a fully-qualified URL, sweet.
    if urlparse.urlsplit(url)[0]:
        return HttpResponsePermanentRedirect(url)

    # Otherwise, we need to make a full URL by prepending a base URL.
    # First, look for an explicit setting.
    if hasattr(settings, 'SHORTEN_FULL_BASE_URL') and settings.SHORTEN_FULL_BASE_URL:
        base = settings.SHORTEN_FULL_BASE_URL

    # Next, if the sites app is enabled, redirect to the current site.
    elif Site._meta.installed:
        base = 'http://%s/' % Site.objects.get_current().domain

    # Finally, fall back on the current request.
    else:
        base = 'http://%s/' % RequestSite(request).domain

    return HttpResponsePermanentRedirect(urlparse.urljoin(base, url))
def get_host_credentials(config, hostname):
    """Get login information for a host `hostname` from marvin's `config`

    @return the tuple username, password for the host else raise KeyError"""
    for zone in config.get('zones', []):
        for pod in zone.get('pods', []):
            for cluster in pod.get('clusters', []):
                for host in cluster.get('hosts', []):
                    url = host.get('url')
                    if str(url).startswith('http'):
                        hostname_marvin = urlparse.urlsplit(str(url)).netloc
                    else:
                        hostname_marvin = str(url)
                    if hostname == hostname_marvin:
                        return host.get('username'), host.get('password')
    raise KeyError("Please provide the marvin configuration file with credentials to your hosts")
def goto(self, href, method='get', **args):
    """
    Monkeypatch the TestResponse.goto method so that it doesn't wipe out
    the scheme and host.
    """
    scheme, host, path, query, fragment = urlparse.urlsplit(href)
    # We don't carry the fragment over to the new request.
    fragment = ''
    href = urlparse.urlunsplit((scheme, host, path, query, fragment))
    href = urlparse.urljoin(self.request.url, href)
    method = method.lower()
    assert method in ('get', 'post'), (
        'Only "get" or "post" are allowed for method (you gave %r)' % method)
    if method == 'get':
        method = self.test_app.get
    else:
        method = self.test_app.post
    return method(href, **args)
def remove_sensitive(cleartext):
    redactedtext = cleartext
    text_index = 0
    while True:
        match = UriCleaner.SENSITIVE_URI_PATTERN.search(redactedtext, text_index)
        if not match:
            break
        o = urlparse.urlsplit(match.group(1))
        if not o.username and not o.password:
            if o.netloc and ":" in o.netloc:
                # Handle the special case of a url like http://username:password
                # that can appear in a url
                (username, password) = o.netloc.split(':')
            else:
                text_index += len(match.group(1))
                continue
        else:
            username = o.username
            password = o.password

        # Given a python MatchObject, with respect to redactedtext, find and
        # replace the first occurrence of username and the first and second
        # occurrence of password
        uri_str = redactedtext[match.start():match.end()]
        if username:
            uri_str = uri_str.replace(username, UriCleaner.REPLACE_STR, 1)
        # 2, just in case the password is $encrypted$
        if password:
            uri_str = uri_str.replace(password, UriCleaner.REPLACE_STR, 2)

        t = redactedtext[:match.start()] + uri_str
        text_index = len(t)
        if match.end() < len(redactedtext):
            t += redactedtext[match.end():]

        redactedtext = t
        if text_index >= len(redactedtext):
            text_index = len(redactedtext) - 1

    return redactedtext
def run_validators(self, value):
    if self.allow_plain_hostname:
        try:
            url_parts = urlparse.urlsplit(value)
            if url_parts.hostname and '.' not in url_parts.hostname:
                netloc = '{}.local'.format(url_parts.hostname)
                if url_parts.port:
                    netloc = '{}:{}'.format(netloc, url_parts.port)
                if url_parts.username:
                    if url_parts.password:
                        netloc = '{}:{}@{}'.format(url_parts.username, url_parts.password, netloc)
                    else:
                        netloc = '{}@{}'.format(url_parts.username, netloc)
                value = urlparse.urlunsplit([
                    url_parts.scheme, netloc, url_parts.path,
                    url_parts.query, url_parts.fragment])
        except Exception:
            raise  # If something fails here, just fall through and let the validators check it.
    super(URLField, self).run_validators(value)
async def update_csvs(self):
    """ Download/update CSVs if so much time has passed since last update """
    self.timestamp = self.settings['timestamp']
    self.maxDataAgeMinutes = self.settings['maxDataAgeMinutes']
    age_delta = epoch_now() - self.timestamp
    if age_delta < self.maxDataAgeMinutes:
        return True
    print('Updating MCP data file extracts...')
    for url in self.data_files:
        fileName = os.path.basename(urlparse.urlsplit(url)[2])
        tempName = fileName + ".tmp"
        await download(url, tempName)
        self.remove_file(fileName)
        os.rename(tempName, fileName)
    return True
def fixurl(url):
    if not isinstance(url, unicode):
        url = url.decode('utf8')
    parsed = urlparse.urlsplit(url)
    userpass, at, hostport = parsed.netloc.rpartition('@')
    user, colon1, pass_ = userpass.partition(':')
    host, colon2, port = hostport.partition(':')
    scheme = parsed.scheme.encode('utf8')
    user = quote(user.encode('utf8'))
    colon1 = colon1.encode('utf8')
    pass_ = quote(pass_.encode('utf8'))
    at = at.encode('utf8')
    host = host.encode('idna')
    colon2 = colon2.encode('utf8')
    port = port.encode('utf8')
    path = '/'.join(  # could be encoded slashes!
        quote(unquote(pce).encode('utf8'), '')
        for pce in parsed.path.split('/'))
    query = quote(unquote(parsed.query).encode('utf8'), '=&?/')
    fragment = quote(unquote(parsed.fragment).encode('utf8'))
    netloc = ''.join((user, colon1, pass_, at, host, colon2, port))
    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
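
# Illustrative usage sketch (not part of the original source, Python 2): the
# host is IDNA-encoded while path and query are re-quoted as UTF-8.
#
#   >>> fixurl(u'http://\u30c6\u30b9\u30c8.example/my title?q=\u30c6')
#   'http://xn--zckzah.example/my%20title?q=%E3%83%86'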
def __init__(self, proxy=None, timeout=60, cert=""):
    self.timeout = timeout
    self.cert = cert
    self.connection = None
    self.host = None
    self.port = None
    self.tls = None
    if isinstance(proxy, str):
        proxy_sp = urlparse.urlsplit(proxy)
        self.proxy = {
            "type": proxy_sp.scheme,
            "host": proxy_sp.hostname,
            "port": proxy_sp.port,
            "user": proxy_sp.username,
            "pass": proxy_sp.password
        }
    elif isinstance(proxy, dict):
        self.proxy = proxy
    else:
        self.proxy = None
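
# Illustrative usage sketch (not part of the original source): passing a proxy
# url string such as 'http://user:secret@127.0.0.1:3128' would populate
# self.proxy as
#
#   {"type": "http", "host": "127.0.0.1", "port": 3128,
#    "user": "user", "pass": "secret"}
#
# while passing an equivalent dict stores it unchanged, and None disables the proxy.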
def thumb(url, **kwargs):
    """
    Inspired by:
    http://tech.yipit.com/2013/01/03/how-yipit-scales-thumbnailing-with-thumbor-and-cloudfront/

    Returns a thumbor url for 'url' with **kwargs as thumbor options.

    Positional arguments:
    url -- the location of the original image

    Keyword arguments:
    For the complete list of thumbor options
    https://github.com/globocom/thumbor/wiki/Usage
    and the actual implementation for the url generation
    https://github.com/heynemann/libthumbor/blob/master/libthumbor/url.py
    """
    THUMBOR_BASE_URL = getattr(settings, "THUMBOR_BASE_URL", None)
    THUMBOR_KEY = getattr(settings, "THUMBOR_KEY", "MY_SECURE_KEY")

    if THUMBOR_BASE_URL:
        base = THUMBOR_BASE_URL
    else:
        # otherwise assume that thumbor is set up behind the same
        # CDN behind the `thumbor` namespace.
        scheme, netloc = urlparse.urlsplit(url)[:2]
        base = f"{scheme}://{netloc}/thumbor"

    crypto = CryptoURL(key=THUMBOR_KEY)
    # just for code clarity
    thumbor_kwargs = kwargs
    if "fit_in" not in thumbor_kwargs:
        thumbor_kwargs["fit_in"] = True
    thumbor_kwargs["image_url"] = url
    path = crypto.generate(**thumbor_kwargs)
    return f"{base}{path}"
def postprocess_document(self, uri, document):
    if uri:
        match_head_begin = self.re_head_begin.search(document)
        if match_head_begin:
            match_base_ref = self.re_base_href.search(
                document, match_head_begin.end(0))
            if not match_base_ref:
                insert = match_head_begin.end(0)
                pre = document[:insert]
                info = urlparse.urlsplit(uri)
                href = info.scheme + "://" + info.netloc
                if href[-1] != "/":
                    href += "/"
                base = "<base href=\"%s\" />" % href
                post = document[insert:]
                document = "".join((pre, base.encode("utf8"), post))
    match_head_end = self.re_head_end.search(document)
    if match_head_end:
        insert = match_head_end.start(0)
        pre = document[:insert]
        post = document[insert:]
        document = ''.join((pre, self.script, post))
    return document
def process_item(self, item, spider):
    item['_id'] = urlparse.urlsplit(item['url']).path
    self.db[self.collection_name].update({'_id': item['_id']}, {'$set': item}, upsert=True)
    return item
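
# Illustrative usage sketch (not part of the original source): the MongoDB _id
# is the path component of the crawled url, so repeated crawls of the same page
# upsert the same document.
#
#   >>> urlparse.urlsplit('http://example.com/posts/42?ref=rss').path
#   '/posts/42'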
def _getSeoLibraryFiltered(startUrl, language, country, useProxy=False):
    import tldextract
    from data_mining.seo_document_downloader import SeoDocumentDownloader
    from config import settings

    '''
    Build the query and download the documents
    '''
    extracted = tldextract.extract(startUrl)
    domain = u'%s.%s' % (extracted.subdomain, extracted.domain) if extracted.subdomain else extracted.domain
    domain = u'%s.%s' % (domain, extracted.suffix)

    # check whether there is a path after the domain
    from urllib import parse as urlparse
    urlpath = [u for u in urlparse.urlsplit(startUrl).path.split('/') if u and u != domain]
    path = []
    if urlpath:
        for index, subpath in enumerate(urlpath):
            if subpath:
                if index < len(urlpath) - 1 or u'.' not in subpath:
                    path.append(subpath)

    if path:
        query = u'site:{}/{}'.format(domain, '/'.join(path))
    else:
        query = u'site:{}'.format(domain)

    print(query)

    seoLibrary = SeoDocumentDownloader(
        query=query,
        language=language,
        country=country,
        searchEngine=settings.DOWNLOADER_SEARCH_ENGINE,
        downloadLimit=settings.SITE_AUDIT_DOWNLOAD_LIMIT,
        sameOrigin=True,
        useProxy=useProxy
    ).getSeoLibrary()

    print(u'Number of original documents: %s' % (len(seoLibrary.seoDocuments)))

    if len(seoLibrary.seoDocuments) < 20:
        raise Exception(u'Error: Not enough documents to get stats')

    '''
    Remove links that are considered paginations
    '''
    DIFFERENCE_LIMIT = 6
    paginator = {}
    links2Remove = []
    import re
    for seoDocument in seoLibrary.seoDocuments:
        applied = re.sub(r'\d', u'', seoDocument.link)
        difference = len(seoDocument.link) - len(applied)
        if difference != 0 and difference <= DIFFERENCE_LIMIT:
            if applied not in paginator:
                paginator[applied] = []
            paginator[applied].append(seoDocument.link)

    for _shortUrl, origins in paginator.items():
        if len(origins) > 1:
            links2Remove.extend(origins)
            # print u'-'*15
            # print u'%s' % _shortUrl
            # for origin in origins:
            #     print u'\t\t%s' % origin

    seoLibrary.seoDocuments = [seoDocument for seoDocument in seoLibrary.seoDocuments
                               if seoDocument.link not in links2Remove]

    print(u'Number of documents after pagination filtering: %s' % (len(seoLibrary.seoDocuments)))

    '''
    Discard documents that do not reach a minimum length
    '''
    import numpy as np
    lengths = [seoDocument.getLenRawTokens() for seoDocument in seoLibrary.seoDocuments]
    percentilLengthText = np.percentile(lengths, 25)
    lowerLimit = max(percentilLengthText, settings.SITE_AUDIT_MIN_DOCUMENT_LENGTH)

    seoLibrary.seoDocuments = [seoDocument for seoDocument in seoLibrary.seoDocuments
                               if seoDocument.getLenRawTokens() > lowerLimit]

    print(u'Number of documents after FIRST length filtering: %s' % (len(seoLibrary.seoDocuments)))

    allSentences = {}
    for seoDocument in seoLibrary.seoDocuments:
        sentences = seoDocument.getSentences()
        for sentence in sentences:
            if sentence not in allSentences:
                allSentences[sentence] = 0
            allSentences[sentence] += 1

    for seoDocument in seoLibrary.seoDocuments:
        sentences = seoDocument.getSentences()
        sentencesFiltered = []
        for sentence in sentences:
            if allSentences[sentence] < 2:
                sentencesFiltered.append(sentence)
        seoDocument.dataDocument.text = u' . '.join(sentencesFiltered)
        seoDocument.resetPreloads()

    lengths = [seoDocument.getLenRawTokens() for seoDocument in seoLibrary.seoDocuments]
    lowerLimit = np.percentile(lengths, 25)

    seoLibrary.seoDocuments = [seoDocument for seoDocument in seoLibrary.seoDocuments
                               if seoDocument.getLenRawTokens() > lowerLimit]

    print(u'Number of documents after SECOND length filtering: %s' % (len(seoLibrary.seoDocuments)))

    return seoLibrary
def post_url(url, fields, files=[]):
    urlparts = urlparse.urlsplit(url)
    return post_multipart(urlparts[1], urlparts[2], fields, files)
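
# Illustrative usage sketch (not part of the original source): urlsplit index 1
# is the network location and index 2 is the path, so the call below would be
# forwarded to post_multipart (defined elsewhere) as
# post_multipart('api.example.com', '/upload', fields, files).
#
#   post_url('http://api.example.com/upload', fields=[('name', 'demo')], files=[])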
def down_html(self, url, dir_name):
    try:
        # extract the __biz parameter
        params = urlparse.urlsplit(url).query
        params = urlparse.parse_qs(params, True)
        if not '__biz' in params:
            # this may be a Sogou link, convert it to a WeChat link first
            url = self.deal_get_real_url(url)
            url = url.replace('\\x26', '&')
            url = url.replace('x26', '&')
        print(url)
        h = httplib2.Http(timeout=30)
        html = self._get_gzh_article_text(url)
        content = html
        # use regular expressions to pull the relevant variables out of the page's JavaScript
        ct = re.findall('var ct = "(.*?)";', content)[0]
        msg_cdn_url = re.findall('var msg_cdn_url = "(.*?)";', content)[0]
        nickname = re.findall('var nickname = "(.*?)";', content)[0]
        if nickname == "":
            nickname = "not has name"
        if ct == "":
            ct = time.time()
        # int() turns the string into a number (no int/long distinction);
        # convert the epoch seconds into a date string
        ctime = time.strftime("%Y%m%d%H%M%S", time.localtime(int(ct)))

        # build the output directory
        # encoding conversion
        if isinstance(dir_name, str):
            dir_name = dir_name.encode('GB18030', 'ignore')
        else:
            dir_name = dir_name.decode('utf-8', 'ignore').encode('GB18030', 'ignore')
        # print
        if isinstance(nickname, str):
            nickname = nickname.encode('GB18030', 'ignore')
        else:
            if chardet.detect(nickname)['encoding'] == 'KOI8-R':
                print("KOI8")
                nickname = nickname.decode('KOI8-R', 'ignore').encode('GB18030', 'ignore')
            else:
                print("GB18030")
                nickname = nickname.decode('utf-8', 'ignore').encode('GB18030', 'ignore')

        dir = 'WeiXinGZH/' + nickname + '/' + ctime + '/' + dir_name + '/'
        # dir = 'WeiXinGZH/' + dir_name + '/'
        dir = dir.decode('gb2312', 'ignore')
        dir = dir.replace("?", "")
        dir = dir.replace("\\", "")
        dir = dir.replace("*", "")
        dir = dir.replace(":", "")
        dir = dir.replace('\"', "")
        dir = dir.replace("<", "")
        dir = dir.replace(">", "")
        dir = dir.replace("|", "")
        try:
            os.makedirs(dir)  # create the corresponding directory
        except:
            # ignore errors
            errormsg = 'none'

        # download the cover image
        url = msg_cdn_url
        print(u'Downloading article: ' + url)
        resp, contentface = h.request(url)
        file_name = dir + 'cover.jpg'
        codecs.open(file_name, mode='wb').write(contentface)

        # download the remaining images
        soup = BeautifulSoup(content, 'html.parser')
        count = 0
        # logger.error(html)
        err_count = 0
        for link in soup.find_all('img'):
            try:
                err_count += 1
                if err_count > 200:
                    break  # guard against runaway loops
                if None != link.get('data-src'):
                    count = count + 1
                    orurl = link.get('data-src')
                    url = orurl.split('?')[0]  # rebuild the url; part of the original url cannot be downloaded
                    # print u'Downloading: ' + url
                    resp, content = h.request(url)
                    matchurlvalue = re.search(r'wx_fmt=(?P<wx_fmt>[^&]*)', orurl)  # without the parameter it could be a gif or a jpg
                    if None != matchurlvalue:
                        wx_fmt = matchurlvalue.group('wx_fmt')  # prefer the wx_fmt parameter value to decide the file type
                    else:
                        wx_fmt = binascii.b2a_hex(content[0:4])  # read the first 4 bytes as a hex string
                    # print wx_fmt
                    phototype = {
                        'jpeg': '.jpg',
                        'gif': '.gif',
                        'png': '.png',
                        'jpg': '.jpg',
                        '47494638': '.gif',
                        'ffd8ffe0': '.jpg',
                        'ffd8ffe1': '.jpg',
                        'ffd8ffdb': '.jpg',
                        'ffd8fffe': '.jpg',
                        'other': '.jpg',
                        '89504e47': '.png'
                    }  # makes it easy to pick the file extension
                    file_name = 'Picture' + str(count) + phototype[wx_fmt]
                    file_path = dir + file_name
                    open(file_path, 'wb').write(content)
                    # replace the image src with the local path
                    re_url = 'data-src="%s(.+?)"' % (url[:-5])
                    re_pic = 'src="%s"' % (file_name)
                    html = re.sub(re_url, re_pic, html)
            except:
                continue

        with open("%sindex.html" % (dir), "wb") as code:
            code.write(html)
        print(u'Article download finished')
        ret_path = os.path.abspath('.')
        ret_path = ret_path.replace('\\', "/")
        ret_path = "%s/%sindex.html" % (ret_path.decode('GB18030').encode('utf-8'), dir)
        # print(ret_path)
    # except:
    except WechatSogouHistoryMsgException:
        print(u'The article contains malformed encoding and cannot be downloaded')
        return ""
    return ret_path
def parse_recipe(self, response):
    hxs = HtmlXPathSelector(response)
    recipe = CookpadRecipe()

    # id
    recipe['id'] = int(re.findall(r'recipe/(\d+)', response.url)[0])

    # name
    recipe['name'] = hxs.select("//div[@id='recipe-title']/h1/text()")[0] \
        .extract().strip()

    # author
    recipe['author'] = int(
        hxs.select("//a[@id='recipe_author_name']/@href").re(r'(\d+)')[0])

    # description
    recipe['description'] = ''.join(hxs.select("//div[@id='description']/text()")
                                    .extract()).strip()

    # ingredients
    ingredients = []
    ingredient_basepath = (
        "//div[@id='ingredients']/div[@id='ingredients_list']/"
        "div[@class='ingredient ingredient_row']")
    ingredient_nodes = hxs.select(ingredient_basepath)
    for ingredient_node in ingredient_nodes:
        try:
            if ingredient_node.select('div/span/a'):
                # keyword ingredient
                name = ingredient_node.select('div[1]/span/a/text()').extract()[0]
            else:
                # normal ingredient
                name = ingredient_node.select('div[1]/span/text()').extract()[0]
            quantity = ingredient_node.select('div[2]/text()').extract()[0]
        except:
            continue
        ingredient = Ingredient()
        ingredient['name'] = name
        ingredient['quantity'] = quantity
        ingredients.append(ingredient)
    recipe['ingredients'] = ingredients

    # instructions
    recipe['instructions'] = hxs.select("//dd[@class='instruction']/p/text()").extract()

    # leaf category
    referer = response.request.headers.get('Referer')
    recipe['category'] = int(os.path.basename(urlparse.urlsplit(referer).path))

    # all categories (including leaf, internal, and root nodes)
    categories = hxs.select("//div[@id='category_list']/ul/li/a/@href").re(r'\d+')
    recipe['categories'] = map(lambda category: int(category), categories)

    # report count
    try:
        recipe['report_count'] = int(''.join(
            hxs.select("//li[@id='tsukurepo_tab']/a/span/text()").re(r'(\d+)')))
    except:
        recipe['report_count'] = 0

    # comment count
    try:
        recipe['comment_count'] = int(''.join(
            hxs.select("//li[@id='comment_tab']/a/span/text()").re(r'(\d+)')))
    except:
        recipe['comment_count'] = 0

    # advice and history
    for text in ('advice', 'history'):
        recipe[text] = ''.join(
            hxs.select("//div[@id='{0}']/text()".format(text)).extract()).strip()

    # related keywords
    recipe['related_keywords'] = hxs.select("//div[@class='related_keywords']/a/text()") \
        .extract()

    # main image
    image_main = hxs.select("//div[@id='main-photo']/img/@src").extract()
    recipe['image_main'] = image_main[0] if image_main else []

    # instruction images
    recipe['images_instruction'] = hxs.select(
        "//dd[@class='instruction']/div/div[@class='image']/img/@src").extract()

    # published date
    recipe['published_date'] = hxs.select(
        "//div[@id='recipe_id_and_published_date']/span[2]/text()").re(
            r'\d{2}/\d{2}/\d{2}')[0]

    # updated date
    recipe['updated_date'] = hxs.select(
        "//div[@id='recipe_id_and_published_date']/span[3]/text()").re(
            r'\d{2}/\d{2}/\d{2}')[0]

    return recipe