def prepare_url(url, params):
    """Prepares the given HTTP URL."""
    #: Accept objects that have string representations.
    #: We're unable to blindly call unicode/str functions
    #: as this will include the bytestring indicator (b'')
    #: on python 3.x.
    #: https://github.com/kennethreitz/requests/pull/2238
    if isinstance(url, bytes):
        url = url.decode('utf8')
    else:
        url = unicode(url) if is_py2 else str(url)

    # Don't do any URL preparation for non-HTTP schemes like `mailto`,
    # `data` etc to work around exceptions from `url_parse`, which
    # handles RFC 3986 only.
    if ':' in url and not url.lower().startswith('http'):
        return url

    # Support for unicode domain names and paths.
    try:
        scheme, auth, host, port, path, query, fragment = \
            urllib3.util.url.parse_url(url)
    except LocationParseError as e:
        raise InvalidURL(*e.args)

    if not scheme:
        error = ("Invalid URL {0!r}: No schema supplied. "
                 "Perhaps you meant http://{0}?")
        error = error.format(to_native_string(url, 'utf8'))
        raise MissingSchema(error)

    if not host:
        raise InvalidURL("Invalid URL %r: No host supplied" % url)

    # Only want to apply IDNA to the hostname
    try:
        host = host.encode('idna').decode('utf-8')
    except UnicodeError:
        raise InvalidURL('URL has an invalid label.')

    # Carefully reconstruct the network location
    netloc = auth or ''
    if netloc:
        netloc += '@'
    netloc += host
    if port:
        netloc += ':' + str(port)

    # Bare domains aren't valid URLs.
    if not path:
        path = '/'

    if is_py2:
        if isinstance(scheme, str):
            scheme = scheme.encode('utf-8')
        if isinstance(netloc, str):
            netloc = netloc.encode('utf-8')
        if isinstance(path, str):
            path = path.encode('utf-8')
        if isinstance(query, str):
            query = query.encode('utf-8')
        if isinstance(fragment, str):
            fragment = fragment.encode('utf-8')

    if isinstance(params, (str, bytes)):
        params = to_native_string(params)

    enc_params = encode_params(params)
    if enc_params:
        if query:
            query = '%s&%s' % (query, enc_params)
        else:
            query = enc_params

    url = requote_uri(urlunparse([scheme, netloc, path, None, query, fragment]))
    return url
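# A minimal usage sketch of the two key transformations prepare_url performs:
# IDNA-encoding the hostname and folding extra params into the query string.
# The host and query values below are hypothetical, not taken from the function.
host = u'пример.испытание'
host = host.encode('idna').decode('utf-8')
print(host)           # xn--e1afmkfd.xn--80akhbyknj4f

query, enc_params = 'a=1', 'b=2'
query = '%s&%s' % (query, enc_params) if query else enc_params
print(query)          # a=1&b=2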
def _load(self, path, skipcrawl=False, originaluri=None, includelogs=False,
          skipinit=False, loadtype='href', loadcomplete=False):
    """Helper function to main load function.

    :param path: path to start load from.
    :type path: str.
    :param skipcrawl: flag to determine if load should traverse found links.
    :type skipcrawl: boolean.
    :param originaluri: variable to assist in determining originating path.
    :type originaluri: str.
    :param includelogs: flag to determine if logs should be downloaded also.
    :type includelogs: boolean.
    :param skipinit: flag to determine if first run of load.
    :type skipinit: boolean.
    :param loadtype: flag to determine if load is meant for only href items.
    :type loadtype: str.
    :param loadcomplete: flag to download the entire monolith
    :type loadcomplete: boolean

    """
    if path.endswith("?page=1"):
        return
    elif not includelogs:
        if "/Logs/" in path:
            return

    #TODO: need to find a better way to support non ascii characters
    path = path.replace("|", "%7C")

    #remove fragments
    newpath = urlparse2.urlparse(path)
    newpath.fragment = ''
    path = urlparse2.urlunparse(newpath)

    LOGGER.debug(u'_loading %s', path)

    if not self.reload:
        if path.lower() in self._visited_urls:
            return

    resp = self._client.get(path)

    if resp.status != 200 and path.lower() == self._client.typepath.defs.biospath:
        raise BiosUnregisteredError()
    elif resp.status != 200:
        path = path + '/'
        resp = self._client.get(path)

        if resp.status == 401:
            raise SessionExpiredRis("Invalid session. Please logout and "
                                    "log back in or include credentials.")
        elif resp.status != 200:
            return

    if loadtype == "ref":
        self.parse_schema(resp)

    self.queue.put((resp, path, skipinit, self))

    if loadtype == 'href':
        #follow all the href attributes
        if self.is_redfish:
            jsonpath_expr = jsonpath_rw.parse(u"$..'@odata.id'")
        else:
            jsonpath_expr = jsonpath_rw.parse(u'$..href')
        matches = jsonpath_expr.find(resp.dict)

        if 'links' in resp.dict and 'NextPage' in resp.dict['links']:
            if originaluri:
                next_link_uri = originaluri + '?page=' + \
                                str(resp.dict['links']['NextPage']['page'])
                href = u'%s' % next_link_uri
                self._load(href, originaluri=originaluri,
                           includelogs=includelogs, skipcrawl=skipcrawl,
                           skipinit=skipinit)
            else:
                next_link_uri = path + '?page=' + \
                                str(resp.dict['links']['NextPage']['page'])
                href = u'%s' % next_link_uri
                self._load(href, originaluri=path, includelogs=includelogs,
                           skipcrawl=skipcrawl, skipinit=skipinit)

        (newversion, dirmatch) = self.check_for_directory(matches)

        if not newversion and not skipcrawl:
            for match in matches:
                if path == "/rest/v1":
                    if str(match.full_path) == "links.Schemas.href" or \
                            str(match.full_path) == "links.Registries.href":
                        continue
                else:
                    if str(match.full_path) == "Registries.@odata.id" or \
                            str(match.full_path) == "JsonSchemas.@odata.id":
                        continue

                if match.value == path:
                    continue

                href = u'%s' % match.value
                self._load(href, skipcrawl=skipcrawl,
                           originaluri=originaluri, includelogs=includelogs,
                           skipinit=skipinit)
        elif not skipcrawl:
            href = u'%s' % dirmatch.value
            self._load(href, skipcrawl=skipcrawl, originaluri=originaluri,
                       includelogs=includelogs, skipinit=skipinit)

    if loadcomplete:
        for match in matches:
            self._load(match.value, skipcrawl=skipcrawl, originaluri=originaluri,
                       includelogs=includelogs, skipinit=skipinit)
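# A minimal sketch of the link-discovery step above in isolation: jsonpath_rw
# pulls every '@odata.id' (Redfish) or 'href' (rest/v1) value out of a response
# body, and each value is fed back into _load. The payload below is a
# hypothetical Redfish collection, not taken from the library.
import jsonpath_rw

payload = {
    "Members": [
        {"@odata.id": "/redfish/v1/Systems/1"},
        {"@odata.id": "/redfish/v1/Chassis/1"},
    ],
    "links": {"NextPage": {"page": 2}},
}
expr = jsonpath_rw.parse(u"$..'@odata.id'")
for match in expr.find(payload):
    print('%s -> %s' % (match.full_path, match.value))
# A 'links.NextPage.page' entry triggers one more recursive _load on
# path + '?page=' + str(page), which is how pagination is walked above.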
def _load(self, path, skipcrawl=False, originaluri=None, includelogs=False,
          skipinit=False, loadtype='href', loadcomplete=False):
    """Helper function to main load function.

    :param path: path to start load from.
    :type path: str.
    :param skipcrawl: flag to determine if load should traverse found links.
    :type skipcrawl: boolean.
    :param originaluri: variable to assist in determining originating path.
    :type originaluri: str.
    :param includelogs: flag to determine if logs should be downloaded also.
    :type includelogs: boolean.
    :param skipinit: flag to determine if first run of load.
    :type skipinit: boolean.
    :param loadtype: flag to determine if load is meant for only href items.
    :type loadtype: str.
    :param loadcomplete: flag to download the entire monolith
    :type loadcomplete: boolean

    """
    if path.endswith("?page=1"):
        return
    elif not includelogs:
        if "/Logs/" in path:
            return

    #TODO: need to find a better way to support non ascii characters
    path = path.replace("|", "%7C")

    #remove fragments
    newpath = urlparse2.urlparse(path)
    newpath.fragment = ''
    path = urlparse2.urlunparse(newpath)

    LOGGER.debug(u'_loading %s', path)

    if not self.reload:
        if path.lower() in self._visited_urls:
            return

    resp = self._client.get(path)

    if resp.status != 200:
        path = path + '/'
        resp = self._client.get(path)

        if resp.status == 401:
            raise SessionExpiredRis("Invalid session. Please logout and "
                                    "log back in or include credentials.")
        elif resp.status != 200:
            return

    self.queue.put((resp, path, skipinit, self))

    if loadtype == 'href':
        #follow all the href attributes
        jsonpath_expr = jsonpath_rw.parse(u"$..'@odata.id'")
        matches = jsonpath_expr.find(resp.dict)

        if 'links' in resp.dict and 'NextPage' in resp.dict['links']:
            if originaluri:
                next_link_uri = originaluri + '?page=' + \
                                str(resp.dict['links']['NextPage']['page'])
                href = u'%s' % next_link_uri
                self._load(href, originaluri=originaluri,
                           includelogs=includelogs, skipcrawl=skipcrawl,
                           skipinit=skipinit)
            else:
                next_link_uri = path + '?page=' + \
                                str(resp.dict['links']['NextPage']['page'])
                href = u'%s' % next_link_uri
                self._load(href, originaluri=path, includelogs=includelogs,
                           skipcrawl=skipcrawl, skipinit=skipinit)

        if not skipcrawl:
            for match in matches:
                if str(match.full_path) == "Registries.@odata.id" or \
                        str(match.full_path) == "JsonSchemas.@odata.id":
                    continue
                if match.value == path:
                    continue

                href = u'%s' % match.value
                self._load(href, skipcrawl=skipcrawl,
                           originaluri=originaluri, includelogs=includelogs,
                           skipinit=skipinit)

    if loadcomplete:
        for match in matches:
            self._load(match.value, skipcrawl=skipcrawl, originaluri=originaluri,
                       includelogs=includelogs, skipinit=skipinit)
def parse_list_page(self, response):
    multi_xpath = '/html/body/div[@id and @class="c"]'
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    multi_hxs = hxs.select(multi_xpath)

    list_url = response.url
    query = response.meta.get('query')
    for hxs in multi_hxs:
        nick = ''.join(hxs.select('./div[1]/a//text()').extract())
        user_url = ''.join(hxs.select('./div[1]/a/@href').extract())
        user_url = urllib.unquote(user_url).strip()
        # Strip the query string; ParseResult is an immutable namedtuple,
        # so rebuild it with _replace instead of assigning to .query.
        user_url_up = urlparse(user_url)._replace(query='')
        user_url = urlunparse(user_url_up)
        div3 = hxs.select('./div[3]')
        if div3:
            content = ''.join(div3.select('.//text()').extract()[1:-10])
        else:
            content = ''.join(hxs.select('./div[1]/span//text()').extract())
        misc1 = hxs.select('.//a//text()')
        zan_count, zhuanfa_count, pinglun_count = self._ana_misc1(misc1)
        misc2 = hxs.select('.//span[@class="ct"]//text()')
        time, from_info = self._ana_misc2(misc2)
        misc3 = hxs.select('.//a[@class="cc"]/@href')
        own_msg_id, forward_msg_id = self._get_msg_id(misc3)
        own_user_id, forward_user_id = self._get_user_id(misc3)
        if forward_msg_id and forward_user_id:
            is_forward = True
            forward_msg_url1 = 'http://weibo.com/%s/%s' % (forward_user_id, forward_msg_id)
            forward_msg_url2 = 'http://weibo.cn/%s/%s' % (forward_user_id, forward_msg_id)
        else:
            is_forward = False
            forward_msg_url1 = ''
            forward_msg_url2 = ''

        doc = {
            'data_source': '新浪微博搜索',  # "Sina Weibo search"
            'nick': nick,
            'user_url': user_url,
            'content': content,
            'zan_count': zan_count,
            'zhuanfa_count': zhuanfa_count,
            'pinglun_count': pinglun_count,
            'time': time,
            'from_info': from_info,
            'own_user_id': own_user_id,
            'own_msg_id': own_msg_id,
            'own_msg_url1': 'http://weibo.com/%s/%s' % (own_user_id, own_msg_id),
            'own_msg_url2': 'http://weibo.cn/%s/%s' % (own_user_id, own_msg_id),
            'forward_user_id': forward_user_id,
            'forward_msg_id': forward_msg_id,
            'forward_msg_url1': forward_msg_url1,
            'forward_msg_url2': forward_msg_url2,
            'is_forward': is_forward,
            'sort': self.sort,
        }
        # Skipping the weibo user's homepage avatar for now
        # user_homepage = user_url
        # if not user_homepage:
        #     next_request = None
        # else:
        #     next_request = Request(user_homepage, callback=self.parse_user_homepage)
        item = WeiboItem(doc=doc, next_request=None, list_url=list_url,
                         query=query)
        yield self.item_or_request(item)

# Skipping the weibo user's homepage avatar for now
# def parse_user_homepage(self, response):
#     item = response.meta['item']
#     item['doc']['detail'] = response.body_as_unicode()
#     yield self.item_or_request(item)
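# A minimal sketch of the user_url cleanup done per post above, assuming
# Python 2 to match the spider's use of urllib.unquote; the sample URL is
# hypothetical.
import urllib
from urlparse import urlparse, urlunparse

user_url = urllib.unquote('http://weibo.com/u/1234567890?from=feed&retcode=6102').strip()
cleaned = urlunparse(urlparse(user_url)._replace(query=''))
print(cleaned)    # http://weibo.com/u/1234567890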