Example #1
# Imports assumed for running this snippet standalone; the function is
# adapted from requests' PreparedRequest.prepare_url, so the exact module
# paths below may differ between requests versions.
import urllib3
from urllib3.exceptions import LocationParseError

from requests.compat import is_py2, urlunparse
from requests.exceptions import InvalidURL, MissingSchema
from requests.models import RequestEncodingMixin
from requests.utils import requote_uri, to_native_string

#: requests keeps its form/query-string encoder as a private static helper.
encode_params = RequestEncodingMixin._encode_params


def prepare_url(url, params):
    """Prepares the given HTTP URL."""
    #: Accept objects that have string representations.
    #: We're unable to blindly call unicode/str functions
    #: as this will include the bytestring indicator (b'')
    #: on python 3.x.
    #: https://github.com/kennethreitz/requests/pull/2238
    if isinstance(url, bytes):
        url = url.decode('utf8')
    else:
        url = unicode(url) if is_py2 else str(url)

    # Don't do any URL preparation for non-HTTP schemes like `mailto`,
    # `data` etc to work around exceptions from `parse_url`, which
    # handles RFC 3986 only.
    if ':' in url and not url.lower().startswith('http'):
        return url

    # Support for unicode domain names and paths.
    try:
        scheme, auth, host, port, path, query, fragment = urllib3.util.url.parse_url(
            url)
    except LocationParseError as e:
        raise InvalidURL(*e.args)

    if not scheme:
        error = (
            "Invalid URL {0!r}: No schema supplied. Perhaps you meant http://{0}?"
        )
        error = error.format(to_native_string(url, 'utf8'))

        raise MissingSchema(error)

    if not host:
        raise InvalidURL("Invalid URL %r: No host supplied" % url)

    # Only want to apply IDNA to the hostname
    try:
        host = host.encode('idna').decode('utf-8')
    except UnicodeError:
        raise InvalidURL('URL has an invalid label.')

    # Carefully reconstruct the network location
    netloc = auth or ''
    if netloc:
        netloc += '@'
    netloc += host
    if port:
        netloc += ':' + str(port)

    # Bare domains aren't valid URLs.
    if not path:
        path = '/'

    if is_py2:
        if isinstance(scheme, str):
            scheme = scheme.encode('utf-8')
        if isinstance(netloc, str):
            netloc = netloc.encode('utf-8')
        if isinstance(path, str):
            path = path.encode('utf-8')
        if isinstance(query, str):
            query = query.encode('utf-8')
        if isinstance(fragment, str):
            fragment = fragment.encode('utf-8')

    if isinstance(params, (str, bytes)):
        params = to_native_string(params)

    enc_params = encode_params(params)
    if enc_params:
        if query:
            query = '%s&%s' % (query, enc_params)
        else:
            query = enc_params

    url = requote_uri(urlunparse([scheme, netloc, path, None, query,
                                  fragment]))
    return url
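A quick usage sketch under the assumptions above (inputs are hypothetical; the exact percent-encoding is whatever requote_uri produces):

if __name__ == '__main__':
    # Params are merged into any existing query string.
    print(prepare_url('http://example.com/some path', {'q': 'test'}))
    # -> http://example.com/some%20path?q=test

    # Non-HTTP schemes are returned untouched.
    print(prepare_url('mailto:someone@example.com', None))
    # -> mailto:someone@example.com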
Example #2
def prepare_url(url, params):
  """Prepares the given HTTP URL."""
  #: Accept objects that have string representations.
  #: We're unable to blindly call unicode/str functions
  #: as this will include the bytestring indicator (b'')
  #: on python 3.x.
  #: https://github.com/kennethreitz/requests/pull/2238
  if isinstance(url, bytes):
    url = url.decode('utf8')
  else:
    url = unicode(url) if is_py2 else str(url)

  # Don't do any URL preparation for non-HTTP schemes like `mailto`,
  # `data` etc to work around exceptions from `parse_url`, which
  # handles RFC 3986 only.
  if ':' in url and not url.lower().startswith('http'):
    return url

  # Support for unicode domain names and paths.
  try:
    scheme, auth, host, port, path, query, fragment = urllib3.util.url.parse_url(url)
  except LocationParseError as e:
    raise InvalidURL(*e.args)

  if not scheme:
    error = ("Invalid URL {0!r}: No schema supplied. Perhaps you meant http://{0}?")
    error = error.format(to_native_string(url, 'utf8'))

    raise MissingSchema(error)

  if not host:
    raise InvalidURL("Invalid URL %r: No host supplied" % url)

  # Only want to apply IDNA to the hostname
  try:
    host = host.encode('idna').decode('utf-8')
  except UnicodeError:
    raise InvalidURL('URL has an invalid label.')

  # Carefully reconstruct the network location
  netloc = auth or ''
  if netloc:
    netloc += '@'
  netloc += host
  if port:
    netloc += ':' + str(port)

  # Bare domains aren't valid URLs.
  if not path:
    path = '/'

  if is_py2:
    if isinstance(scheme, str):
      scheme = scheme.encode('utf-8')
    if isinstance(netloc, str):
      netloc = netloc.encode('utf-8')
    if isinstance(path, str):
      path = path.encode('utf-8')
    if isinstance(query, str):
      query = query.encode('utf-8')
    if isinstance(fragment, str):
      fragment = fragment.encode('utf-8')

  if isinstance(params, (str, bytes)):
    params = to_native_string(params)

  enc_params = encode_params(params)
  if enc_params:
    if query:
      query = '%s&%s' % (query, enc_params)
    else:
      query = enc_params

  url = requote_uri(urlunparse([scheme, netloc, path, None, query, fragment]))
  return url
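The .encode('idna') step both examples use is what converts an internationalized hostname to its punycode form; a tiny illustration:

host = u'bücher.example'
print(host.encode('idna').decode('utf-8'))
# -> xn--bcher-kva.example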
Example #3
    def _load(self, path, skipcrawl=False, originaluri=None, includelogs=False,
              skipinit=False, loadtype='href', loadcomplete=False):
        """Helper function to main load function.

        :param path: path to start load from.
        :type path: str.
        :param skipcrawl: flag to determine if load should traverse found links.
        :type skipcrawl: boolean.
        :param originaluri: variable to assist in determining originating path.
        :type originaluri: str.
        :param includelogs: flag to determine if logs should be downloaded also.
        :type includelogs: boolean.
        :param skipinit: flag to determine if first run of load.
        :type skipinit: boolean.
        :param loadtype: flag to determine if load is meant for only href items.
        :type loadtype: str.
        :param loadcomplete: flag to download the entire monolith
        :type loadcomplete: boolean

        """
        if path.endswith("?page=1"):
            return
        elif not includelogs:
            if "/Logs/" in path:
                return

        #TODO: need to find a better way to support non ascii characters
        path = path.replace("|", "%7C")
        #remove fragments
        newpath = urlparse2.urlparse(path)
        newpath.fragment = ''
        path = urlparse2.urlunparse(newpath)

        LOGGER.debug(u'_loading %s', path)

        if not self.reload:
            if path.lower() in self._visited_urls:
                return

        resp = self._client.get(path)

        if resp.status != 200 and \
                path.lower() == self._client.typepath.defs.biospath:
            raise BiosUnregisteredError()
        elif resp.status != 200:
            path = path + '/'
            resp = self._client.get(path)

            if resp.status == 401:
                raise SessionExpiredRis("Invalid session. Please logout and "\
                                        "log back in or include credentials.")
            elif resp.status != 200:
                return

        if loadtype == "ref":
            self.parse_schema(resp)

        self.queue.put((resp, path, skipinit, self))

        if loadtype == 'href':
            #follow all the href attributes
            if self.is_redfish:
                jsonpath_expr = jsonpath_rw.parse(u"$..'@odata.id'")
            else:
                jsonpath_expr = jsonpath_rw.parse(u'$..href')
            matches = jsonpath_expr.find(resp.dict)

            if 'links' in resp.dict and 'NextPage' in resp.dict['links']:
                if originaluri:
                    next_link_uri = originaluri + '?page=' + \
                                    str(resp.dict['links']['NextPage']['page'])
                    href = u'%s' % next_link_uri

                    self._load(href, originaluri=originaluri, \
                               includelogs=includelogs, skipcrawl=skipcrawl, \
                               skipinit=skipinit)
                else:
                    next_link_uri = path + '?page=' + \
                                    str(resp.dict['links']['NextPage']['page'])

                    href = u'%s' % next_link_uri
                    self._load(href, originaluri=path, includelogs=includelogs,\
                                        skipcrawl=skipcrawl, skipinit=skipinit)

            (newversion, dirmatch) = self.check_for_directory(matches)
            if not newversion and not skipcrawl:
                for match in matches:
                    if path == "/rest/v1":
                        if str(match.full_path) == "links.Schemas.href" or \
                                str(match.full_path) == "links.Registries.href":
                            continue
                    else:
                        if str(match.full_path) == "*****@*****.**" or \
                                str(match.full_path) == "*****@*****.**":
                            continue

                    if match.value == path:
                        continue

                    href = u'%s' % match.value
                    self._load(href, skipcrawl=skipcrawl, \
                           originaluri=originaluri, includelogs=includelogs, \
                           skipinit=skipinit)
            elif not skipcrawl:
                href = u'%s' % dirmatch.value
                self._load(href, skipcrawl=skipcrawl, originaluri=originaluri, \
                                    includelogs=includelogs, skipinit=skipinit)
            if loadcomplete:
                for match in matches:
                    self._load(match.value, skipcrawl=skipcrawl, originaluri=\
                       originaluri, includelogs=includelogs, skipinit=skipinit)
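A minimal sketch of the jsonpath extraction this crawler relies on, using a hypothetical Redfish-style payload; each jsonpath_rw match exposes both the dotted path and the value:

import jsonpath_rw

resp_dict = {
    '@odata.id': '/redfish/v1/',
    'Systems': {'@odata.id': '/redfish/v1/Systems/'},
    'Links': {'Sessions': {'@odata.id': '/redfish/v1/SessionService/Sessions/'}},
}

jsonpath_expr = jsonpath_rw.parse(u"$..'@odata.id'")
for match in jsonpath_expr.find(resp_dict):
    print(str(match.full_path), '->', match.value)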
Example #4
    def _load(self, path, skipcrawl=False, originaluri=None, includelogs=False,
              skipinit=False, loadtype='href', loadcomplete=False):
        """Helper function to main load function.

        :param path: path to start load from.
        :type path: str.
        :param skipcrawl: flag to determine if load should traverse found links.
        :type skipcrawl: boolean.
        :param originaluri: variable to assist in determining originating path.
        :type originaluri: str.
        :param includelogs: flag to determine if logs should be downloaded also.
        :type includelogs: boolean.
        :param skipinit: flag to determine if first run of load.
        :type skipinit: boolean.
        :param loadtype: flag to determine if load is meant for only href items.
        :type loadtype: str.
        :param loadcomplete: flag to download the entire monolith
        :type loadcomplete: boolean

        """
        if path.endswith("?page=1"):
            return
        elif not includelogs:
            if "/Logs/" in path:
                return

        #TODO: need to find a better way to support non ascii characters
        path = path.replace("|", "%7C")

        #remove fragments
        newpath = urlparse2.urlparse(path)
        newpath.fragment = ''
        path = urlparse2.urlunparse(newpath)

        LOGGER.debug(u'_loading %s', path)

        if not self.reload:
            if path.lower() in self._visited_urls:
                return

        resp = self._client.get(path)

        if resp.status != 200:
            path = path + '/'
            resp = self._client.get(path)

            if resp.status == 401:
                raise SessionExpiredRis("Invalid session. Please logout and "\
                                        "log back in or include credentials.")
            elif resp.status != 200:
                return

        self.queue.put((resp, path, skipinit, self))

        if loadtype == 'href':
            #follow all the href attributes
            jsonpath_expr = jsonpath_rw.parse(u"$..'@odata.id'")
            matches = jsonpath_expr.find(resp.dict)

            if 'links' in resp.dict and 'NextPage' in resp.dict['links']:
                if originaluri:
                    next_link_uri = originaluri + '?page=' + \
                                    str(resp.dict['links']['NextPage']['page'])
                    href = u'%s' % next_link_uri

                    self._load(href, originaluri=originaluri, \
                               includelogs=includelogs, skipcrawl=skipcrawl, \
                               skipinit=skipinit)
                else:
                    next_link_uri = path + '?page=' + \
                                    str(resp.dict['links']['NextPage']['page'])

                    href = u'%s' % next_link_uri
                    self._load(href, originaluri=path, includelogs=includelogs,\
                                        skipcrawl=skipcrawl, skipinit=skipinit)

            if not skipcrawl:
                for match in matches:
                    if str(match.full_path) == "*****@*****.**" or \
                            str(match.full_path) == "*****@*****.**":
                        continue

                    if match.value == path:
                        continue

                    href = u'%s' % match.value
                    self._load(href, skipcrawl=skipcrawl, \
                           originaluri=originaluri, includelogs=includelogs, \
                           skipinit=skipinit)

            if loadcomplete:
                for match in matches:
                    self._load(match.value, skipcrawl=skipcrawl, originaluri=\
                       originaluri, includelogs=includelogs, skipinit=skipinit)
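A condensed sketch of the crawl pattern both _load variants implement, assuming a hypothetical client whose .get() returns an object with .status and .dict; it follows '@odata.id' links recursively, skips already-visited paths, and stops paging at "?page=1":

import jsonpath_rw

def crawl(client, path, visited=None):
    """Yield (path, body) pairs for every resource reachable from path."""
    visited = visited if visited is not None else set()
    if path.endswith("?page=1") or path.lower() in visited:
        return
    visited.add(path.lower())

    resp = client.get(path)  # hypothetical client object, see lead-in
    if resp.status != 200:
        return
    yield path, resp.dict

    for match in jsonpath_rw.parse(u"$..'@odata.id'").find(resp.dict):
        for found in crawl(client, u'%s' % match.value, visited):
            yield found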
Example #5
    def parse_list_page(self, response):
        multi_xpath = '/html/body/div[@id and @class="c"]'
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        multi_hxs = hxs.select(multi_xpath)
        list_url = response.url
        query = response.meta.get('query')
        for hxs in multi_hxs:
            nick = ''.join(hxs.select('./div[1]/a//text()').extract())
            user_url = ''.join(hxs.select('./div[1]/a/@href').extract())
            user_url = urllib.unquote(user_url).strip()
            # ParseResult is an immutable namedtuple, so drop the query
            # string with _replace() rather than assigning to .query.
            user_url_up = urlparse(user_url)._replace(query='')
            user_url = urlunparse(user_url_up)
            div3 = hxs.select('./div[3]')
            if div3:
                content = ''.join(div3.select('.//text()').extract()[1:-10])
            else:
                content = ''.join(hxs.select('./div[1]/span//text()').extract())
            misc1 = hxs.select('.//a//text()')
            zan_count, zhuanfa_count, pinglun_count = self._ana_misc1(misc1)
            misc2 = hxs.select('.//span[@class="ct"]//text()')
            time, from_info = self._ana_misc2(misc2)
            misc3 = hxs.select('.//a[@class="cc"]/@href')
            own_msg_id, forward_msg_id = self._get_msg_id(misc3)
            own_user_id, forward_user_id = self._get_user_id(misc3)
            if forward_msg_id and forward_user_id:
                is_forward = True
                forward_msg_url1 = 'http://weibo.com/%s/%s' % (forward_user_id, forward_msg_id)
                forward_msg_url2 = 'http://weibo.cn/%s/%s' % (forward_user_id, forward_msg_id)
            else:
                is_forward = False
                forward_msg_url1 = ''
                forward_msg_url2 = ''
            doc = {
                'data_source': '新浪微博搜索',  # "Sina Weibo search"; stored value kept as-is
                'nick': nick,
                'user_url': user_url,
                'content': content,
                'zan_count': zan_count,
                'zhuanfa_count': zhuanfa_count,
                'pinglun_count': pinglun_count,
                'time': time,
                'from_info': from_info,
                'own_user_id': own_user_id,
                'own_msg_id': own_msg_id,
                'own_msg_url1': 'http://weibo.com/%s/%s' % (own_user_id, own_msg_id),
                'own_msg_url2': 'http://weibo.cn/%s/%s' % (own_user_id, own_msg_id),
                'forward_user_id': forward_user_id,
                'forward_msg_id': forward_msg_id,
                'forward_msg_url1': forward_msg_url1,
                'forward_msg_url2': forward_msg_url2,
                'is_forward': is_forward,
                'sort': self.sort,
            }
            #not handling weibo users' homepage avatars for now
            # user_homepage = user_url
            # if not user_homepage:
            #     next_request = None
            # else:
            #     next_request = Request(user_homepage, callback=self.parse_user_homepage)
            item = WeiboItem(doc=doc,
                             next_request=None, list_url=list_url, query=query)
            yield self.item_or_request(item)

    #not handling weibo users' homepage avatars for now
    # def parse_user_homepage(self, response):
    #     item = response.meta['item']
    #     item['doc']['detail'] = response.body_as_unicode()
    #     yield self.item_or_request(item)
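A standalone sketch of the query-stripping idiom used in parse_list_page; since ParseResult is an immutable namedtuple, fields are swapped with _replace():

try:
    from urllib.parse import urlparse, urlunparse   # Python 3
except ImportError:
    from urlparse import urlparse, urlunparse       # Python 2

def strip_query(url):
    """Drop the query string from a URL, keeping everything else."""
    return urlunparse(urlparse(url)._replace(query=''))

print(strip_query('http://weibo.cn/u/12345?from=page_100505'))
# -> http://weibo.cn/u/12345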