Ejemplo n.º 1
0
def filter_query_params(url: str, parsed: up.ParseResult) -> up.ParseResult:
    r"""
    Remove the following query params from an URL:

     * ``sid=\w+``: SIDs are mostly used by magento to track the users when cookies are disabled
     * ``s=\w{32}``: same as SID, but from vBulletin sites
     * ``replytocom=\d+``: used by wordpress when clicking on "answer" from a comment

    Parameters with a blank value (``a=``) are preserved, and the parsed URL
    is returned untouched when no parameter actually had to be removed.

    :param url: the url (not used by this function)
    :param parsed: the parsed url
    :return: the parsed url without sid and the likes
    """
    # Cheap substring pre-check; it may yield false positives (e.g. ``tags=``
    # contains ``s=``), which the exact key comparison below sorts out.
    if not any(marker in parsed.query for marker in ('s=', 'sid=', 'replytocom=')):
        return parsed

    # NOTE: parse_qsl + urlencode do not round-trip the original escaping, e.g.:
    #   >>> up.parse_qsl('a=%7E_%7E%3B')
    #   [('a', '~_~;')]
    #   >>> up.urlencode([('a', '~_~;')])
    #   'a=~_~%3B'
    # so the query is only rewritten when something was really filtered out.
    pairs = up.parse_qsl(parsed.query, keep_blank_values=True)
    kept = [
        (key, value) for key, value in pairs
        if not (key == 'sid'  # magento
                or (key == 's' and len(value) == 32)  # vBulletin
                or key == 'replytocom')  # wordpress
    ]
    if len(kept) == len(pairs):
        return parsed
    return parsed._replace(query=up.urlencode(kept))
Ejemplo n.º 2
0
    def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
        """Return ``parsed_url`` with every ``fbclid`` query parameter removed.

        The remaining parameters (blank values included) are re-encoded into
        the query string.
        """
        remaining = {
            name: values
            for name, values in parse_qs(parsed_url.query,
                                         keep_blank_values=True).items()
            if name != "fbclid"
        }
        new_query = urlencode(remaining, doseq=True)
        return parsed_url._replace(query=new_query)
Ejemplo n.º 3
0
    def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
        """Point a mobile Wikipedia URL at the matching desktop host.

        The new netloc is built from ``parsed_url.hostname`` alone, with
        ``.m.wikipedia.org`` swapped for ``.wikipedia.org``.
        """
        hostname = parsed_url.hostname
        assert hostname is not None  # mypy workaround

        desktop_host = hostname.replace(".m.wikipedia.org", ".wikipedia.org")
        return parsed_url._replace(netloc=desktop_host)
Ejemplo n.º 4
0
 def __pre_url(parsed_url: parse.ParseResult, page: int):
     """Return ``parsed_url`` with its query rewritten for the previous page.

     When ``page`` is 1 the same page number is used instead of ``page - 1``.
     The query itself is produced by ``APIPagination.__set_page``.
     """
     target_page = page if page == 1 else page - 1
     return parsed_url._replace(
         query=APIPagination.__set_page(parsed_url.query, target_page))
Ejemplo n.º 5
0
 def __next_url(parsed_url: parse.ParseResult, page: int, total_pages: int):
     """Return ``parsed_url`` with its query rewritten for the next page.

     When ``page`` equals ``total_pages`` the same page number is used instead
     of ``page + 1``. The query itself is produced by
     ``APIPagination.__set_page``.
     """
     target_page = page if page == total_pages else page + 1
     return parsed_url._replace(
         query=APIPagination.__set_page(parsed_url.query, target_page))
def canonical_url(url):
    """Convert a string to a Cargo Canonical URL.

    Modelled on https://github.com/rust-lang/cargo/blob/35c55a93200c84a4de4627f1770f76a8ad268a39/src/cargo/util/canonical_url.rs#L19

    The result is a ``ParseResult`` whose params, query and fragment are
    ``None``; trailing slashes and a trailing ``.git`` are removed from the
    path, and github.com URLs get an ``https`` scheme and a lower-cased path.
    """
    # Hrm. The upstream cargo does not replace those URLs, but if we don't then it doesn't work too well :(
    parts = urlparse(url.replace('git+https://', 'https://'))

    scheme = parts.scheme
    host = parts.netloc
    path = parts.path.rstrip('/')

    if host == 'github.com':
        scheme = 'https'
        path = path.lower()

    suffix = '.git'
    if path.endswith(suffix):
        path = path[:len(path) - len(suffix)]

    # It seems cargo drops query and fragment
    return ParseResult(scheme, host, path, None, None, None)
Ejemplo n.º 7
0
def canonical_url(url):
    """Convert a string to a Cargo Canonical URL.

    Modelled on https://github.com/rust-lang/cargo/blob/35c55a93200c84a4de4627f1770f76a8ad268a39/src/cargo/util/canonical_url.rs#L19

    :param url: the URL string to canonicalise
    :return: a ``ParseResult`` whose params, query and fragment are ``None``,
        with trailing slashes and a trailing ``.git`` removed from the path;
        github.com URLs additionally get an ``https`` scheme and a
        lower-cased path
    """
    logging.debug("canonicalising %s", url)
    # Hrm. The upstream cargo does not replace those URLs, but if we don't then it doesn't work too well :(
    url = url.replace("git+https://", "https://")
    u = urlparse(url)
    # It seems cargo drops query and fragment
    u = ParseResult(u.scheme, u.netloc, u.path.rstrip('/'), None, None, None)

    if u.netloc == "github.com":
        u = u._replace(scheme="https", path=u.path.lower())

    if u.path.endswith(".git"):
        # ParseResult is an immutable namedtuple: assigning to ``u.path``
        # raises AttributeError, so a modified copy has to be built instead.
        u = u._replace(path=u.path[:-len(".git")])

    return u
Ejemplo n.º 8
0
def fix_freedesktop_org_url(parsed: ParseResult, branch: Optional[str],
                            subpath: Optional[str]):
    """Rewrite an anongit.freedesktop.org URL to gitlab.freedesktop.org.

    For that host, the URL is switched to ``https`` on
    ``gitlab.freedesktop.org`` and a leading ``/git`` path component is
    dropped; ``branch`` and ``subpath`` are passed through unchanged.

    :return: ``(new_parsed, branch, subpath)`` for anongit URLs, otherwise
        ``(None, None, None)``
    """
    if parsed.netloc != "anongit.freedesktop.org":
        return None, None, None

    prefix = "/git"
    new_path = parsed.path
    if new_path.startswith(prefix + "/"):
        # Drop the prefix but keep the slash that follows it.
        new_path = new_path[len(prefix):]

    migrated = parsed._replace(scheme="https",
                               netloc="gitlab.freedesktop.org",
                               path=new_path)
    return migrated, branch, subpath
Ejemplo n.º 9
0
    def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
        """Return ``parsed_url`` without any ``utm_*`` query parameters.

        The remaining parameters (blank values included) are re-encoded into
        the query string.
        """
        kept = {}
        all_params = parse_qs(parsed_url.query, keep_blank_values=True)
        for name, values in all_params.items():
            if name.startswith("utm_"):
                continue
            kept[name] = values

        return parsed_url._replace(query=urlencode(kept, doseq=True))
Ejemplo n.º 10
0
def fix_twitter_url(url: str,
                    parsed: up.ParseResult) -> Optional[up.ParseResult]:
    """
    Normalise a twitter.com URL (URLs whose netloc does not contain
    ``twitter.com`` are returned as they are).

    The following steps are applied in order:

    - sharing intents (paths starting with ``/intent`` or ``/share``) are ignored;
    - the scheme is forced to https;
    - a subdomain listed in ``_twitter_remap`` is handed to its handler,
      which returns the new parsed URL or None to ignore the URL;
    - query parameters listed in ``_twitter_qs_blacklist`` are stripped.

    :param url: the url (not used by this function)
    :param parsed: a parsed URL
    :return: the fixed URL or None if the URL should be ignored
    """
    if 'twitter.com' not in parsed.netloc:
        return parsed

    # twitter.com/share and twitter.com/intent are both used for sharing
    if parsed.path.startswith(('/intent', '/share')):
        return None

    # always https
    if parsed.scheme != 'https':
        parsed = parsed._replace(scheme='https')

    # subdomain handling
    if '.twitter.com' in parsed.netloc:
        subdomain = parsed.netloc.replace('.twitter.com', '')
        if subdomain in _twitter_remap:
            handler = _twitter_remap[subdomain]
            parsed = handler(parsed, subdomain)
            if parsed is None:
                return None

    # drop the uninteresting query parameters
    pairs = up.parse_qsl(parsed.query)
    if not pairs:
        return parsed
    kept = [(key, value) for key, value in pairs
            if key not in _twitter_qs_blacklist]
    return parsed._replace(query=up.urlencode(kept))
Ejemplo n.º 11
0
def fix_path_in_port(parsed: ParseResult, branch: Optional[str],
                     subpath: Optional[str]):
    """Move a non-numeric "port" of a known hosting site into the path.

    Applies when the netloc has the form ``host:text``, the host (after any
    ``user@`` part) is ``github.com`` or one of ``KNOWN_GITLAB_SITES``, and
    ``text`` is neither empty nor all digits. ``text`` then becomes the first
    path component and the netloc is reduced to the host part.

    :return: ``(new_parsed, branch, subpath)`` when the fix applies,
        otherwise ``(None, None, None)``
    """
    not_applicable = (None, None, None)

    netloc = parsed.netloc
    # A netloc ending in "]" is skipped even though it contains colons.
    if netloc.endswith("]") or ":" not in netloc:
        return not_applicable

    host, _, port = netloc.rpartition(":")
    if host.split("@")[-1] not in (KNOWN_GITLAB_SITES + ["github.com"]):
        return not_applicable
    if not port or port.isdigit():
        return not_applicable

    new_path = "/".join((port, parsed.path.lstrip("/")))
    return parsed._replace(path=new_path, netloc=host), branch, subpath
Ejemplo n.º 12
0
    def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
        """Expand a youtu.be short link into a full YoutTube watch URL.

        For example https://youtu.be/asdf becomes
        https://www.youtube.com/watch?v=asdf, with any other query params
        of the short link kept after ``v``.
        """
        video_id = parsed_url.path.strip("/")

        # Build an ordered list of pairs so v= is always the first query param.
        pairs = [("v", video_id)]
        pairs.extend(parse_qsl(parsed_url.query, keep_blank_values=True))

        return parsed_url._replace(
            netloc="www.youtube.com",
            path="/watch",
            query=urlencode(pairs),
        )
Ejemplo n.º 13
0
def fix_fb_url(url: str, parsed: up.ParseResult) -> Optional[up.ParseResult]:
    """
    Normalise a facebook.com URL (URLs whose netloc does not contain
    ``facebook.com`` are returned as they are).

    The following steps are applied in order:

    - the scheme is forced to https;
    - a subdomain listed in ``_facebook_remap`` is handed to its handler,
      which returns the new parsed URL or None to ignore the URL;
    - any other subdomain starting with a language code such as ``de-de``
      is handed to ``_fb_remap``;
    - all query parameters are stripped off.

    :param url: the url (not used by this function)
    :param parsed: a parsed URL
    :return: the fixed URL or None if the URL should be ignored
    """
    if 'facebook.com' not in parsed.netloc:
        return parsed

    # always https
    if parsed.scheme != 'https':
        parsed = parsed._replace(scheme='https')

    # what is left of the netloc once the domain is removed, minus one trailing dot
    prefix = parsed.netloc.replace('facebook.com', '')
    subdomain = prefix[:-1] if prefix.endswith('.') else prefix

    # subdomain handling
    if subdomain in _facebook_remap:
        handler = _facebook_remap[subdomain]
        parsed = handler(parsed, subdomain)
    elif re.match('[a-z]{2}-[a-z]{2}', subdomain):
        parsed = _fb_remap(parsed, subdomain)

    # strip off all query parameters (TODO: really that clever ?)
    return None if parsed is None else parsed._replace(query='')
Ejemplo n.º 14
0
def path_as_href(path: str, into_url: ParseResult = None) -> str:
    """Returns the string to use for referring to the given path in a file.

    This percent-encodes characters as necessary to make the path a valid URL.
    If into_url is provided, it copies every part of that URL except the path
    into the resulting URL.

    Note that if into_url contains a scheme or netloc, the given path must be absolute.

    :param path: the path to refer to
    :param into_url: optional parsed URL supplying every component but the path
    :return: the percent-encoded path, or the full URL when into_url is given
    :raises ValueError: if path is relative while into_url has a scheme or netloc
    """
    urlpath = quote(path)
    if into_url:
        if (into_url.scheme or into_url.netloc) and not os.path.isabs(path):
            # The two message parts are adjacent literals, so the separating
            # space has to be spelled out explicitly.
            raise ValueError(
                f'Cannot put a relative path [{path}] '
                f'into a URL with scheme or host/port [{into_url}]')
        return urlunparse(into_url._replace(path=urlpath))
    return urlpath
Ejemplo n.º 15
0
def build_return_url(redirect_uri: ParseResult,
                     **params: Optional[str]) -> str:
    """Build the URL to send the user back to after a redirect.

    Parameters
    ----------
    redirect_uri : `urllib.parse.ParseResult`
        The parsed return URI from the client.
    **params : `str` or `None`
        Extra query parameters appended after the ones already present in
        that URI.  Parameters whose value is `None` are skipped.

    Returns
    -------
    return_url : `str`
        The return URL to which the user should be redirected.
    """
    pairs = []
    if redirect_uri.query:
        pairs = parse_qsl(redirect_uri.query)
    for name, value in params.items():
        if value is not None:
            pairs.append((name, value))
    return redirect_uri._replace(query=urlencode(pairs)).geturl()
def to_pf_url(url: ParseResult):
    """
    Return the *P*ath and *F*ile form of ``url``, as defined here:
    https://gist.github.com/andrewdotn/eebeaa60d48c3c0f6f9fc75f0ede8d03#proposal

    The scheme and netloc are blanked out; path, params, query and fragment
    are kept.
    """
    components = ("", "", url.path, url.params, url.query, url.fragment)
    return urlunparse(components)
Ejemplo n.º 17
0
 def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
     """Return ``parsed_url`` with its netloc replaced by ``twitter.com``."""
     canonical_netloc = "twitter.com"
     return parsed_url._replace(netloc=canonical_netloc)
Ejemplo n.º 18
0
def remove_target_url(url: ParseResult) -> ParseResult:
    """Return ``url`` with the ``targetUrl`` query parameter removed.

    A URL without a ``targetUrl`` parameter is accepted; its other
    parameters are simply re-encoded.

    :param url: the parsed URL to clean
    :return: a copy of ``url`` whose query no longer contains ``targetUrl``
    """
    parsed_query = parse_qs(url.query)
    parsed_query.pop('targetUrl', None)

    # parse_qs maps each name to a *list* of values; doseq=True makes
    # urlencode emit one name=value pair per item instead of the list's repr.
    return url._replace(query=urlencode(parsed_query, doseq=True))
Ejemplo n.º 19
0
def set_target_url(url: ParseResult, targetUrl: str) -> ParseResult:
    """Return ``url`` with its ``targetUrl`` query parameter set to ``targetUrl``.

    Any existing ``targetUrl`` value is replaced; the other parameters are
    kept.

    :param url: the parsed URL to update
    :param targetUrl: the value to store in the ``targetUrl`` parameter
    :return: a copy of ``url`` carrying the new query string
    """
    parsed_query = parse_qs(url.query)
    parsed_query['targetUrl'] = targetUrl

    # parse_qs maps each name to a *list* of values; doseq=True makes
    # urlencode emit one name=value pair per item instead of the list's repr.
    return url._replace(query=urlencode(parsed_query, doseq=True))
Ejemplo n.º 20
0
 def __last_url(parsed_url: parse.ParseResult, total_pages: int) -> parse.ParseResult:
     """Return ``parsed_url`` with its query rewritten for the last page.

     The query is produced by ``APIPagination.__set_page`` from the current
     query and ``total_pages``.
     """
     return parsed_url._replace(
         query=APIPagination.__set_page(parsed_url.query, total_pages))
Ejemplo n.º 21
0
 def __first_url(parsed_url: parse.ParseResult, first_page: int = 1) -> parse.ParseResult:
     """Return ``parsed_url`` with its query rewritten for the first page.

     The query is produced by ``APIPagination.__set_page`` from the current
     query and ``first_page`` (1 unless overridden).
     """
     return parsed_url._replace(
         query=APIPagination.__set_page(parsed_url.query, first_page))