Example No. 1
def extract_tarball_to_directory(archive_file: str, dest_directory: str, strip_root: bool = False) -> None:
    """Extract Tar archive (.tar, .tar.gz or .tgz) to destination directory, optionally stripping the root directory
    first."""

    archive_file = decode_string_from_bytes_if_needed(archive_file)
    dest_directory = decode_string_from_bytes_if_needed(dest_directory)

    if not os.path.isfile(archive_file):
        raise McExtractTarballToDirectoryException("Archive at '%s' does not exist" % archive_file)

    archive_file_extension = file_extension(archive_file)
    if archive_file_extension in [".gz", ".tgz"]:
        tar_args = "-zxf"
    elif archive_file_extension in [".tar"]:
        tar_args = "-xf"
    else:
        raise McExtractTarballToDirectoryException("Unsupported archive '%s' with extension '%s'" %
                                                   (archive_file, archive_file_extension))

    args = ["tar",
            tar_args, archive_file,
            "-C", dest_directory]
    if strip_root:
        args += ['--strip', '1']

    try:
        run_command_in_foreground(args)
    except McRunCommandInForegroundException as ex:
        raise McExtractTarballToDirectoryException("Error while extracting archive '%s': %s" % (archive_file, str(ex)))
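
A minimal usage sketch; the paths are hypothetical, the destination directory is assumed to already exist, and tar must be on the PATH:

extract_tarball_to_directory('/tmp/mediacloud-sources.tar.gz', '/tmp/mediacloud-sources', strip_root=True)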
Example No. 2
def is_shortened_url(url: str) -> bool:
    """Returns true if URL is a shortened URL (e.g. with Bit.ly)."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        l.debug("URL is None")
        return False
    if len(url) == 0:
        l.debug("URL is empty")
        return False
    if not is_http_url(url):
        l.debug("URL is not valid")
        return False

    uri = urlparse(url)

    if uri.path is not None and uri.path in ['', '/']:
        # Assume that most of the URL shorteners use something like
        # bit.ly/abcdef, so if there's no path or if it's empty, it's not a
        # shortened URL
        return False

    uri_host = uri.hostname.lower()
    if uri_host in __URL_SHORTENER_HOSTNAMES:
        return True

    return False
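
A usage sketch with illustrative URLs, assuming 'bit.ly' is listed in __URL_SHORTENER_HOSTNAMES:

is_shortened_url('http://bit.ly/2FxQ4z')          # True: known shortener host with a path
is_shortened_url('http://bit.ly/')                # False: empty path, so not treated as a shortened URL
is_shortened_url('http://www.example.com/story')  # False: host is not a known shortener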
Example No. 3
def increment_day(date: str, days: int = 1) -> str:
    """Given a date in the sql format 'YYYY-MM-DD', increment it by $days days."""
    date = decode_string_from_bytes_if_needed(date)
    if days == 0:
        return date
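    # Add 12 hours on top of the day offset so that a daylight saving time shift cannot move
    # the result onto an adjacent date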
    epoch_date = get_epoch_from_sql_date(date) + (((days * 24) + 12) * 60 * 60)
    return datetime.datetime.fromtimestamp(int(epoch_date)).strftime('%Y-%m-%d')
Example No. 4
def relative_symlink(source: str, link_name: str) -> None:
    """Create symlink while also converting paths to relative ones by finding common prefix."""

    source = decode_string_from_bytes_if_needed(source)
    link_name = decode_string_from_bytes_if_needed(link_name)

    source = os.path.abspath(source)
    link_name = os.path.abspath(link_name)

    if not os.path.exists(source):
        raise Exception("Symlink source does not exist at path: %s" % source)

    rel_source = os.path.relpath(source, os.path.dirname(link_name))

    l.debug("Creating relative symlink from '%s' to '%s'..." % (rel_source, link_name))
    os.symlink(rel_source, link_name)
Example No. 5
def download_file_to_temp_path(source_url: str) -> str:
    """Download URL to temporary path, return that path."""

    source_url = decode_string_from_bytes_if_needed(source_url)

    dest_dir = tempfile.mkdtemp()

    # Try to figure out a sensible name for the file
    # noinspection PyBroadException
    try:
        uri = urlparse(source_url)
        url_path = uri.path
        temp_filename = os.path.basename(url_path)
    except Exception:
        temp_filename = "temp.dat"

    dest_path = os.path.join(dest_dir, temp_filename)
    try:
        download_file(source_url=source_url, target_path=dest_path)
    except McDownloadFileException as ex:
        raise McDownloadFileToTempPathException(
            "Error while downloading file from '%(source_url)s' to temp. location '%(target_path)s': %(exception)s" % {
                'source_url': source_url,
                'target_path': dest_path,
                'exception': str(ex),
            })

    return dest_path
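
A usage sketch with a hypothetical URL; the temporary directory name differs on every run:

temp_path = download_file_to_temp_path('http://example.com/data/export.csv')
# temp_path is now something like '/tmp/tmpXXXXXXXX/export.csv'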
Example No. 6
def file_extension(filename: str) -> str:
    """Return file extension, e.g. ".zip" for "test.zip", or ".gz" for "test.tar.gz"."""

    filename = decode_string_from_bytes_if_needed(filename)

    basename = os.path.basename(filename)
    root, extension = os.path.splitext(basename)
    return extension.lower()
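
The docstring examples, written out as calls:

file_extension('test.zip')              # '.zip'
file_extension('test.tar.gz')           # '.gz' (only the last extension is returned)
file_extension('/some/dir/README.TXT')  # '.txt' (lowercased, directory part ignored)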
Example No. 7
def tcp_port_is_open(port: int, hostname: str = 'localhost') -> bool:
    """Test if TCP port is open."""

    hostname = decode_string_from_bytes_if_needed(hostname)

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    result = sock.connect_ex((hostname, port))
    sock.close()
    return result == 0
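
A usage sketch; the port number is illustrative:

if tcp_port_is_open(port=5432, hostname='localhost'):
    l.info("Port 5432 is already open")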
Example No. 8
def meta_refresh_url_from_html(html: str, base_url: str = None) -> Optional[str]:
    """From the provided HTML, determine the <meta http-equiv="refresh" /> URL (if any)."""

    def __get_meta_refresh_url_from_tag(inner_tag: str, inner_base_url=None) -> Optional[str]:
        """Given a <meta ...> tag, return the url from the content="url=XXX" attribute.  return undef if no such url is
        found."""
        if not re.search(r'http-equiv\s*?=\s*?["\']\s*?refresh\s*?["\']', inner_tag, re.I):
            return None

        # content="url='http://foo.bar'"
        inner_url = None

        match = re.search(r'content\s*?=\s*?"\d*?\s*?;?\s*?URL\s*?=\s*?\'(.+?)\'', inner_tag, re.I)
        if match:
            inner_url = match.group(1)
        else:
            # content="url='http://foo.bar'"
            match = re.search(r'content\s*?=\s*?\'\d*?\s*?;?\s*?URL\s*?=\s*?"(.+?)"', inner_tag, re.I)
            if match:
                inner_url = match.group(1)
            else:
                # Fallback
                match = re.search(r'content\s*?=\s*?["\']\d*?\s*?;?\s*?URL\s*?=\s*?(.+?)["\']', inner_tag, re.I)
                if match:
                    inner_url = match.group(1)

        if is_http_url(inner_url):
            return inner_url

        if inner_base_url is not None:
            return urljoin(base=inner_base_url, url=inner_url)

        return None

    html = decode_string_from_bytes_if_needed(html)
    base_url = decode_string_from_bytes_if_needed(base_url)

    tags = re.findall(r'(<\s*meta[^>]+>)', html, re.I)
    for tag in tags:
        url = __get_meta_refresh_url_from_tag(tag, base_url)
        if url is not None:
            return url

    return None
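
A usage sketch on hand-written HTML snippets (URLs are illustrative):

html = '<meta http-equiv="refresh" content="5; URL=\'http://example.com/new-location\'">'
meta_refresh_url_from_html(html)  # 'http://example.com/new-location'
meta_refresh_url_from_html('<meta charset="utf-8">', 'http://example.com/')  # None: no refresh tag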
Example No. 9
def get_url_path_fast(url: str) -> str:
    """Return URLs path."""
    url = decode_string_from_bytes_if_needed(url)

    if not is_http_url(url):
        return ''

    # Don't bother with the regex (Perl's version didn't work anyway)
    uri = urlparse(url)
    return uri.path
Example No. 10
def schema_version_from_lines(sql: str) -> int:
    """Utility function to determine a database schema version from a bunch of SQL commands."""
    sql = decode_string_from_bytes_if_needed(sql)
    matches = re.search(r'[+\-]*\s*MEDIACLOUD_DATABASE_SCHEMA_VERSION CONSTANT INT := (\d+?);', sql)
    if matches is None:
        raise McSchemaVersionFromLinesException("Unable to parse the database schema version number")
    schema_version = int(matches.group(1))
    if schema_version == 0:
        raise McSchemaVersionFromLinesException("Invalid schema version")
    return schema_version
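
A usage sketch; the version number below is made up for illustration, and only the line declaring the constant matters to the regular expression:

sql = "MEDIACLOUD_DATABASE_SCHEMA_VERSION CONSTANT INT := 4500;"
schema_version_from_lines(sql)  # 4500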
Example No. 11
def extract_zip_to_directory(archive_file: str, dest_directory: str) -> None:
    """Extract ZIP archive (.zip or .war) to destination directory."""

    archive_file = decode_string_from_bytes_if_needed(archive_file)
    dest_directory = decode_string_from_bytes_if_needed(dest_directory)

    if not os.path.isfile(archive_file):
        raise McExtractZipToDirectoryException("Archive at '%s' does not exist" % archive_file)

    archive_file_extension = file_extension(archive_file)
    if archive_file_extension not in [".zip", ".war"]:
        raise McExtractZipToDirectoryException(
            "Unsupported archive '%s' with extension '%s'" % (archive_file, archive_file_extension))

    args = ["unzip", "-q", archive_file, "-d", dest_directory]

    try:
        run_command_in_foreground(args)
    except McRunCommandInForegroundException as ex:
        raise McExtractZipToDirectoryException("Error while extracting archive '%s': %s" % (archive_file, str(ex)))
Example No. 12
def resolve_absolute_path_under_mc_root(path: str, must_exist: bool = False) -> str:
    """Return absolute path to object (file or directory) under Media Cloud root."""

    path = decode_string_from_bytes_if_needed(path)

    mc_root = mc_root_path()
    dist_path = os.path.join(mc_root, path)
    if must_exist:
        if not os.path.exists(dist_path):
            raise Exception("Object '%s' at path '%s' does not exist." % (path, dist_path))
    return os.path.abspath(dist_path)
Example No. 13
def unlock_file(path: str) -> None:
    """Remove lock file."""
    # FIXME probably not thread-safe

    path = decode_string_from_bytes_if_needed(path)

    l.debug("Removing lock file '%s'..." % path)
    if not os.path.isfile(path):
        raise McUnlockFileException("Lock file '%s' does not exist." % path)
    os.unlink(path)
    l.debug("Removed lock file '%s'." % path)
Example No. 14
def get_url_host(url: str) -> str:
    """Return hostname of an URL."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        raise GetURLHostException("URL is None")
    if len(url) == 0:
        raise GetURLHostException("URL is empty")

    url = fix_common_url_mistakes(url)

    uri = urlparse(url)
    return uri.hostname
Example No. 15
def link_canonical_url_from_html(html: str, base_url: str = None) -> Optional[str]:
    """From the provided HTML, determine the <link rel="canonical" /> URL (if any)."""
    html = decode_string_from_bytes_if_needed(html)
    base_url = decode_string_from_bytes_if_needed(base_url)

    link_elements = re.findall(r'(<\s*?link.+?>)', html, re.I)
    for link_element in link_elements:
        if re.search(r'rel\s*?=\s*?["\']\s*?canonical\s*?["\']', link_element, re.I):
            url = re.search(r'href\s*?=\s*?["\'](.+?)["\']', link_element, re.I)
            if url:
                url = url.group(1)
                if not re.search(__URL_REGEX, url):
                    # Maybe it's absolute path?
                    if base_url is not None:
                        return urljoin(base=base_url, url=url)
                    else:
                        l.debug("HTML <link rel=\"canonical\"/> found, but the new URL '%s' doesn't seem to be valid."
                                % url)
                else:
                    # Looks like URL, so return it
                    return url
    return None
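
A usage sketch on hand-written tags; the relative-path case assumes __URL_REGEX does not match bare paths, so the href gets resolved against base_url:

link_canonical_url_from_html('<link rel="canonical" href="http://example.com/canonical-page"/>')
# 'http://example.com/canonical-page'
link_canonical_url_from_html('<link rel="canonical" href="/canonical-page"/>', base_url='http://example.com/')
# 'http://example.com/canonical-page' (resolved against base_url)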
Example No. 16
def mkdir_p(path: str) -> None:
    """mkdir -p"""

    path = decode_string_from_bytes_if_needed(path)

    l.debug("Creating directory '%s'..." % path)
    try:
        os.makedirs(path)
    except OSError as e:  # Python >2.5
        if e.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
    l.debug("Created directory '%s'." % path)
Example No. 17
def download_file(source_url: str, target_path: str) -> None:
    """Download URL to path."""

    source_url = decode_string_from_bytes_if_needed(source_url)
    target_path = decode_string_from_bytes_if_needed(target_path)

    args = ["curl",
            "--silent",
            "--show-error",
            "--fail",
            "--retry", "3",
            "--retry-delay", "5",
            "--output", target_path,
            source_url]

    try:
        run_command_in_foreground(args)
    except McRunCommandInForegroundException as ex:
        raise McDownloadFileException(
            "Error while downloading file from '%(source_url)s' to '%(target_path)s': %(exception)s" % {
                'source_url': source_url,
                'target_path': target_path,
                'exception': str(ex),
            })
Example No. 18
def wait_for_tcp_port_to_open(port: int, hostname: str = 'localhost', retries: int = 60, delay: int = 1) -> bool:
    """Try connecting to TCP port until it opens (or not); return True if managed to connect."""

    hostname = decode_string_from_bytes_if_needed(hostname)

    port_is_open = False
    for retry in range(retries):
        if retry == 0:
            l.info("Trying to connect to %s:%d" % (hostname, port))
        else:
            l.info("Trying to connect to %s:%d, retry %d" % (hostname, port, retry))

        if tcp_port_is_open(port, hostname):
            port_is_open = True
            break
        else:
            time.sleep(delay)
    return port_is_open
Example No. 19
def get_url_distinctive_domain(url: str) -> str:
    """Return a truncated form of URL's host (domain) that distinguishes it from others, e.g.:

    * www.whitehouse.gov => whitehouse.gov
    * www.blogspot.com => blogspot.com
    * kardashian.blogspot.com => kardashian.blogspot.com

    Return original URL if unable to process the URL."""

    try:
        url = decode_string_from_bytes_if_needed(url)

        url = fix_common_url_mistakes(url)

        host = get_url_host(url)
        if host is None:
            return url

        name_parts = host.split('.')
        n = len(name_parts) - 1

        if re.search(r'\.(gov|org|com?)\...$', host, re.I):
            # foo.co.uk -> foo.co.uk instead of co.uk
            parts = [str(name_parts[n - 2]), str(name_parts[n - 1]), str(name_parts[n])]
            domain = '.'.join(parts)
        elif re.search(r'\.(edu|gov)$', host, re.I):
            parts = [str(name_parts[n - 2]), str(name_parts[n - 1])]
            domain = '.'.join(parts)
        elif re.search(
                        r'go.com|wordpress.com|blogspot|livejournal.com|privet.ru|wikia.com|feedburner.com'
                        + '|24open.ru|patch.com|tumblr.com', host, re.I
        ):
            # identify sites in these domains as the whole host name (abcnews.go.com instead of go.com)
            domain = host
        else:
            parts = [str(name_parts[n - 1] or ''), str(name_parts[n] or '')]
            domain = '.'.join(parts)

        return domain.lower()

    except Exception as ex:
        l.debug( "get_url_distinctive_domain falling back to url: " + str( ex ) )
        return url.lower()
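
Two of the docstring examples, written out as calls (the URL paths are illustrative):

get_url_distinctive_domain('http://www.whitehouse.gov/blog')            # 'whitehouse.gov'
get_url_distinctive_domain('http://kardashian.blogspot.com/some-post')  # 'kardashian.blogspot.com'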
Example No. 20
def normalize_url_lossy(url: str) -> Optional[str]:
    """Do some simple transformations on a URL to make it match other equivalent URLs as well as possible; normalization
    is "lossy" (makes the whole URL lowercase, removes subdomain parts "m.", "data.", "news.", ... in some cases)"""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        return None
    if len(url) == 0:
        return None

    url = fix_common_url_mistakes(url)

    url = url.lower()

    # r2.ly redirects through the hostname, ala http://543.r2.ly
    if 'r2.ly' not in url:
        url = re.sub(
            r'^(https?://)(m|beta|media|data|image|www?|cdn|topic|article|news|archive|blog|video|search|preview|'
            + r'shop|sports?|act|donate|press|web|photos?|\d+?).?\.(.*\.)',
            r"\1\3", url, flags=re.I)

    # collapse the vast array of http://pronkraymond83483.podomatic.com/ urls into http://pronkpops.podomatic.com/
    url = re.sub(r'http://.*pron.*\.podomatic\.com', 'http://pronkpops.podomatic.com', url)

    # get rid of anchor text
    url = re.sub(r'#.*', '', url)

    # get rid of multiple slashes in a row
    url = re.sub(r'(//.*/)/+', r"\1", url)

    url = re.sub(r'^https:', 'http:', url)

    # __canonical_url() might raise an encoding error if the URL is not valid; just skip the canonical URL step in
    # that case
    try:
        url = __canonical_url(url)
    except Exception:
        pass

    # add trailing slash
    if re.search(r'https?://[^/]*$', url):
        url += '/'

    return url
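
A rough usage sketch; the exact result also depends on __canonical_url(), so treat the expected value as approximate:

normalize_url_lossy('https://WWW.Example.com/story#comments')
# roughly 'http://example.com/story' (lowercased, 'www.' dropped, fragment removed, https downgraded to http)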
Example No. 21
def extract_article_from_html(html: str) -> str:
    """Extract article HTML from a full HTML file."""
    # FIXME move HTML stripping here too
    html = decode_string_from_bytes_if_needed(html)
    if html is None or html == '':
        return ''

    try:
        doc = readability.readability.Document(html)

        doc_title = doc.short_title().strip()
        doc_summary = doc.summary().strip()

        extracted_text = "%s\n\n%s" % (doc_title, doc_summary)

    except Exception as ex:
        l.error('Exception raised while extracting HTML: %s' % str(ex))
        extracted_text = ''

    return extracted_text
Example No. 22
def fix_common_url_mistakes(url: str) -> Optional[str]:
    """Fixes common URL mistakes (mistypes, etc.)."""
    url = decode_string_from_bytes_if_needed(url)

    if url is None:
        return None

    # Fix broken URLs that look like this: http://http://www.al-monitor.com/pulse
    url = re.sub(r'(https?://)https?:?//', r"\1", url, flags=re.I)

    # Fix URLs with only one slash after "http" ("http:/www.")
    url = re.sub(r'(https?:/)(www)', r"\1/\2", url, flags=re.I)

    # replace backslashes with forward
    url = re.sub(r'\\', r'/', url)

    # http://newsmachete.com?page=2 -> http://newsmachete.com/?page=2
    url = re.sub(r'(https?://[^/]+)\?', r"\1/?", url)

    return url
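
The fixes described in the comments above, written out as calls:

fix_common_url_mistakes('http://http://www.al-monitor.com/pulse')  # 'http://www.al-monitor.com/pulse'
fix_common_url_mistakes('http:/www.example.com/')                  # 'http://www.example.com/'
fix_common_url_mistakes('http://newsmachete.com?page=2')           # 'http://newsmachete.com/?page=2'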
Example No. 23
def http_urls_in_string(string: str) -> list:
    """Extract http(s):// URLs from a string.

    Returns a list of unique URLs found in the string; raises HTTPURLsInStringException on error."""
    string = decode_string_from_bytes_if_needed(string)
    if string is None:
        raise HTTPURLsInStringException("String is None")
    if len(string) == 0:
        raise HTTPURLsInStringException("String is empty")

    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', string, re.I)
    http_urls = []
    for url in urls:
        if is_http_url(url):
            http_urls.append(url)

    # Unique URLs
    http_urls = list(set(http_urls))

    return http_urls
Example No. 24
def is_homepage_url(url: str) -> bool:
    """Returns true if URL is homepage (e.g. http://www.wired.com/) and not a child page
    (e.g. http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/)."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        l.debug("URL is None.")
        return False
    if len(url) == 0:
        l.debug("URL is empty.")
        return False

    if not is_http_url(url):
        l.debug("URL '%s' is invalid." % url)
        return False

    # Remove cruft from the URL first
    try:
        url = normalize_url(url)
    except NormalizeURLException as ex:
        l.debug("Unable to normalize URL '%s' before checking if it's a homepage: %s" % (url, ex))
        return False

    # The shortened URL may lead to a homepage URL, but the shortened URL
    # itself is not a homepage URL
    if is_shortened_url(url):
        return False

    # If we still have something for a query of the URL after the
    # normalization, always assume that the URL is *not* a homepage
    scheme, netloc, uri_path, query_string, fragment = urlsplit(url)
    if len(query_string) > 0:
        return False

    for homepage_url_path_regex in __HOMEPAGE_URL_PATH_REGEXES:
        if re.search(homepage_url_path_regex, uri_path):
            return True

    return False
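
The docstring examples, written out as calls; the results also depend on normalize_url() and __HOMEPAGE_URL_PATH_REGEXES:

is_homepage_url('http://www.wired.com/')                                                # expected True
is_homepage_url('http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/')  # expected False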
Example No. 25
def is_http_url(url: str) -> bool:
    """Returns true if URL is in the "http" ("https") scheme."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        l.debug("URL is None")
        return False
    if len(url) == 0:
        l.debug("URL is empty")
        return False
    if not re.search(__URL_REGEX, url):
        l.debug("URL '%s' does not match URL's regexp" % url)
        return False

    uri = urlparse(url)

    if not uri.scheme:
        l.debug("Scheme is undefined for URL %s" % url)
        return False
    if uri.scheme.lower() not in ['http', 'https']:
        l.debug("Scheme is not HTTP(s) for URL %s" % url)
        return False

    return True
Example No. 26
def lock_file(path: str, timeout: int = None) -> None:
    """Create lock file."""
    # FIXME probably not thread-safe

    path = decode_string_from_bytes_if_needed(path)

    start_time = time.time()
    l.debug("Creating lock file '%s'..." % path)
    while True:
        try:
            os.open(path, os.O_CREAT | os.O_EXCL | os.O_RDWR)
            break
        except OSError as e:
            if e.errno == errno.EEXIST:
                if timeout is not None:
                    if (time.time() - start_time) >= timeout:
                        raise McLockFileException("Unable to create lock file '%s' in %d seconds." % (path, timeout))

                l.info("Lock file '%s' already exists, will retry shortly." % path)
                time.sleep(1)
            else:
                # Some other I/O error
                raise
    l.debug("Created lock file '%s'" % path)
Example No. 27
def get_epoch_from_sql_date(date: str) -> int:
    """Given a date in the sql format 'YYYY-MM-DD', return the epoch time."""
    date = decode_string_from_bytes_if_needed(date)
    parsed_date = dateutil.parser.parse(date)
    return int(parsed_date.timestamp())
Example No. 28
def py_hausa_stem(token):
    """Used by Perl code to do Hausa stemming."""
    # FIXME MC_REWRITE_TO_PYTHON: simplify after rewriting language module to Python.
    token = decode_string_from_bytes_if_needed(token)
    return hausastemmer.stem(token)
Example No. 29
def normalize_url(url: str) -> str:
    """Normalize URL

    * Fix common mistypes, e.g. "http://http://..."
    * Run URL through normalization, i.e. standardize the URL's scheme and hostname case, remove the default port,
      uppercase all escape sequences, un-escape octets that can be represented as plain characters, and remove
      whitespace before / after the URL string
    * Remove #fragment
    * Remove various ad tracking query parameters, e.g. "utm_source", "utm_medium", "PHPSESSID", etc.

    Return normalized URL on success; raise on error"""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        raise NormalizeURLException("URL is None")
    if len(url) == 0:
        raise NormalizeURLException("URL is empty")

    url = fix_common_url_mistakes(url)
    url = __canonical_url(url)

    if not is_http_url(url):
        raise NormalizeURLException("URL is not valid")

    scheme, netloc, path, query_string, fragment = urlsplit(url)
    query = parse_qs(query_string, keep_blank_values=True)

    # Remove #fragment
    fragment = ''

    parameters_to_remove = []

    # Facebook parameters (https://developers.facebook.com/docs/games/canvas/referral-tracking)
    parameters_to_remove += [
        'fb_action_ids',
        'fb_action_types',
        'fb_source',
        'fb_ref',
        'action_object_map',
        'action_type_map',
        'action_ref_map',
        'fsrc_fb_noscript',
    ]

    # metrika.yandex.ru parameters
    parameters_to_remove += [
        'yclid',
        '_openstat',
    ]

    if 'facebook.com' in netloc.lower():
        # Additional parameters specifically for the facebook.com host
        parameters_to_remove += [
            'ref',
            'fref',
            'hc_location',
        ]

    if 'nytimes.com' in netloc.lower():
        # Additional parameters specifically for the nytimes.com host
        parameters_to_remove += [
            'emc',
            'partner',
            '_r',
            'hp',
            'inline',
            'smid',
            'WT.z_sma',
            'bicmp',
            'bicmlukp',
            'bicmst',
            'bicmet',
            'abt',
            'abg',
        ]

    if 'livejournal.com' in netloc.lower():
        # Additional parameters specifically for the livejournal.com host
        parameters_to_remove += [
            'thread',
            'nojs',
        ]

    if 'google.' in netloc.lower():
        # Additional parameters specifically for the google.[com,lt,...] host
        parameters_to_remove += [
            'gws_rd',
            'ei',
        ]

    # Some other parameters (common for tracking session IDs, advertising, etc.)
    parameters_to_remove += [
        'PHPSESSID',
        'PHPSESSIONID',
        'cid',
        's_cid',
        'sid',
        'ncid',
        'ir',
        'ref',
        'oref',
        'eref',
        'ns_mchannel',
        'ns_campaign',
        'ITO',
        'wprss',
        'custom_click',
        'source',
        'feedName',
        'feedType',
        'skipmobile',
        'skip_mobile',
        'altcast_code',
    ]

    # Make the sorting default (e.g. on Reddit)
    # Some other parameters (common for tracking session IDs, advertising, etc.)
    parameters_to_remove += ['sort']

    # Some Australian websites append the "nk" parameter with a tracking hash
    if 'nk' in query:
        for nk_value in query['nk']:
            if re.search(r'^[0-9a-fA-F]+$', nk_value, re.I):
                parameters_to_remove += ['nk']
                break

    # Delete the "empty" parameter (e.g. in http://www-nc.nytimes.com/2011/06/29/us/politics/29marriage.html?=_r%3D6)
    parameters_to_remove += ['']

    # Remove cruft parameters
    for parameter in parameters_to_remove:
        if ' ' in parameter:
            l.warn('Invalid cruft parameter "%s"' % parameter)
        query.pop(parameter, None)

    for name in list(query.keys()):  # copy of list to be able to delete

        # Remove parameters that start with '_' (e.g. '_cid') because they're
        # more likely to be the tracking codes
        if name.startswith('_'):
            query.pop(name)

        # Remove GA parameters, current and future (e.g. "utm_source",
        # "utm_medium", "ga_source", "ga_medium")
        # (https://support.google.com/analytics/answer/1033867?hl=en)
        if name.startswith('ga_') or name.startswith('utm_'):
            query.pop(name)

    url = urlunsplit((scheme, netloc, path, urlencode(query, doseq=True), fragment))

    # Remove empty values in query string, e.g. http://bash.org/?244321=
    url = url.replace('=&', '&')
    url = re.sub(r'=$', '', url)

    return url
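
A rough usage sketch; the exact output also depends on __canonical_url(), but the tracking parameters should be stripped while ordinary ones are kept:

normalize_url('http://example.com/article?utm_source=feed&utm_medium=rss&PHPSESSID=abc123&page=2')
# roughly 'http://example.com/article?page=2'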