def extract_tarball_to_directory(archive_file: str, dest_directory: str, strip_root: bool = False) -> None:
    """Extract Tar archive (.tar, .tar.gz or .tgz) to destination directory,
    optionally stripping the root directory first."""
    archive_file = decode_string_from_bytes_if_needed(archive_file)
    dest_directory = decode_string_from_bytes_if_needed(dest_directory)

    if not os.path.isfile(archive_file):
        raise McExtractTarballToDirectoryException("Archive at '%s' does not exist" % archive_file)

    archive_file_extension = file_extension(archive_file)
    if archive_file_extension in [".gz", ".tgz"]:
        tar_args = "-zxf"
    elif archive_file_extension in [".tar"]:
        tar_args = "-xf"
    else:
        raise McExtractTarballToDirectoryException(
            "Unsupported archive '%s' with extension '%s'" % (archive_file, archive_file_extension))

    args = ["tar", tar_args, archive_file, "-C", dest_directory]
    if strip_root:
        args += ['--strip', '1']

    try:
        run_command_in_foreground(args)
    except McRunCommandInForegroundException as ex:
        raise McExtractTarballToDirectoryException(
            "Error while extracting archive '%s': %s" % (archive_file, str(ex)))

def is_shortened_url(url: str) -> bool:
    """Returns true if URL is a shortened URL (e.g. with Bit.ly)."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        l.debug("URL is None")
        return False
    if len(url) == 0:
        l.debug("URL is empty")
        return False
    if not is_http_url(url):
        l.debug("URL is not valid")
        return False

    uri = urlparse(url)

    if uri.path is not None and uri.path in ['', '/']:
        # Assume that most of the URL shorteners use something like
        # bit.ly/abcdef, so if there's no path or if it's empty, it's not a
        # shortened URL
        return False

    uri_host = uri.hostname.lower()
    if uri_host in __URL_SHORTENER_HOSTNAMES:
        return True

    return False

def increment_day(date: str, days: int = 1) -> str:
    """Given a date in the SQL format 'YYYY-MM-DD', increment it by `days` days."""
    date = decode_string_from_bytes_if_needed(date)
    if days == 0:
        return date

    # Add 12 hours on top of the requested number of days so that DST
    # transitions (23- or 25-hour days) can't shift the result to the wrong day
    epoch_date = get_epoch_from_sql_date(date) + (((days * 24) + 12) * 60 * 60)
    return datetime.datetime.fromtimestamp(int(epoch_date)).strftime('%Y-%m-%d')

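# Worked example (illustrative): thanks to the 12-hour pad above, a DST change
# can't push the result off the intended day, and month / year boundaries and
# negative deltas behave as expected:
#
#     increment_day('2016-12-31')       # -> '2017-01-01'
#     increment_day('2016-02-28', 2)    # -> '2016-03-01' (2016 is a leap year)
#     increment_day('2016-01-10', -1)   # -> '2016-01-09'
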
def relative_symlink(source: str, link_name: str) -> None:
    """Create symlink while also converting paths to relative ones by finding common prefix."""
    source = decode_string_from_bytes_if_needed(source)
    link_name = decode_string_from_bytes_if_needed(link_name)

    source = os.path.abspath(source)
    link_name = os.path.abspath(link_name)

    if not os.path.exists(source):
        raise Exception("Symlink source does not exist at path: %s" % source)

    rel_source = os.path.relpath(source, os.path.dirname(link_name))

    l.debug("Creating relative symlink from '%s' to '%s'..." % (rel_source, link_name))
    os.symlink(rel_source, link_name)

def download_file_to_temp_path(source_url: str) -> str:
    """Download URL to temporary path, return that path."""
    source_url = decode_string_from_bytes_if_needed(source_url)

    dest_dir = tempfile.mkdtemp()

    # Try to figure out a sensible name for the file
    # noinspection PyBroadException
    try:
        uri = urlparse(source_url)
        url_path = uri.path
        temp_filename = os.path.basename(url_path)
    except Exception:
        temp_filename = "temp.dat"

    # The URL path might end in a slash, in which case basename() is empty
    if len(temp_filename) == 0:
        temp_filename = "temp.dat"

    dest_path = os.path.join(dest_dir, temp_filename)

    try:
        download_file(source_url=source_url, target_path=dest_path)
    except McDownloadFileException as ex:
        raise McDownloadFileToTempPathException(
            "Error while downloading file from '%(source_url)s' to temp. location '%(target_path)s': %(exception)s" % {
                'source_url': source_url,
                'target_path': dest_path,
                'exception': str(ex),
            })

    return dest_path

def file_extension(filename: str) -> str:
    """Return file extension, e.g. ".zip" for "test.zip", or ".gz" for "test.tar.gz"."""
    filename = decode_string_from_bytes_if_needed(filename)
    basename = os.path.basename(filename)
    root, extension = os.path.splitext(basename)
    return extension.lower()

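# Note the behavior on double extensions: os.path.splitext() only keeps the
# last suffix, which is why extract_tarball_to_directory() above matches on
# ".gz" / ".tgz" rather than ".tar.gz":
#
#     file_extension('test.zip')       # -> '.zip'
#     file_extension('test.tar.gz')    # -> '.gz'
#     file_extension('/tmp/TEST.ZIP')  # -> '.zip' (lowercased)
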
def tcp_port_is_open(port: int, hostname: str = 'localhost') -> bool:
    """Test if TCP port is open."""
    hostname = decode_string_from_bytes_if_needed(hostname)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    result = sock.connect_ex((hostname, port))
    sock.close()
    return result == 0

def meta_refresh_url_from_html(html: str, base_url: str = None) -> Optional[str]:
    """From the provided HTML, determine the <meta http-equiv="refresh" /> URL (if any)."""

    def __get_meta_refresh_url_from_tag(inner_tag: str, inner_base_url=None) -> Optional[str]:
        """Given a <meta ...> tag, return the URL from the content="url=XXX" attribute.
        Return None if no such URL is found."""
        if not re.search(r'http-equiv\s*?=\s*?["\']\s*?refresh\s*?["\']', inner_tag, re.I):
            return None

        # content="url='http://foo.bar'"
        inner_url = None
        match = re.search(r'content\s*?=\s*?"\d*?\s*?;?\s*?URL\s*?=\s*?\'(.+?)\'', inner_tag, re.I)
        if match:
            inner_url = match.group(1)
        else:
            # content='url="http://foo.bar"'
            match = re.search(r'content\s*?=\s*?\'\d*?\s*?;?\s*?URL\s*?=\s*?"(.+?)"', inner_tag, re.I)
            if match:
                inner_url = match.group(1)
            else:
                # Fallback
                match = re.search(r'content\s*?=\s*?["\']\d*?\s*?;?\s*?URL\s*?=\s*?(.+?)["\']', inner_tag, re.I)
                if match:
                    inner_url = match.group(1)

        if inner_url is None:
            return None

        if is_http_url(inner_url):
            return inner_url

        if inner_base_url is not None:
            return urljoin(base=inner_base_url, url=inner_url)

        return None

    html = decode_string_from_bytes_if_needed(html)
    base_url = decode_string_from_bytes_if_needed(base_url)

    tags = re.findall(r'(<\s*meta[^>]+>)', html, re.I)
    for tag in tags:
        url = __get_meta_refresh_url_from_tag(tag, base_url)
        if url is not None:
            return url

    return None

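# Illustrative inputs for the parser above; both quoting styles are handled,
# and relative targets get resolved against base_url when one is given:
#
#     meta_refresh_url_from_html(
#         '<meta http-equiv="refresh" content="0; url=\'http://example.com/\'">'
#     )
#     # -> 'http://example.com/'
#
#     meta_refresh_url_from_html(
#         '<meta http-equiv="refresh" content="5; url=/landing">',
#         base_url='http://example.com/old',
#     )
#     # -> 'http://example.com/landing'
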
def get_url_path_fast(url: str) -> str:
    """Return URL's path."""
    url = decode_string_from_bytes_if_needed(url)
    if not is_http_url(url):
        return ''

    # Don't bother with the regex (Perl's version didn't work anyway)
    uri = urlparse(url)
    return uri.path

def schema_version_from_lines(sql: str) -> int:
    """Utility function to determine a database schema version from a bunch of SQL commands."""
    sql = decode_string_from_bytes_if_needed(sql)
    matches = re.search(r'[+\-]*\s*MEDIACLOUD_DATABASE_SCHEMA_VERSION CONSTANT INT := (\d+?);', sql)
    if matches is None:
        raise McSchemaVersionFromLinesException("Unable to parse the database schema version number")
    schema_version = int(matches.group(1))
    if schema_version == 0:
        raise McSchemaVersionFromLinesException("Invalid schema version")
    return schema_version

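# Example (illustrative): given SQL text containing the declaration below, the
# function returns 4588; the optional [+-] prefix in the regex tolerates lines
# taken straight out of a diff:
#
#     MEDIACLOUD_DATABASE_SCHEMA_VERSION CONSTANT INT := 4588;
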
def extract_zip_to_directory(archive_file: str, dest_directory: str) -> None:
    """Extract ZIP archive (.zip or .war) to destination directory."""
    archive_file = decode_string_from_bytes_if_needed(archive_file)
    dest_directory = decode_string_from_bytes_if_needed(dest_directory)

    if not os.path.isfile(archive_file):
        raise McExtractZipToDirectoryException("Archive at '%s' does not exist" % archive_file)

    archive_file_extension = file_extension(archive_file)
    if archive_file_extension not in [".zip", ".war"]:
        raise McExtractZipToDirectoryException(
            "Unsupported archive '%s' with extension '%s'" % (archive_file, archive_file_extension))

    args = ["unzip", "-q", archive_file, "-d", dest_directory]

    try:
        run_command_in_foreground(args)
    except McRunCommandInForegroundException as ex:
        raise McExtractZipToDirectoryException("Error while extracting archive '%s': %s" % (archive_file, str(ex)))

def resolve_absolute_path_under_mc_root(path: str, must_exist: bool = False) -> str:
    """Return absolute path to object (file or directory) under Media Cloud root."""
    path = decode_string_from_bytes_if_needed(path)
    mc_root = mc_root_path()
    dist_path = os.path.join(mc_root, path)
    if must_exist:
        if not os.path.exists(dist_path):
            raise Exception("Object '%s' at path '%s' does not exist." % (path, dist_path))
    return os.path.abspath(dist_path)

def unlock_file(path: str) -> None:
    """Remove lock file."""
    # FIXME probably not thread-safe
    path = decode_string_from_bytes_if_needed(path)
    l.debug("Removing lock file '%s'..." % path)
    if not os.path.isfile(path):
        raise McUnlockFileException("Lock file '%s' does not exist." % path)
    os.unlink(path)
    l.debug("Removed lock file '%s'." % path)

def get_url_host(url: str) -> str:
    """Return hostname of a URL."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        raise GetURLHostException("URL is None")
    if len(url) == 0:
        raise GetURLHostException("URL is empty")

    url = fix_common_url_mistakes(url)
    uri = urlparse(url)
    return uri.hostname

def link_canonical_url_from_html(html: str, base_url: str = None) -> Optional[str]:
    """From the provided HTML, determine the <link rel="canonical" /> URL (if any)."""
    html = decode_string_from_bytes_if_needed(html)
    base_url = decode_string_from_bytes_if_needed(base_url)

    link_elements = re.findall(r'(<\s*?link.+?>)', html, re.I)
    for link_element in link_elements:
        if re.search(r'rel\s*?=\s*?["\']\s*?canonical\s*?["\']', link_element, re.I):
            match = re.search(r'href\s*?=\s*?["\'](.+?)["\']', link_element, re.I)
            if match:
                url = match.group(1)
                if not re.search(__URL_REGEX, url):
                    # Maybe it's an absolute path?
                    if base_url is not None:
                        return urljoin(base=base_url, url=url)
                    else:
                        l.debug(
                            "HTML <link rel=\"canonical\"/> found, "
                            "but the new URL '%s' doesn't seem to be valid." % url)
                else:
                    # Looks like a URL, so return it
                    return url

    return None

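# Illustrative usage; a relative href is resolved against base_url when one is
# provided (note that urljoin() also drops the query string of the base):
#
#     link_canonical_url_from_html(
#         '<link rel="canonical" href="/page" />',
#         base_url='http://example.com/page?sessid=1',
#     )
#     # -> 'http://example.com/page'
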
def mkdir_p(path: str) -> None:
    """mkdir -p"""
    path = decode_string_from_bytes_if_needed(path)
    l.debug("Creating directory '%s'..." % path)
    try:
        os.makedirs(path)
    except OSError as e:  # Python >2.5
        if e.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
    l.debug("Created directory '%s'." % path)

def download_file(source_url: str, target_path: str) -> None:
    """Download URL to path."""
    source_url = decode_string_from_bytes_if_needed(source_url)
    target_path = decode_string_from_bytes_if_needed(target_path)

    args = ["curl",
            "--silent",
            "--show-error",
            "--fail",
            "--retry", "3",
            "--retry-delay", "5",
            "--output", target_path,
            source_url]

    try:
        run_command_in_foreground(args)
    except McRunCommandInForegroundException as ex:
        raise McDownloadFileException(
            "Error while downloading file from '%(source_url)s' to '%(target_path)s': %(exception)s" % {
                'source_url': source_url,
                'target_path': target_path,
                'exception': str(ex),
            })

def wait_for_tcp_port_to_open(port: int, hostname: str = 'localhost', retries: int = 60, delay: int = 1) -> bool:
    """Try connecting to TCP port until it opens (or not); return True if managed to connect."""
    hostname = decode_string_from_bytes_if_needed(hostname)
    port_is_open = False
    for retry in range(retries):
        if retry == 0:
            l.info("Trying to connect to %s:%d" % (hostname, port))
        else:
            l.info("Trying to connect to %s:%d, retry %d" % (hostname, port, retry))

        if tcp_port_is_open(port, hostname):
            port_is_open = True
            break
        else:
            time.sleep(delay)

    return port_is_open

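# Typical usage sketch (the port and message are hypothetical): block until a
# locally started service begins accepting connections, probing once per
# second for up to a minute (the defaults):
#
#     if not wait_for_tcp_port_to_open(port=5432, hostname='localhost'):
#         raise Exception("Service didn't start listening in 60 seconds")
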
def get_url_distinctive_domain(url: str) -> str:
    """Return a truncated form of URL's host (domain) that distinguishes it from others, e.g.:

    * www.whitehouse.gov => whitehouse.gov
    * www.blogspot.com => blogspot.com
    * kardashian.blogspot.com => kardashian.blogspot.com

    Return original URL if unable to process the URL."""
    try:
        url = decode_string_from_bytes_if_needed(url)
        url = fix_common_url_mistakes(url)

        host = get_url_host(url)
        if host is None:
            return url

        name_parts = host.split('.')
        n = len(name_parts) - 1

        if re.search(r'\.(gov|org|com?)\...$', host, re.I):
            # foo.co.uk -> foo.co.uk instead of co.uk
            parts = [str(name_parts[n - 2]), str(name_parts[n - 1]), str(name_parts[n])]
            domain = '.'.join(parts)
        elif re.search(r'\.(edu|gov)$', host, re.I):
            parts = [str(name_parts[n - 2]), str(name_parts[n - 1])]
            domain = '.'.join(parts)
        elif re.search(
            r'go\.com|wordpress\.com|blogspot|livejournal\.com|privet\.ru|wikia\.com|feedburner\.com'
            r'|24open\.ru|patch\.com|tumblr\.com', host, re.I
        ):
            # Identify sites in these domains by the whole host name (abcnews.go.com instead of go.com)
            domain = host
        else:
            parts = [str(name_parts[n - 1] or ''), str(name_parts[n] or '')]
            domain = '.'.join(parts)

        return domain.lower()

    except Exception as ex:
        l.debug("get_url_distinctive_domain falling back to url: " + str(ex))
        return url.lower()

def normalize_url_lossy(url: str) -> Optional[str]:
    """Do some simple transformations on a URL to make it match other equivalent URLs as well as possible.

    Normalization is "lossy" (it makes the whole URL lowercase and, in some cases, removes subdomain
    parts such as "m.", "data.", "news.", ...)."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        return None
    if len(url) == 0:
        return None

    url = fix_common_url_mistakes(url)
    url = url.lower()

    # r2.ly redirects through the hostname, ala http://543.r2.ly
    if 'r2.ly' not in url:
        # NOTE: the original passed re.I as the positional "count" argument of
        # re.sub(), which silently capped the substitution count; it has to be
        # passed as flags=
        url = re.sub(
            r'^(https?://)(m|beta|media|data|image|www?|cdn|topic|article|news|archive|blog|video|search|preview|'
            r'shop|sports?|act|donate|press|web|photos?|\d+?).?\.(.*\.)',
            r"\1\3", url, flags=re.I)

    # Collapse the vast array of http://pronkraymond83483.podomatic.com/ URLs
    # into http://pronkpops.podomatic.com/
    url = re.sub(r'http://.*pron.*\.podomatic\.com', 'http://pronkpops.podomatic.com', url)

    # Get rid of anchor text
    url = re.sub(r'#.*', '', url)

    # Get rid of multiple slashes in a row
    url = re.sub(r'(//.*/)/+', r"\1", url)

    url = re.sub(r'^https:', 'http:', url)

    # __canonical_url() might raise an encoding error if the URL is invalid;
    # just skip the canonicalization step in that case
    try:
        url = __canonical_url(url)
    except Exception:
        pass

    # Add trailing slash
    if re.search(r'https?://[^/]*$', url):
        url += '/'

    return url

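# Illustrative before/after for the lossy rules above (lowercasing, subdomain
# stripping, scheme downgrade, fragment removal); the exact output depends on
# what __canonical_url() does afterwards:
#
#     normalize_url_lossy('https://M.Example.com/page#comments')
#     # -> 'http://example.com/page' (approximately)
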
def extract_article_from_html(html: str) -> str:
    """Extract article HTML from a full HTML file."""
    # FIXME move HTML stripping here too
    html = decode_string_from_bytes_if_needed(html)
    if html is None or html == '':
        return ''

    try:
        doc = readability.readability.Document(html)
        doc_title = doc.short_title().strip()
        doc_summary = doc.summary().strip()
        extracted_text = "%s\n\n%s" % (doc_title, doc_summary)
    except Exception as ex:
        l.error('Exception raised while extracting HTML: %s' % str(ex))
        extracted_text = ''

    return extracted_text

def fix_common_url_mistakes(url: str) -> Optional[str]:
    """Fixes common URL mistakes (mistypes, etc.)."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        return None

    # Fix broken URLs that look like this: http://http://www.al-monitor.com/pulse
    url = re.sub(r'(https?://)https?:?//', r"\1", url, flags=re.I)

    # Fix URLs with only one slash after "http" ("http:/www.")
    url = re.sub(r'(https?:/)(www)', r"\1/\2", url, flags=re.I)

    # Replace backslashes with forward slashes
    url = re.sub(r'\\', r'/', url)

    # http://newsmachete.com?page=2 -> http://newsmachete.com/?page=2
    url = re.sub(r'(https?://[^/]+)\?', r"\1/?", url)

    return url

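# Illustrative before/after pairs, one per rule above:
#
#     'http://http://www.al-monitor.com/pulse' -> 'http://www.al-monitor.com/pulse'
#     'http:/www.example.com/'                 -> 'http://www.example.com/'
#     'http://example.com\a\b'                 -> 'http://example.com/a/b'
#     'http://newsmachete.com?page=2'          -> 'http://newsmachete.com/?page=2'
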
def http_urls_in_string(string: str) -> list:
    """Extract http(s):// URLs from a string.

    Returns a list of unique URLs found in the string; raises HTTPURLsInStringException on error."""
    string = decode_string_from_bytes_if_needed(string)
    if string is None:
        raise HTTPURLsInStringException("String is None")
    if len(string) == 0:
        raise HTTPURLsInStringException("String is empty")

    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', string, re.I)

    http_urls = []
    for url in urls:
        if is_http_url(url):
            http_urls.append(url)

    # Unique URLs
    http_urls = list(set(http_urls))

    return http_urls

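# Example (illustrative): duplicates are collapsed via set(), so the order of
# the returned list is not guaranteed:
#
#     http_urls_in_string('See http://example.com and http://example.com plus https://foo.bar/baz')
#     # -> ['http://example.com', 'https://foo.bar/baz'] (in some order)
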
def is_homepage_url(url: str) -> bool:
    """Returns true if URL is a homepage (e.g. http://www.wired.com/) and not a child page
    (e.g. http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/)."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        l.debug("URL is None.")
        return False
    if len(url) == 0:
        l.debug("URL is empty.")
        return False
    if not is_http_url(url):
        l.debug("URL '%s' is invalid." % url)
        return False

    # Remove cruft from the URL first
    try:
        url = normalize_url(url)
    except NormalizeURLException as ex:
        l.debug("Unable to normalize URL '%s' before checking if it's a homepage: %s" % (url, ex))
        return False

    # The shortened URL may lead to a homepage URL, but the shortened URL
    # itself is not a homepage URL
    if is_shortened_url(url):
        return False

    # If we still have something for a query of the URL after the
    # normalization, always assume that the URL is *not* a homepage
    scheme, netloc, uri_path, query_string, fragment = urlsplit(url)
    if len(query_string) > 0:
        return False

    for homepage_url_path_regex in __HOMEPAGE_URL_PATH_REGEXES:
        if re.search(homepage_url_path_regex, uri_path):
            return True

    return False

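# Illustrative behavior (assuming __HOMEPAGE_URL_PATH_REGEXES matches a bare
# "/" and similar root-ish paths):
#
#     is_homepage_url('http://www.wired.com/')                 # -> True
#     is_homepage_url('http://m.wired.com/threatlevel/2011/')  # -> False
#     is_homepage_url('http://example.com/?p=123')             # -> False (query string survives normalization)
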
def is_http_url(url: str) -> bool:
    """Returns true if URL is in the "http" ("https") scheme."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        l.debug("URL is None")
        return False
    if len(url) == 0:
        l.debug("URL is empty")
        return False
    if not re.search(__URL_REGEX, url):
        l.debug("URL '%s' does not match URL's regexp" % url)
        return False

    uri = urlparse(url)

    if not uri.scheme:
        l.debug("Scheme is undefined for URL %s" % url)
        return False
    if not uri.scheme.lower() in ['http', 'https']:
        l.debug("Scheme is not HTTP(s) for URL %s" % url)
        return False

    return True

def lock_file(path: str, timeout: int = None) -> None:
    """Create lock file."""
    # FIXME probably not thread-safe
    path = decode_string_from_bytes_if_needed(path)
    start_time = time.time()
    l.debug("Creating lock file '%s'..." % path)
    while True:
        try:
            # Close the descriptor right away; only the file's existence matters
            fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_RDWR)
            os.close(fd)
            break
        except OSError as e:
            if e.errno == errno.EEXIST:
                if timeout is not None:
                    if (time.time() - start_time) >= timeout:
                        raise McLockFileException(
                            "Unable to create lock file '%s' in %d seconds." % (path, timeout))
                l.info("Lock file '%s' already exists, will retry shortly." % path)
                time.sleep(1)
            else:
                # Some other I/O error
                raise
    l.debug("Created lock file '%s'" % path)

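# Usage sketch (the path is hypothetical): lock_file() pairs with
# unlock_file() defined above, bracketing a non-reentrant critical section:
#
#     lock_file('/var/tmp/import.lock', timeout=10)
#     try:
#         pass  # ... do the non-reentrant work here ...
#     finally:
#         unlock_file('/var/tmp/import.lock')
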
def get_epoch_from_sql_date(date: str) -> int:
    """Given a date in the SQL format 'YYYY-MM-DD', return the epoch time."""
    date = decode_string_from_bytes_if_needed(date)
    parsed_date = dateutil.parser.parse(date)
    return int(parsed_date.timestamp())

def py_hausa_stem(token):
    """Used by Perl code to do Hausa stemming."""
    # FIXME MC_REWRITE_TO_PYTHON: simplify after rewriting language module to Python.
    token = decode_string_from_bytes_if_needed(token)
    return hausastemmer.stem(token)

def normalize_url(url: str) -> str:
    """Normalize URL:

    * Fix common mistypes, e.g. "http://http://..."
    * Run URL through normalization, i.e. standardize URL's scheme and hostname case, remove default port,
      uppercase all escape sequences, un-escape octets that can be represented as plain characters, remove
      whitespace before / after the URL string
    * Remove #fragment
    * Remove various ad tracking query parameters, e.g. "utm_source", "utm_medium", "PHPSESSID", etc.

    Return normalized URL on success; raise on error."""
    url = decode_string_from_bytes_if_needed(url)
    if url is None:
        raise NormalizeURLException("URL is None")
    if len(url) == 0:
        raise NormalizeURLException("URL is empty")

    url = fix_common_url_mistakes(url)
    url = __canonical_url(url)

    if not is_http_url(url):
        raise NormalizeURLException("URL is not valid")

    scheme, netloc, path, query_string, fragment = urlsplit(url)
    query = parse_qs(query_string, keep_blank_values=True)

    # Remove #fragment
    fragment = ''

    parameters_to_remove = []

    # Facebook parameters (https://developers.facebook.com/docs/games/canvas/referral-tracking)
    parameters_to_remove += [
        'fb_action_ids',
        'fb_action_types',
        'fb_source',
        'fb_ref',
        'action_object_map',
        'action_type_map',
        'action_ref_map',
        'fsrc_fb_noscript',
    ]

    # metrika.yandex.ru parameters
    parameters_to_remove += [
        'yclid',
        '_openstat',
    ]

    if 'facebook.com' in netloc.lower():
        # Additional parameters specifically for the facebook.com host
        parameters_to_remove += [
            'ref',
            'fref',
            'hc_location',
        ]

    if 'nytimes.com' in netloc.lower():
        # Additional parameters specifically for the nytimes.com host
        parameters_to_remove += [
            'emc',
            'partner',
            '_r',
            'hp',
            'inline',
            'smid',
            'WT.z_sma',
            'bicmp',
            'bicmlukp',
            'bicmst',
            'bicmet',
            'abt',
            'abg',
        ]

    if 'livejournal.com' in netloc.lower():
        # Additional parameters specifically for the livejournal.com host
        parameters_to_remove += [
            'thread',
            'nojs',
        ]

    if 'google.' in netloc.lower():
        # Additional parameters specifically for the google.[com,lt,...] host
        parameters_to_remove += [
            'gws_rd',
            'ei',
        ]

    # Some other parameters (common for tracking session IDs, advertising, etc.)
    parameters_to_remove += [
        'PHPSESSID',
        'PHPSESSIONID',
        'cid',
        's_cid',
        'sid',
        'ncid',
        'ir',
        'ref',
        'oref',
        'eref',
        'ns_mchannel',
        'ns_campaign',
        'ITO',
        'wprss',
        'custom_click',
        'source',
        'feedName',
        'feedType',
        'skipmobile',
        'skip_mobile',
        'altcast_code',
    ]

    # Make the sorting default (e.g. on Reddit)
    parameters_to_remove += ['sort']

    # Some Australian websites append the "nk" parameter with a tracking hash
    if 'nk' in query:
        for nk_value in query['nk']:
            if re.search(r'^[0-9a-fA-F]+$', nk_value, re.I):
                parameters_to_remove += ['nk']
                break

    # Delete the "empty" parameter (e.g. in http://www-nc.nytimes.com/2011/06/29/us/politics/29marriage.html?=_r%3D6)
    parameters_to_remove += ['']

    # Remove cruft parameters
    for parameter in parameters_to_remove:
        if ' ' in parameter:
            l.warn('Invalid cruft parameter "%s"' % parameter)
        query.pop(parameter, None)

    for name in list(query.keys()):  # copy of the key list so we can delete while iterating
        # Remove parameters that start with '_' (e.g. '_cid') because they're
        # more likely to be the tracking codes
        if name.startswith('_'):
            query.pop(name)

        # Remove GA parameters, current and future (e.g. "utm_source",
        # "utm_medium", "ga_source", "ga_medium")
        # (https://support.google.com/analytics/answer/1033867?hl=en)
        if name.startswith('ga_') or name.startswith('utm_'):
            query.pop(name)

    url = urlunsplit((scheme, netloc, path, urlencode(query, doseq=True), fragment))

    # Remove empty values in the query string, e.g. http://bash.org/?244321=
    url = url.replace('=&', '&')
    url = re.sub(r'=$', '', url)

    return url
