def run_solr_shard(shard_num: int,
                   shard_count: int,
                   hostname: str = None,
                   starting_port: int = MC_SOLR_CLUSTER_STARTING_PORT,
                   base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                   dist_directory: str = MC_DIST_DIR,
                   solr_version: str = MC_SOLR_VERSION,
                   zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
                   zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
                   jvm_heap_size: str = MC_SOLR_CLUSTER_JVM_HEAP_SIZE) -> None:
    """Run Solr shard, install Solr if needed; read configuration from ZooKeeper.

    :param shard_num: 1-based shard number; used to derive the shard's port and data directory.
    :param shard_count: total number of shards in the cluster (passed to Solr as -DnumShards).
    :param hostname: hostname to advertise; defaults to this machine's FQDN.
    :raises McSolrRunException: on invalid arguments or if ZooKeeper doesn't come up.
    """
    if shard_num < 1:
        raise McSolrRunException("Shard number must be 1 or greater.")
    if shard_count < 1:
        raise McSolrRunException("Shard count must be 1 or greater.")

    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        log.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    if hostname is None:
        hostname = fqdn()

    base_data_dir = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    shard_port = __shard_port(shard_num=shard_num, starting_port=starting_port)
    shard_data_dir = __shard_data_dir(shard_num=shard_num, base_data_dir=base_data_dir)

    log.info("Waiting for ZooKeeper to start on %s:%d..." % (zookeeper_host, zookeeper_port))
    # BUG FIX: the original ignored the boolean returned by wait_for_tcp_port_to_open()
    # and would happily start the shard against a ZooKeeper that never came up
    # (run_zookeeper() checks this same return value and raises).
    zookeeper_up = wait_for_tcp_port_to_open(
        hostname=zookeeper_host,
        port=zookeeper_port,
        retries=MC_SOLR_CLUSTER_ZOOKEEPER_CONNECT_RETRIES,
    )
    if not zookeeper_up:
        raise McSolrRunException(
            "Unable to connect to ZooKeeper at %s:%d" % (zookeeper_host, zookeeper_port))
    log.info("ZooKeeper is up!")

    log.info("Starting Solr shard %d on port %d..." % (shard_num, shard_port))
    # noinspection SpellCheckingInspection
    shard_args = [
        "-DzkHost=%s:%d" % (zookeeper_host, zookeeper_port),
        "-DnumShards=%d" % shard_count,
    ]

    __run_solr(hostname=hostname,
               port=shard_port,
               instance_data_dir=shard_data_dir,
               jvm_heap_size=jvm_heap_size,
               jvm_opts=MC_SOLR_CLUSTER_JVM_OPTS,
               start_jar_args=shard_args,
               connect_timeout=MC_SOLR_CLUSTER_CONNECT_RETRIES,
               dist_directory=dist_directory,
               solr_version=solr_version)
def test_wait_for_tcp_port_to_open():
    """The probe reports False while a port is closed, True once a listener binds, False again after close."""
    probe_port = random_unused_port()

    # Nothing is listening yet
    assert wait_for_tcp_port_to_open(port=probe_port, retries=2) is False

    # Bind a listener and probe again
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind(('localhost', probe_port))
    listener.listen()
    assert wait_for_tcp_port_to_open(port=probe_port, retries=2) is True

    # Tear the listener down; the port should read as closed once more
    listener.close()
    assert wait_for_tcp_port_to_open(port=probe_port, retries=2) is False
def run_solr_shard(shard_num: int,
                   shard_count: int,
                   hostname: str = None,
                   starting_port: int = MC_SOLR_CLUSTER_STARTING_PORT,
                   base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                   dist_directory: str = MC_DIST_DIR,
                   solr_version: str = MC_SOLR_VERSION,
                   zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
                   zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
                   jvm_heap_size: str = MC_SOLR_CLUSTER_JVM_HEAP_SIZE) -> None:
    """Run Solr shard, install Solr if needed; read configuration from ZooKeeper.

    :param shard_num: 1-based shard number; used to derive the shard's port and data directory.
    :param shard_count: total number of shards in the cluster (passed to Solr as -DnumShards).
    :param hostname: hostname to advertise; defaults to this machine's FQDN.
    :raises McSolrRunException: on invalid arguments or if ZooKeeper doesn't come up.
    """
    if shard_num < 1:
        raise McSolrRunException("Shard number must be 1 or greater.")
    if shard_count < 1:
        raise McSolrRunException("Shard count must be 1 or greater.")

    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        log.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    if hostname is None:
        hostname = fqdn()

    base_data_dir = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    shard_port = __shard_port(shard_num=shard_num, starting_port=starting_port)
    shard_data_dir = __shard_data_dir(shard_num=shard_num, base_data_dir=base_data_dir)

    log.info("Waiting for ZooKeeper to start on %s:%d..." % (zookeeper_host, zookeeper_port))
    # BUG FIX: the return value of wait_for_tcp_port_to_open() was previously discarded,
    # so a ZooKeeper that never opened its port went unnoticed; fail loudly instead
    # (mirrors the check done in run_zookeeper()).
    if not wait_for_tcp_port_to_open(hostname=zookeeper_host,
                                     port=zookeeper_port,
                                     retries=MC_SOLR_CLUSTER_ZOOKEEPER_CONNECT_RETRIES):
        raise McSolrRunException(
            "Unable to connect to ZooKeeper at %s:%d" % (zookeeper_host, zookeeper_port))
    log.info("ZooKeeper is up!")

    log.info("Starting Solr shard %d on port %d..." % (shard_num, shard_port))
    # noinspection SpellCheckingInspection
    shard_args = [
        "-DzkHost=%s:%d" % (zookeeper_host, zookeeper_port),
        "-DnumShards=%d" % shard_count,
    ]

    __run_solr(hostname=hostname,
               port=shard_port,
               instance_data_dir=shard_data_dir,
               jvm_heap_size=jvm_heap_size,
               jvm_opts=MC_SOLR_CLUSTER_JVM_OPTS,
               start_jar_args=shard_args,
               connect_timeout=MC_SOLR_CLUSTER_CONNECT_RETRIES,
               dist_directory=dist_directory,
               solr_version=solr_version)
def start(self, delay: int = 0):
    """Start the webserver.

    Arguments:
    delay - number of seconds to delay before starting server
    """
    if tcp_port_is_open(port=self.__port):
        raise McHashServerException("Port %d is already open." % self.__port)

    log.debug('Starting test web server %s:%d' % (self.__host, self.__port,))
    log.debug('Pages: %s' % str(self.__pages))

    # "threading.Thread()" doesn't work with Perl callers
    server_args = (
        self.__host,
        self.__port,
        self.__pages,
        self.__http_server_active_pids,
        self.__http_server_active_pids_lock,
        delay,
    )
    self.__http_server_thread = multiprocessing.Process(
        target=self.__start_http_server,
        args=server_args,
    )
    self.__http_server_thread.daemon = True
    self.__http_server_thread.start()

    # Only verify the port when the server is supposed to be up immediately
    if delay == 0:
        port_opened = wait_for_tcp_port_to_open(port=self.__port, retries=20, delay=0.1)
        if not port_opened:
            raise McHashServerException("Port %d is not open." % self.__port)
def start(self, delay: int = 0):
    """Start the webserver.

    Arguments:
    delay - number of seconds to delay before starting server
    """
    if tcp_port_is_open(port=self.__port):
        raise McHashServerException("Port %d is already open." % self.__port)

    log.info('Starting test web server %s:%d' % (self.__host, self.__port,))
    log.debug('Pages: %s' % str(self.__pages))

    # "threading.Thread()" doesn't work with Perl callers
    self.__http_server_thread = multiprocessing.Process(
        target=self.__start_http_server,
        args=(self.__host,
              self.__port,
              self.__pages,
              self.__http_server_active_pids,
              self.__http_server_active_pids_lock,
              delay),
    )
    self.__http_server_thread.daemon = True
    self.__http_server_thread.start()

    # With no startup delay the port must be reachable right away
    if delay == 0 and not wait_for_tcp_port_to_open(port=self.__port, retries=20, delay=0.1):
        raise McHashServerException("Port %d is not open." % self.__port)
def workflow_client(namespace: str = 'default',
                    host: str = 'temporal-server',
                    port: int = 7233) -> WorkflowClient:
    """
    Connect to Temporal server and return its client.

    :param namespace: Namespace to connect to.
    :param host: Temporal server hostname (previously hard-coded; default preserves old behavior).
    :param port: Temporal server frontend port (previously hard-coded; default preserves old behavior).
    :return: WorkflowClient instance.
    """
    # It's super lame to wait for this port to open, but the Python SDK seems to fail otherwise
    wait_for_tcp_port_to_open(hostname=host, port=port)

    client = WorkflowClient.new_client(host=host, port=port, namespace=namespace)

    return client
def __run_solr(port: int,
               instance_data_dir: str,
               hostname: str = None,
               jvm_heap_size: str = None,
               start_jar_args: List[str] = None,
               jvm_opts: List[str] = None,
               connect_timeout: int = 120,
               dist_directory: str = MC_DIST_DIR,
               solr_version: str = MC_SOLR_VERSION) -> None:
    """Run Solr instance; blocks forever once Solr is running.

    Installs Solr if needed, lays out the instance data directory (collection
    configurations, core.properties, Jetty/Solr symlinks), then spawns the JVM
    and waits for its port to open.

    :param port: TCP port for Jetty / Solr to listen on.
    :param instance_data_dir: per-instance data directory (created if missing).
    :param hostname: hostname to advertise; defaults to this machine's FQDN.
    :param connect_timeout: retries while waiting for Solr's port to open.
    :raises McSolrRunException: on any missing file / directory or busy port.
    """
    if jvm_opts is None:
        jvm_opts = MC_SOLR_STANDALONE_JVM_OPTS

    if start_jar_args is None:
        start_jar_args = []

    # BUG FIX: the original declared "hostname: str = fqdn()", which resolved the
    # FQDN once at module import time; resolve it at call time instead.
    if hostname is None:
        hostname = fqdn()

    # BUG FIX: pass dist_directory / solr_version through; the original called
    # __solr_is_installed() / __install_solr() with defaults even when custom
    # values were supplied (run_solr_shard() passes them explicitly).
    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        l.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    solr_home_dir = __solr_home_path(solr_home_dir=MC_SOLR_HOME_DIR)
    if not os.path.isdir(solr_home_dir):
        raise McSolrRunException("Solr home directory '%s' does not exist." % solr_home_dir)

    solr_path = __solr_path(dist_directory=dist_directory, solr_version=solr_version)

    if not os.path.isdir(instance_data_dir):
        l.info("Creating data directory at %s..." % instance_data_dir)
        mkdir_p(instance_data_dir)

    l.info("Updating collections at %s..." % instance_data_dir)
    collections = __collections(solr_home_dir=solr_home_dir)
    for collection_name, collection_path in sorted(collections.items()):
        l.info("Updating collection '%s'..." % collection_name)

        collection_conf_src_dir = os.path.join(collection_path, "conf")
        if not os.path.isdir(collection_conf_src_dir):
            raise McSolrRunException("Configuration for collection '%s' at %s does not exist" % (
                collection_name, collection_conf_src_dir))

        collection_dst_dir = os.path.join(instance_data_dir, collection_name)
        mkdir_p(collection_dst_dir)

        # Remove and copy configuration in case it has changed
        # (don't symlink because Solr 5.5+ doesn't like those)
        collection_conf_dst_dir = os.path.join(collection_dst_dir, "conf")
        if os.path.lexists(collection_conf_dst_dir):
            l.debug("Removing old collection configuration in '%s'..." % collection_conf_dst_dir)
            if os.path.islink(collection_conf_dst_dir):
                # Might still be a link from older Solr versions
                os.unlink(collection_conf_dst_dir)
            else:
                shutil.rmtree(collection_conf_dst_dir)

        l.info("Copying '%s' to '%s'..." % (collection_conf_src_dir, collection_conf_dst_dir))
        shutil.copytree(collection_conf_src_dir, collection_conf_dst_dir, symlinks=False)

        l.info("Updating core.properties for collection '%s'..." % collection_name)
        core_properties_path = os.path.join(collection_dst_dir, "core.properties")
        with open(core_properties_path, 'w') as core_properties_file:
            core_properties_file.write("""
#
# This file is autogenerated. Don't bother editing it!
#

name=%(collection_name)s
instanceDir=%(instance_dir)s
""" % {
                "collection_name": collection_name,
                "instance_dir": collection_dst_dir,
            })

    l.info("Symlinking shard configuration...")
    config_items_to_symlink = [
        "contexts",
        "etc",
        "modules",
        "resources",
        "solr.xml",
    ]
    for config_item in config_items_to_symlink:
        config_item_src_path = os.path.join(solr_home_dir, config_item)
        if not os.path.exists(config_item_src_path):
            raise McSolrRunException(
                "Expected configuration item '%s' does not exist" % config_item_src_path)

        # Recreate symlink just in case
        config_item_dst_path = os.path.join(instance_data_dir, config_item)
        if os.path.lexists(config_item_dst_path):
            if not os.path.islink(config_item_dst_path):
                raise McSolrRunException(
                    "Configuration item '%s' exists but is not a symlink." % config_item_dst_path)
            os.unlink(config_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." % (config_item_src_path, config_item_dst_path))
        relative_symlink(config_item_src_path, config_item_dst_path)

    jetty_home_path = __jetty_home_path(dist_directory=dist_directory, solr_version=solr_version)

    l.info("Symlinking libraries and JARs...")
    # BUG FIX: "solr-webapp" was listed twice in the original; the second pass just
    # unlinked and re-created the same symlink.
    library_items_to_symlink = [
        "lib",
        "solr-webapp",
        "start.jar",
        "solr",
    ]
    for library_item in library_items_to_symlink:
        library_item_src_path = os.path.join(jetty_home_path, library_item)
        if not os.path.exists(library_item_src_path):
            raise McSolrRunException(
                "Expected library item '%s' does not exist" % library_item_src_path)

        # Recreate symlink just in case
        library_item_dst_path = os.path.join(instance_data_dir, library_item)
        if os.path.lexists(library_item_dst_path):
            if not os.path.islink(library_item_dst_path):
                raise McSolrRunException(
                    "Library item '%s' exists but is not a symlink." % library_item_dst_path)
            os.unlink(library_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." % (library_item_src_path, library_item_dst_path))
        relative_symlink(library_item_src_path, library_item_dst_path)

    log4j_properties_path = os.path.join(solr_home_dir, "resources", "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # BUG FIX: the original raised with an unfilled '%s' placeholder
        raise McSolrRunException("log4j.properties at '%s' was not found." % log4j_properties_path)

    start_jar_path = os.path.join(jetty_home_path, "start.jar")
    if not os.path.isfile(start_jar_path):
        raise McSolrRunException("start.jar at '%s' was not found." % start_jar_path)

    solr_webapp_path = os.path.abspath(os.path.join(jetty_home_path, "solr-webapp"))
    if not os.path.isdir(solr_webapp_path):
        raise McSolrRunException("Solr webapp dir at '%s' was not found." % solr_webapp_path)

    if not hostname_resolves(hostname):
        raise McSolrRunException("Hostname '%s' does not resolve." % hostname)

    if tcp_port_is_open(port=port):
        raise McSolrRunException("Port %d is already open on this machine." % port)

    __raise_if_old_shards_exist()

    args = ["java"]
    l.info("Starting Solr instance on %s, port %d..." % (hostname, port))

    if jvm_heap_size is not None:
        args += ["-Xmx%s" % jvm_heap_size]
    args += jvm_opts
    # noinspection SpellCheckingInspection
    args += [
        "-server",
        "-Djava.util.logging.config.file=file://" + os.path.abspath(log4j_properties_path),
        "-Djetty.base=%s" % instance_data_dir,
        "-Djetty.home=%s" % instance_data_dir,
        "-Djetty.port=%d" % port,
        "-Dsolr.solr.home=%s" % instance_data_dir,
        "-Dsolr.data.dir=%s" % instance_data_dir,
        "-Dhost=%s" % hostname,
        "-Dmediacloud.luceneMatchVersion=%s" % MC_SOLR_LUCENEMATCHVERSION,

        # write heap dump to data directory on OOM errors
        "-XX:+HeapDumpOnOutOfMemoryError",
        "-XX:HeapDumpPath=%s" % instance_data_dir,

        # needed for resolving paths to JARs in solrconfig.xml
        "-Dmediacloud.solr_dist_dir=%s" % solr_path,
        "-Dmediacloud.solr_webapp_dir=%s" % solr_webapp_path,
    ]
    args += start_jar_args
    args += [
        "-jar", start_jar_path,
        "--module=http",
    ]

    l.debug("Running command: %s" % ' '.join(args))
    process = subprocess.Popen(args)
    global __solr_pid
    __solr_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    signal.signal(signal.SIGTERM, __kill_solr_process)  # SIGTERM is handled differently for whatever reason
    atexit.register(__kill_solr_process)

    l.info("Solr PID: %d" % __solr_pid)

    l.info("Solr is starting on port %d, will be available shortly..." % port)
    wait_for_tcp_port_to_open(port=port, retries=connect_timeout)

    l.info("Solr is running on port %d!" % port)
    while True:
        time.sleep(1)
def __annotate_text(self, text: str) -> Union[dict, list]:
    """Fetch JSON annotation for text, decode it into dictionary / list.

    Calls fatal_error() (process abort) for programming / service-level errors;
    raises McJSONAnnotationFetcherException for per-story errors the caller may
    catch and record.
    """
    text = decode_object_from_bytes_if_needed(text)

    if text is None:
        fatal_error("Text is None.")

    if len(text) == 0:
        # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
        raise McJSONAnnotationFetcherException("Text is empty.")

    log.info("Annotating %d characters of text..." % len(text))

    # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
    # without making a request to the annotator at all
    text = text.strip()

    if self.__TEXT_LENGTH_LIMIT > 0:
        text_length = len(text)
        if text_length > self.__TEXT_LENGTH_LIMIT:
            log.warning(
                "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it." % (
                    text_length,
                    self.__TEXT_LENGTH_LIMIT,
                ))
            text = text[:self.__TEXT_LENGTH_LIMIT]

    # Make a request
    ua = UserAgent()
    ua.set_timing([1, 2, 4, 8])
    ua.set_timeout(self.__HTTP_TIMEOUT)
    ua.set_max_size(None)

    request = None
    try:
        request = self._request_for_text(text=text)
        if request is None:
            raise McJSONAnnotationFetcherException("Returned request is None.")
    except Exception as ex:
        # Assume that this is some sort of a programming error too
        fatal_error("Unable to create annotator request for text '%s': %s" % (text, str(ex),))

    # Wait for the service's HTTP port to become open as the service might be
    # still starting up somewhere
    uri = furl(request.url())
    hostname = str(uri.host)
    port = int(uri.port)
    # BUG FIX: these messages referenced an undefined name "url" (NameError at the
    # exact moment the error path fired); use request.url() instead.
    assert hostname, f"URL hostname is not set for URL {request.url()}"
    assert port, f"API URL port is not set for URL {request.url()}"

    if not wait_for_tcp_port_to_open(
            port=port,
            hostname=hostname,
            retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever.
        fatal_error(
            "Annotator service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=request.url(),
                timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
            ))

    log.debug("Sending request to %s..." % request.url())
    response = ua.request(request)
    log.debug("Response received.")

    # Force UTF-8 encoding on the response because the server might not always
    # return correct "Content-Type"
    results_string = response.decoded_utf8_content()

    if not response.is_success():
        # Error; determine whether we should be blamed for making a malformed
        # request, or is it an extraction error
        log.warning("Request failed: %s" % response.decoded_content())

        if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
            # Raise on request timeouts without retrying anything because those usually mean that we posted
            # something funky to the annotator service and it got stuck
            raise McJSONAnnotationFetcherException(
                "The request timed out, giving up; text length: %d; text: %s" % (len(text), text,))

        if response.error_is_client_side():
            # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
            # unresponsive host, etc.)
            fatal_error("User agent error: %s: %s" % (response.status_line(), results_string,))

        else:
            # Error was generated by server
            http_status_code = response.code()

            if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                    or http_status_code == HTTPStatus.BAD_REQUEST.value:
                # Not POST, empty POST
                fatal_error('%s: %s' % (response.status_line(), results_string,))

            elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                # Processing error -- raise so that the error gets caught and logged into a database
                raise McJSONAnnotationFetcherException(
                    'Annotator service was unable to process the download: %s' % results_string)

            else:
                # Shutdown the extractor on unconfigured responses
                fatal_error('Unknown HTTP response: %s: %s' % (response.status_line(), results_string,))

    if results_string is None or len(results_string) == 0:
        raise McJSONAnnotationFetcherException("Annotator returned nothing for text: %s" % text)

    log.debug("Parsing response's JSON...")
    results = None
    try:
        results = decode_json(results_string)
        if results is None:
            raise McJSONAnnotationFetcherException("Returned JSON is None.")
    except Exception as ex:
        # If the JSON is invalid, it's probably something broken with the remote service, so that's why whe do
        # fatal_error() here
        fatal_error("Unable to parse JSON response: %s\nJSON string: %s" % (str(ex), results_string,))
    log.debug("Done parsing response's JSON.")

    response_is_valid = False
    try:
        response_is_valid = self._fetched_annotation_is_valid(results)
    except Exception as ex:
        fatal_error(
            "Unable to determine whether response is valid: %s\nJSON string: %s" % (str(ex), results_string))
    if not response_is_valid:
        fatal_error("Annotator response is invalid for JSON string: %s" % results_string)

    log.info("Done annotating %d characters of text." % len(text))

    return results
def run_zookeeper(dist_directory: str = MC_DIST_DIR,
                  listen: str = MC_ZOOKEEPER_LISTEN,
                  port: int = MC_ZOOKEEPER_PORT,
                  data_dir: str = MC_SOLR_BASE_DATA_DIR,
                  zookeeper_version: str = MC_ZOOKEEPER_VERSION,
                  solr_version: str = MC_SOLR_VERSION) -> None:
    """Run ZooKeeper, install if needed too.

    Writes zoo.cfg into the data directory, spawns zkServer.sh in the
    foreground, uploads initial Solr collection configurations, then blocks
    forever; the child is killed via __kill_zookeeper_process on SIGTERM/exit.

    :raises McZooKeeperRunException: on missing files, busy port, or if
        ZooKeeper doesn't come up.
    """
    if not __zookeeper_is_installed():
        log.info("ZooKeeper is not installed, installing...")
        __install_zookeeper()

    data_dir = resolve_absolute_path_under_mc_root(path=data_dir, must_exist=True)

    zookeeper_data_dir = os.path.join(data_dir, "mediacloud-cluster-zookeeper")
    if not os.path.isdir(zookeeper_data_dir):
        log.info("Creating data directory at %s..." % zookeeper_data_dir)
        mkdir_p(zookeeper_data_dir)

    if tcp_port_is_open(port=port):
        raise McZooKeeperRunException("Port %d is already open on this machine." % port)

    zookeeper_path = __zookeeper_path(dist_directory=dist_directory, zookeeper_version=zookeeper_version)

    zkserver_path = os.path.join(zookeeper_path, "bin", "zkServer.sh")
    if not os.path.isfile(zkserver_path):
        raise McZooKeeperRunException("zkServer.sh at '%s' was not found." % zkserver_path)

    log4j_properties_path = os.path.join(zookeeper_path, "conf", "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # BUG FIX: the original raised with an unfilled '%s' placeholder
        raise McZooKeeperRunException("log4j.properties at '%s' was not found." % log4j_properties_path)

    zoo_cnf_path = os.path.join(zookeeper_data_dir, "zoo.cfg")
    log.info("Creating zoo.cfg in '%s'..." % zoo_cnf_path)
    with open(zoo_cnf_path, 'w') as zoo_cnf:
        zoo_cnf.write("""
#
# This file is autogenerated. Please do not modify it!
#

clientPortAddress=%(listen)s
clientPort=%(port)d
dataDir=%(data_dir)s

# Must be between zkClientTimeout / 2 and zkClientTimeout / 20
tickTime=30000

initLimit=10
syncLimit=10
""" % {
            "listen": listen,
            "port": port,
            "data_dir": zookeeper_data_dir,
        })

    zookeeper_env = os.environ.copy()
    zookeeper_env["ZOOCFGDIR"] = zookeeper_data_dir  # Serves as configuration dir too
    zookeeper_env["ZOOCFG"] = "zoo.cfg"
    zookeeper_env["ZOO_LOG_DIR"] = zookeeper_data_dir
    zookeeper_env["SERVER_JVMFLAGS"] = "-Dlog4j.configuration=file://" + os.path.abspath(log4j_properties_path)

    args = [zkserver_path, "start-foreground"]

    log.info("Starting ZooKeeper on %s:%d..." % (listen, port))
    log.debug("Running command: %s" % str(args))
    log.debug("Environment variables: %s" % str(zookeeper_env))
    process = subprocess.Popen(args, env=zookeeper_env)

    global __zookeeper_pid
    __zookeeper_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    signal.signal(signal.SIGTERM, __kill_zookeeper_process)  # SIGTERM is handled differently for whatever reason
    atexit.register(__kill_zookeeper_process)

    log.info("ZooKeeper PID: %d" % __zookeeper_pid)

    log.info("Waiting for ZooKeeper to start at port %d..." % port)
    zookeeper_started = wait_for_tcp_port_to_open(port=port, retries=MC_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_started:
        raise McZooKeeperRunException("Unable to connect to ZooKeeper at port %d" % port)

    log.info("Uploading initial Solr collection configurations to ZooKeeper...")
    update_zookeeper_solr_configuration(zookeeper_host="localhost",
                                        zookeeper_port=port,
                                        dist_directory=dist_directory,
                                        solr_version=solr_version)

    log.info("ZooKeeper is ready on port %d!" % port)
    while True:
        time.sleep(1)
def run_zookeeper(dist_directory: str = MC_DIST_DIR,
                  listen: str = MC_ZOOKEEPER_LISTEN,
                  port: int = MC_ZOOKEEPER_PORT,
                  data_dir: str = MC_SOLR_BASE_DATA_DIR,
                  zookeeper_version: str = MC_ZOOKEEPER_VERSION,
                  solr_version: str = MC_SOLR_VERSION) -> None:
    """Run ZooKeeper, install if needed too.

    Writes zoo.cfg into the data directory, spawns zkServer.sh in the
    foreground, uploads initial Solr collection configurations, then blocks
    forever; the child is killed via __kill_zookeeper_process on SIGTERM/exit.

    :raises McZooKeeperRunException: on missing files, busy port, or if
        ZooKeeper doesn't come up.
    """
    if not __zookeeper_is_installed():
        log.info("ZooKeeper is not installed, installing...")
        __install_zookeeper()

    data_dir = resolve_absolute_path_under_mc_root(path=data_dir, must_exist=True)

    zookeeper_data_dir = os.path.join(data_dir, "mediacloud-cluster-zookeeper")
    if not os.path.isdir(zookeeper_data_dir):
        log.info("Creating data directory at %s..." % zookeeper_data_dir)
        mkdir_p(zookeeper_data_dir)

    if tcp_port_is_open(port=port):
        raise McZooKeeperRunException("Port %d is already open on this machine." % port)

    zookeeper_path = __zookeeper_path(dist_directory=dist_directory, zookeeper_version=zookeeper_version)

    zkserver_path = os.path.join(zookeeper_path, "bin", "zkServer.sh")
    if not os.path.isfile(zkserver_path):
        raise McZooKeeperRunException("zkServer.sh at '%s' was not found." % zkserver_path)

    log4j_properties_path = os.path.join(zookeeper_path, "conf", "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # BUG FIX: the original raised with an unfilled '%s' placeholder
        raise McZooKeeperRunException("log4j.properties at '%s' was not found." % log4j_properties_path)

    zoo_cnf_path = os.path.join(zookeeper_data_dir, "zoo.cfg")
    log.info("Creating zoo.cfg in '%s'..." % zoo_cnf_path)
    with open(zoo_cnf_path, 'w') as zoo_cnf:
        zoo_cnf.write("""
#
# This file is autogenerated. Please do not modify it!
#

clientPortAddress=%(listen)s
clientPort=%(port)d
dataDir=%(data_dir)s

# Must be between zkClientTimeout / 2 and zkClientTimeout / 20
tickTime=30000

initLimit=10
syncLimit=10
""" % {
            "listen": listen,
            "port": port,
            "data_dir": zookeeper_data_dir,
        })

    zookeeper_env = os.environ.copy()
    zookeeper_env["ZOOCFGDIR"] = zookeeper_data_dir  # Serves as configuration dir too
    zookeeper_env["ZOOCFG"] = "zoo.cfg"
    zookeeper_env["ZOO_LOG_DIR"] = zookeeper_data_dir
    zookeeper_env["SERVER_JVMFLAGS"] = "-Dlog4j.configuration=file://" + os.path.abspath(log4j_properties_path)

    args = [
        zkserver_path,
        "start-foreground"
    ]

    log.info("Starting ZooKeeper on %s:%d..." % (listen, port))
    log.debug("Running command: %s" % str(args))
    log.debug("Environment variables: %s" % str(zookeeper_env))
    process = subprocess.Popen(args, env=zookeeper_env)

    global __zookeeper_pid
    __zookeeper_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    signal.signal(signal.SIGTERM, __kill_zookeeper_process)  # SIGTERM is handled differently for whatever reason
    atexit.register(__kill_zookeeper_process)

    log.info("ZooKeeper PID: %d" % __zookeeper_pid)

    log.info("Waiting for ZooKeeper to start at port %d..." % port)
    zookeeper_started = wait_for_tcp_port_to_open(port=port, retries=MC_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_started:
        raise McZooKeeperRunException("Unable to connect to ZooKeeper at port %d" % port)

    log.info("Uploading initial Solr collection configurations to ZooKeeper...")
    update_zookeeper_solr_configuration(zookeeper_host="localhost",
                                        zookeeper_port=port,
                                        dist_directory=dist_directory,
                                        solr_version=solr_version)

    log.info("ZooKeeper is ready on port %d!" % port)
    while True:
        time.sleep(1)
def __annotate_text(self, text: str) -> Union[dict, list]:
    """Fetch JSON annotation for text, decode it into dictionary / list.

    Calls fatal_error() (process abort) for programming / service-level errors;
    raises McJSONAnnotationFetcherException for per-story errors the caller may
    catch and record.
    """
    text = decode_object_from_bytes_if_needed(text)

    if text is None:
        fatal_error("Text is None.")

    if len(text) == 0:
        # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
        raise McJSONAnnotationFetcherException("Text is empty.")

    log.info(f"Annotating {len(text)} characters of text...")

    # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
    # without making a request to the annotator at all
    text = text.strip()

    if self.__TEXT_LENGTH_LIMIT > 0:
        text_length = len(text)
        if text_length > self.__TEXT_LENGTH_LIMIT:
            # BUG FIX: the original's adjacent f-string literals were missing a space,
            # producing "...length limit(1000)..." in the log
            log.warning(
                f"Text length ({text_length}) has exceeded the request text length limit "
                f"({self.__TEXT_LENGTH_LIMIT}) so I will truncate it.")
            text = text[:self.__TEXT_LENGTH_LIMIT]

    # Make a request
    ua = UserAgent()
    ua.set_timing([1, 2, 4, 8])
    ua.set_timeout(self.__HTTP_TIMEOUT)
    ua.set_max_size(None)

    request = None
    try:
        request = self._request_for_text(text=text)
        if request is None:
            raise McJSONAnnotationFetcherException("Returned request is None.")
    except Exception as ex:
        # Assume that this is some sort of a programming error too
        fatal_error(f"Unable to create annotator request for text '{text}': {ex}")

    # Wait for the service's HTTP port to become open as the service might be
    # still starting up somewhere
    uri = furl(request.url())
    hostname = str(uri.host)
    port = int(uri.port)
    assert hostname, f"URL hostname is not set for URL {request.url()}"
    assert port, f"API URL port is not set for URL {request.url()}"

    if not wait_for_tcp_port_to_open(
            port=port,
            hostname=hostname,
            retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever.
        fatal_error(
            f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
            f"exiting...")

    log.debug(f"Sending request to {request.url()}...")

    # Try requesting a few times because sometimes it throws a connection error, e.g.:
    #
    #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
    #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
    #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
    #   'Connection reset by peer'))
    #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
    #   ConnectionResetError(104, 'Connection reset by peer'))
    response = None
    retries = 60
    sleep_between_retries = 1
    for retry in range(1, retries + 1):

        if retry > 1:
            log.warning(f"Retrying ({retry} / {retries})...")

        response = ua.request(request)

        if response.is_success():
            break
        else:
            if response.error_is_client_side():
                # Client-side errors are often transient connection resets -- sleep and retry
                log.error(f"Request failed on the client side: {response.decoded_content()}")
                time.sleep(sleep_between_retries)
            else:
                # Server responded with an error -- no point in retrying
                break

    log.debug("Response received.")

    # Force UTF-8 encoding on the response because the server might not always
    # return correct "Content-Type"
    results_string = response.decoded_utf8_content()

    if not response.is_success():
        # Error; determine whether we should be blamed for making a malformed
        # request, or is it an extraction error
        log.warning(f"Request failed: {response.decoded_content()}")

        if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
            # Raise on request timeouts without retrying anything because those usually mean that we posted
            # something funky to the annotator service and it got stuck
            raise McJSONAnnotationFetcherException(
                f"The request timed out, giving up; text length: {len(text)}; text: {text}")

        if response.error_is_client_side():
            # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
            # unresponsive host, etc.)
            fatal_error(f"User agent error: {response.status_line()}: {results_string}")

        else:
            # Error was generated by server
            http_status_code = response.code()

            if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                    or http_status_code == HTTPStatus.BAD_REQUEST.value:
                # Not POST, empty POST
                fatal_error(f'{response.status_line()}: {results_string}')

            elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                # Processing error -- raise so that the error gets caught and logged into a database
                raise McJSONAnnotationFetcherException(
                    f'Annotator service was unable to process the download: {results_string}')

            else:
                # Shutdown the extractor on unconfigured responses
                fatal_error(f'Unknown HTTP response: {response.status_line()}: {results_string}')

    if results_string is None or len(results_string) == 0:
        raise McJSONAnnotationFetcherException(f"Annotator returned nothing for text: {text}")

    log.debug("Parsing response's JSON...")
    results = None
    try:
        results = decode_json(results_string)
        if results is None:
            raise McJSONAnnotationFetcherException("Returned JSON is None.")
    except Exception as ex:
        # If the JSON is invalid, it's probably something broken with the remote service, so that's why whe do
        # fatal_error() here
        fatal_error(f"Unable to parse JSON response: {ex}\nJSON string: {results_string}")
    log.debug("Done parsing response's JSON.")

    response_is_valid = False
    try:
        response_is_valid = self._fetched_annotation_is_valid(results)
    except Exception as ex:
        fatal_error(f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}")
    if not response_is_valid:
        fatal_error(f"Annotator response is invalid for JSON string: {results_string}")

    log.info(f"Done annotating {len(text)} characters of text.")

    return results
def __run_solr(
        port: int,
        instance_data_dir: str,
        hostname: str = None,
        jvm_heap_size: str = None,
        start_jar_args: List[str] = None,
        jvm_opts: List[str] = None,
        connect_timeout: int = 120,
        dist_directory: str = MC_DIST_DIR,
        solr_version: str = MC_SOLR_VERSION,
) -> None:
    """Run a Solr instance and block forever.

    Prepares the instance data directory (collection configuration and
    core.properties, symlinks to the shared Solr home and to the Jetty
    distribution), spawns a Jetty "start.jar" child process, waits for its
    port to open and then sleeps forever so signal handlers can manage the
    child's lifetime.

    :param port: Port for the Solr instance to listen on.
    :param instance_data_dir: Instance data directory; created if missing.
    :param hostname: Hostname to advertise; defaults to the local FQDN.
    :param jvm_heap_size: JVM -Xmx value (e.g. "256m"); None for JVM default.
    :param start_jar_args: Extra arguments appended for start.jar.
    :param jvm_opts: Extra JVM options; defaults to MC_SOLR_STANDALONE_JVM_OPTS.
    :param connect_timeout: Retries to wait for the port to open after starting.
    :param dist_directory: Distribution directory Solr is installed under.
    :param solr_version: Solr version to run.
    """
    if hostname is None:
        # Resolve at call time; a "hostname=fqdn()" parameter default would be
        # evaluated (and could fail) once at import time instead.  This matches
        # the pattern used by run_solr_shard().
        hostname = fqdn()

    if jvm_opts is None:
        jvm_opts = MC_SOLR_STANDALONE_JVM_OPTS

    if start_jar_args is None:
        start_jar_args = []

    # Pass dist_directory / solr_version through instead of relying on the
    # helpers' own defaults, so a non-default distribution location works.
    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        l.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    solr_home_dir = __solr_home_path(solr_home_dir=MC_SOLR_HOME_DIR)
    if not os.path.isdir(solr_home_dir):
        raise Exception("Solr home directory '%s' does not exist." % solr_home_dir)

    solr_path = __solr_path(dist_directory=dist_directory, solr_version=solr_version)

    if not os.path.isdir(instance_data_dir):
        l.info("Creating data directory at %s..." % instance_data_dir)
        mkdir_p(instance_data_dir)

    l.info("Updating collections at %s..." % instance_data_dir)
    collections = __collections(solr_home_dir=solr_home_dir)
    for collection_name, collection_path in sorted(collections.items()):
        l.info("Updating collection '%s'..." % collection_name)

        collection_conf_src_dir = os.path.join(collection_path, "conf")
        if not os.path.isdir(collection_conf_src_dir):
            raise Exception("Configuration for collection '%s' at %s does not exist" % (
                collection_name, collection_conf_src_dir
            ))

        collection_dst_dir = os.path.join(instance_data_dir, collection_name)
        mkdir_p(collection_dst_dir)

        # Remove and copy configuration in case it has changed
        # (don't symlink because Solr 5.5+ doesn't like those)
        collection_conf_dst_dir = os.path.join(collection_dst_dir, "conf")
        if os.path.lexists(collection_conf_dst_dir):
            l.debug("Removing old collection configuration in '%s'..." % collection_conf_dst_dir)
            if os.path.islink(collection_conf_dst_dir):
                # Might still be a link from older Solr versions
                os.unlink(collection_conf_dst_dir)
            else:
                shutil.rmtree(collection_conf_dst_dir)

        l.info("Copying '%s' to '%s'..." % (collection_conf_src_dir, collection_conf_dst_dir))
        shutil.copytree(collection_conf_src_dir, collection_conf_dst_dir, symlinks=False)

        l.info("Updating core.properties for collection '%s'..." % collection_name)
        core_properties_path = os.path.join(collection_dst_dir, "core.properties")
        with open(core_properties_path, "w") as core_properties_file:
            core_properties_file.write("""
#
# This file is autogenerated. Don't bother editing it!
#

name=%(collection_name)s
instanceDir=%(instance_dir)s
""" % {
                "collection_name": collection_name,
                "instance_dir": collection_dst_dir,
            })

    l.info("Symlinking shard configuration...")
    config_items_to_symlink = ["contexts", "etc", "modules", "resources", "solr.xml"]
    for config_item in config_items_to_symlink:
        config_item_src_path = os.path.join(solr_home_dir, config_item)
        if not os.path.exists(config_item_src_path):
            raise Exception("Expected configuration item '%s' does not exist" % config_item_src_path)

        # Recreate symlink just in case
        config_item_dst_path = os.path.join(instance_data_dir, config_item)
        if os.path.lexists(config_item_dst_path):
            if not os.path.islink(config_item_dst_path):
                raise Exception("Configuration item '%s' exists but is not a symlink." % config_item_dst_path)
            os.unlink(config_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." % (config_item_src_path, config_item_dst_path))
        relative_symlink(config_item_src_path, config_item_dst_path)

    jetty_home_path = __jetty_home_path(dist_directory=dist_directory, solr_version=solr_version)

    l.info("Symlinking libraries and JARs...")
    # BUGFIX: "solr-webapp" used to be listed twice, needlessly redoing its symlink.
    library_items_to_symlink = ["lib", "solr-webapp", "start.jar", "solr"]
    for library_item in library_items_to_symlink:
        library_item_src_path = os.path.join(jetty_home_path, library_item)
        if not os.path.exists(library_item_src_path):
            raise Exception("Expected library item '%s' does not exist" % library_item_src_path)

        # Recreate symlink just in case
        library_item_dst_path = os.path.join(instance_data_dir, library_item)
        if os.path.lexists(library_item_dst_path):
            if not os.path.islink(library_item_dst_path):
                raise Exception("Library item '%s' exists but is not a symlink." % library_item_dst_path)
            os.unlink(library_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." % (library_item_src_path, library_item_dst_path))
        relative_symlink(library_item_src_path, library_item_dst_path)

    log4j_properties_path = os.path.join(solr_home_dir, "resources", "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # BUGFIX: the path used to be left out of the message (bare, unformatted "%s")
        raise Exception("log4j.properties at '%s' was not found." % log4j_properties_path)

    start_jar_path = os.path.join(jetty_home_path, "start.jar")
    if not os.path.isfile(start_jar_path):
        raise Exception("start.jar at '%s' was not found." % start_jar_path)

    solr_webapp_path = os.path.abspath(os.path.join(jetty_home_path, "solr-webapp"))
    if not os.path.isdir(solr_webapp_path):
        raise Exception("Solr webapp dir at '%s' was not found." % solr_webapp_path)

    if not hostname_resolves(hostname):
        raise Exception("Hostname '%s' does not resolve." % hostname)

    if tcp_port_is_open(port=port):
        raise Exception("Port %d is already open on this machine." % port)

    __raise_if_old_shards_exist()

    args = ["java"]
    l.info("Starting Solr instance on %s, port %d..." % (hostname, port))

    if jvm_heap_size is not None:
        args += ["-Xmx%s" % jvm_heap_size]
    args += jvm_opts
    # noinspection SpellCheckingInspection
    args += [
        "-server",
        "-Djava.util.logging.config.file=file://" + os.path.abspath(log4j_properties_path),
        "-Djetty.base=%s" % instance_data_dir,
        "-Djetty.home=%s" % instance_data_dir,
        "-Djetty.port=%d" % port,
        "-Dsolr.solr.home=%s" % instance_data_dir,
        "-Dsolr.data.dir=%s" % instance_data_dir,
        "-Dhost=%s" % hostname,
        "-Dmediacloud.luceneMatchVersion=%s" % MC_SOLR_LUCENEMATCHVERSION,

        # write heap dump to data directory on OOM errors
        "-XX:+HeapDumpOnOutOfMemoryError",
        "-XX:HeapDumpPath=%s" % instance_data_dir,

        # needed for resolving paths to JARs in solrconfig.xml
        "-Dmediacloud.solr_dist_dir=%s" % solr_path,
        "-Dmediacloud.solr_webapp_dir=%s" % solr_webapp_path,
    ]
    args += start_jar_args
    args += ["-jar", start_jar_path, "--module=http"]

    l.debug("Running command: %s" % " ".join(args))

    process = subprocess.Popen(args)
    global __solr_pid
    __solr_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    signal.signal(signal.SIGTERM, __kill_solr_process)  # SIGTERM is handled differently for whatever reason
    atexit.register(__kill_solr_process)

    l.info("Solr PID: %d" % __solr_pid)

    l.info("Solr is starting on port %d, will be available shortly..." % port)
    wait_for_tcp_port_to_open(port=port, retries=connect_timeout)

    l.info("Solr is running on port %d!" % port)
    while True:
        time.sleep(1)
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.

    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    """
    content = decode_object_from_bytes_if_needed(content)

    config = config or CommonConfig()

    user_agent = UserAgent()
    api_url = config.extractor_api_url()

    # Wait up to a minute for extraction to finish
    user_agent.set_timeout(EXTRACT_TIMEOUT)

    # Wait for the extractor's HTTP port to become open as the service might be still starting up somewhere
    parsed_api_url = furl(api_url)
    extractor_host = str(parsed_api_url.host)
    extractor_port = int(parsed_api_url.port)
    assert extractor_host, f"API URL hostname is not set for URL {api_url}"
    assert extractor_port, f"API URL port is not set for URL {api_url}"

    port_is_open = wait_for_tcp_port_to_open(
        port=extractor_port,
        hostname=extractor_host,
        retries=EXTRACTOR_SERVICE_TIMEOUT,
    )
    if not port_is_open:
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever:
        #
        # 1) If the extractor service didn't come up in a given time, it won't
        #    suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job
        #    and should fail one way or another; exit(1) is just one of the
        #    ways how it can fail
        # 3) If it's some production code that needs something to get
        #    extracted, and if we were to throw an exception instead of doing
        #    exit(1), the caller might treat this exception as a failure to
        #    extract this one specific input HTML file, and so it might
        #    mis-extract a bunch of stories that way (making it hard for us to
        #    spot the problem and time-consuming to fix it later (e.g. there
        #    would be a need to manually re-extract a million of stories))
        #
        # A better solution instead of exit(1) might be to throw different
        # kinds of exceptions and handle them appropriately in the caller, but
        # with the Perl-Python codebase that's a bit hard to do.
        fatal_error(
            "Extractor service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=api_url,
                timeout=EXTRACTOR_SERVICE_TIMEOUT,
            )
        )

    api_request = Request(method='POST', url=api_url)
    api_request.set_content_type('application/json; charset=utf-8')
    api_request.set_content(encode_json({'html': content}))

    # Try extracting multiple times
    #
    # UserAgent's set_timing() would only retry on retryable HTTP status codes and doesn't retry on connection errors by
    # default as such retries might have side effects, e.g. an API getting called multiple times. So, we retry
    # extracting the content a couple of times manually.
    http_response = None
    for retry in range(EXTRACT_RETRIES):
        if retry > 0:
            log.warning(f"Retrying #{retry + 1}...")

        http_response = user_agent.request(api_request)
        if http_response.is_success():
            # Got a usable response; stop retrying (skips the for-else below)
            break

        log.error(f"Extraction attempt {retry + 1} failed: {http_response.decoded_content()}")
    else:
        # Loop ran out of retries without ever breaking on success
        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters; failed; last error: {http_response.decoded_content()}"
        )

    decoded_response = http_response.decoded_json()

    assert 'extracted_html' in decoded_response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in decoded_response, "Response is expected to have 'extractor_version' key."

    return decoded_response