Beispiel #1
0
def run_solr_shard(shard_num: int,
                   shard_count: int,
                   hostname: str = None,
                   starting_port: int = MC_SOLR_CLUSTER_STARTING_PORT,
                   base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                   dist_directory: str = MC_DIST_DIR,
                   solr_version: str = MC_SOLR_VERSION,
                   zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
                   zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
                   jvm_heap_size: str = MC_SOLR_CLUSTER_JVM_HEAP_SIZE) -> None:
    """Run Solr shard, install Solr if needed; read configuration from ZooKeeper.

    :param shard_num: Number of the shard to run (1 or greater).
    :param shard_count: Total shard count (1 or greater); passed to Solr as "-DnumShards".
    :param hostname: Hostname to run the shard on; fqdn() is used if None.
    :param starting_port: Starting port from which the shard's port is derived.
    :param base_data_dir: Base data directory (resolved under the root path; must exist).
    :param dist_directory: Directory with the Solr distribution.
    :param solr_version: Solr version to run (and install if missing).
    :param zookeeper_host: ZooKeeper host to wait for and pass to Solr as "-DzkHost".
    :param zookeeper_port: ZooKeeper port to wait for and pass to Solr as "-DzkHost".
    :param jvm_heap_size: JVM heap size passed on to __run_solr().
    :raise McSolrRunException: on invalid shard number / count, or if ZooKeeper's port doesn't open in time.
    """
    if shard_num < 1:
        raise McSolrRunException("Shard number must be 1 or greater.")
    if shard_count < 1:
        raise McSolrRunException("Shard count must be 1 or greater.")

    if not __solr_is_installed(dist_directory=dist_directory,
                               solr_version=solr_version):
        log.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory,
                       solr_version=solr_version)

    if hostname is None:
        hostname = fqdn()

    base_data_dir = resolve_absolute_path_under_mc_root(path=base_data_dir,
                                                        must_exist=True)

    shard_port = __shard_port(shard_num=shard_num, starting_port=starting_port)
    shard_data_dir = __shard_data_dir(shard_num=shard_num,
                                      base_data_dir=base_data_dir)

    log.info("Waiting for ZooKeeper to start on %s:%d..." %
             (zookeeper_host, zookeeper_port))
    zookeeper_is_up = wait_for_tcp_port_to_open(
        hostname=zookeeper_host,
        port=zookeeper_port,
        retries=MC_SOLR_CLUSTER_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_is_up:
        # The shard reads its configuration from ZooKeeper, so don't claim that
        # ZooKeeper is up and start Solr when the port never opened
        raise McSolrRunException(
            "Unable to connect to ZooKeeper at %s:%d" %
            (zookeeper_host, zookeeper_port))
    log.info("ZooKeeper is up!")

    log.info("Starting Solr shard %d on port %d..." % (shard_num, shard_port))
    # noinspection SpellCheckingInspection
    shard_args = [
        "-DzkHost=%s:%d" % (zookeeper_host, zookeeper_port),
        "-DnumShards=%d" % shard_count,
    ]
    __run_solr(hostname=hostname,
               port=shard_port,
               instance_data_dir=shard_data_dir,
               jvm_heap_size=jvm_heap_size,
               jvm_opts=MC_SOLR_CLUSTER_JVM_OPTS,
               start_jar_args=shard_args,
               connect_timeout=MC_SOLR_CLUSTER_CONNECT_RETRIES,
               dist_directory=dist_directory,
               solr_version=solr_version)
def test_wait_for_tcp_port_to_open():
    """Check that wait_for_tcp_port_to_open() reports a port as closed, open, and closed again."""
    random_port = random_unused_port()
    assert wait_for_tcp_port_to_open(port=random_port, retries=2) is False

    # Open port; the context manager closes the socket even if the assertion
    # below fails, so a failing test doesn't leave the port occupied
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('localhost', random_port))
        s.listen()
        assert wait_for_tcp_port_to_open(port=random_port, retries=2) is True

    # Port got closed when leaving the "with" block
    assert wait_for_tcp_port_to_open(port=random_port, retries=2) is False
Beispiel #3
0
def test_wait_for_tcp_port_to_open():
    """Check that wait_for_tcp_port_to_open() reports a port as closed, open, and closed again."""
    random_port = random_unused_port()
    assert wait_for_tcp_port_to_open(port=random_port, retries=2) is False

    # Open port; the context manager closes the socket even if the assertion
    # below fails, so a failing test doesn't leave the port occupied
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('localhost', random_port))
        s.listen()
        assert wait_for_tcp_port_to_open(port=random_port, retries=2) is True

    # Port got closed when leaving the "with" block
    assert wait_for_tcp_port_to_open(port=random_port, retries=2) is False
Beispiel #4
0
def run_solr_shard(shard_num: int,
                   shard_count: int,
                   hostname: str = None,
                   starting_port: int = MC_SOLR_CLUSTER_STARTING_PORT,
                   base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                   dist_directory: str = MC_DIST_DIR,
                   solr_version: str = MC_SOLR_VERSION,
                   zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
                   zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
                   jvm_heap_size: str = MC_SOLR_CLUSTER_JVM_HEAP_SIZE) -> None:
    """Run Solr shard, install Solr if needed; read configuration from ZooKeeper.

    :param shard_num: Number of the shard to run (1 or greater).
    :param shard_count: Total shard count (1 or greater); passed to Solr as "-DnumShards".
    :param hostname: Hostname to run the shard on; fqdn() is used if None.
    :param starting_port: Starting port from which the shard's port is derived.
    :param base_data_dir: Base data directory (resolved under the root path; must exist).
    :param dist_directory: Directory with the Solr distribution.
    :param solr_version: Solr version to run (and install if missing).
    :param zookeeper_host: ZooKeeper host to wait for and pass to Solr as "-DzkHost".
    :param zookeeper_port: ZooKeeper port to wait for and pass to Solr as "-DzkHost".
    :param jvm_heap_size: JVM heap size passed on to __run_solr().
    :raise McSolrRunException: on invalid shard number / count, or if ZooKeeper's port doesn't open in time.
    """
    if shard_num < 1:
        raise McSolrRunException("Shard number must be 1 or greater.")
    if shard_count < 1:
        raise McSolrRunException("Shard count must be 1 or greater.")

    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        log.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    if hostname is None:
        hostname = fqdn()

    base_data_dir = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    shard_port = __shard_port(shard_num=shard_num, starting_port=starting_port)
    shard_data_dir = __shard_data_dir(shard_num=shard_num, base_data_dir=base_data_dir)

    log.info("Waiting for ZooKeeper to start on %s:%d..." % (zookeeper_host, zookeeper_port))
    zookeeper_is_up = wait_for_tcp_port_to_open(hostname=zookeeper_host,
                                                port=zookeeper_port,
                                                retries=MC_SOLR_CLUSTER_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_is_up:
        # The shard reads its configuration from ZooKeeper, so don't claim that ZooKeeper is up and start Solr
        # when the port never opened
        raise McSolrRunException("Unable to connect to ZooKeeper at %s:%d" % (zookeeper_host, zookeeper_port))
    log.info("ZooKeeper is up!")

    log.info("Starting Solr shard %d on port %d..." % (shard_num, shard_port))
    # noinspection SpellCheckingInspection
    shard_args = [
        "-DzkHost=%s:%d" % (zookeeper_host, zookeeper_port),
        "-DnumShards=%d" % shard_count,
    ]
    __run_solr(hostname=hostname,
               port=shard_port,
               instance_data_dir=shard_data_dir,
               jvm_heap_size=jvm_heap_size,
               jvm_opts=MC_SOLR_CLUSTER_JVM_OPTS,
               start_jar_args=shard_args,
               connect_timeout=MC_SOLR_CLUSTER_CONNECT_RETRIES,
               dist_directory=dist_directory,
               solr_version=solr_version)
Beispiel #5
0
    def start(self, delay: int = 0):
        """Launch the test web server in a separate process.

        Arguments:
        delay - number of seconds to delay before starting server
        """

        if tcp_port_is_open(port=self.__port):
            raise McHashServerException("Port %d is already open." % self.__port)

        log.debug('Starting test web server %s:%d' % (self.__host, self.__port))
        log.debug('Pages: %s' % str(self.__pages))

        # A separate process is used because "threading.Thread()" doesn't work
        # with Perl callers
        server_target = self.__start_http_server
        server_args = (
            self.__host,
            self.__port,
            self.__pages,
            self.__http_server_active_pids,
            self.__http_server_active_pids_lock,
            delay,
        )
        self.__http_server_thread = multiprocessing.Process(target=server_target, args=server_args)
        self.__http_server_thread.daemon = True
        self.__http_server_thread.start()

        # The port is only checked for an immediate start (presumably because a
        # delayed server is not expected to be listening yet)
        if delay != 0:
            return

        port_is_open = wait_for_tcp_port_to_open(port=self.__port, retries=20, delay=0.1)
        if not port_is_open:
            raise McHashServerException("Port %d is not open." % self.__port)
    def start(self, delay: int = 0):
        """Spawn the test web server as a daemon process.

        Arguments:
        delay - number of seconds to delay before starting server
        """

        port_already_open = tcp_port_is_open(port=self.__port)
        if port_already_open:
            raise McHashServerException("Port %d is already open." % self.__port)

        server_address = (self.__host, self.__port,)
        log.info('Starting test web server %s:%d' % server_address)
        log.debug('Pages: %s' % str(self.__pages))

        # "threading.Thread()" doesn't work with Perl callers, hence a process
        server_process = multiprocessing.Process(
            target=self.__start_http_server,
            args=(self.__host, self.__port, self.__pages,
                  self.__http_server_active_pids, self.__http_server_active_pids_lock,
                  delay),
        )
        self.__http_server_thread = server_process
        self.__http_server_thread.daemon = True
        self.__http_server_thread.start()

        # Only an immediate (non-delayed) start is verified by waiting for the port
        if delay == 0 and not wait_for_tcp_port_to_open(port=self.__port, retries=20, delay=0.1):
            raise McHashServerException("Port %d is not open." % self.__port)
Beispiel #7
0
def workflow_client(namespace: str = 'default') -> WorkflowClient:
    """
    Create a client connected to the Temporal server.

    :param namespace: Namespace to connect to.
    :return: WorkflowClient instance.
    """

    temporal_host, temporal_port = 'temporal-server', 7233

    # Waiting for the port to open is lame, but the Python SDK seems to fail otherwise
    wait_for_tcp_port_to_open(hostname=temporal_host, port=temporal_port)

    return WorkflowClient.new_client(host=temporal_host, port=temporal_port, namespace=namespace)
Beispiel #8
0
def __run_solr(port: int,
               instance_data_dir: str,
               hostname: str = fqdn(),
               jvm_heap_size: str = None,
               start_jar_args: List[str] = None,
               jvm_opts: List[str] = None,
               connect_timeout: int = 120,
               dist_directory: str = MC_DIST_DIR,
               solr_version: str = MC_SOLR_VERSION) -> None:
    """Run Solr instance.

    Installs Solr if needed, prepares the instance data directory (copies of collection configuration,
    "core.properties" files, symlinks to shared configuration and Jetty libraries), starts Solr as a child "java"
    process, waits for its port to open and then sleeps forever; the function does not return normally.

    :param port: Port for Solr to listen on; must not be open already.
    :param instance_data_dir: Data directory of the instance; created if it doesn't exist.
    :param hostname: Hostname passed to Solr as "-Dhost"; must resolve. Note that the default is computed once, when
        the function is defined.
    :param jvm_heap_size: Value for the JVM's "-Xmx" option; the option is omitted if None.
    :param start_jar_args: Extra arguments placed before "-jar start.jar".
    :param jvm_opts: JVM options; MC_SOLR_STANDALONE_JVM_OPTS is used if None.
    :param connect_timeout: Passed as the retry count when waiting for Solr's port to open.
    :param dist_directory: Directory with the Solr distribution.
    :param solr_version: Solr version to run (and install if missing).
    :raise McSolrRunException: if an expected directory / file is missing, a symlink destination exists but is not a
        symlink, the hostname doesn't resolve, or the port is already open.
    """
    if jvm_opts is None:
        jvm_opts = MC_SOLR_STANDALONE_JVM_OPTS

    if start_jar_args is None:
        start_jar_args = []

    # Check for / install the very distribution that the caller asked to run
    if not __solr_is_installed(dist_directory=dist_directory,
                               solr_version=solr_version):
        l.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory,
                       solr_version=solr_version)

    solr_home_dir = __solr_home_path(solr_home_dir=MC_SOLR_HOME_DIR)
    if not os.path.isdir(solr_home_dir):
        raise McSolrRunException("Solr home directory '%s' does not exist." %
                                 solr_home_dir)

    solr_path = __solr_path(dist_directory=dist_directory,
                            solr_version=solr_version)

    if not os.path.isdir(instance_data_dir):
        l.info("Creating data directory at %s..." % instance_data_dir)
        mkdir_p(instance_data_dir)

    l.info("Updating collections at %s..." % instance_data_dir)
    collections = __collections(solr_home_dir=solr_home_dir)
    for collection_name, collection_path in sorted(collections.items()):
        l.info("Updating collection '%s'..." % collection_name)

        collection_conf_src_dir = os.path.join(collection_path, "conf")
        if not os.path.isdir(collection_conf_src_dir):
            raise McSolrRunException(
                "Configuration for collection '%s' at %s does not exist" %
                (collection_name, collection_conf_src_dir))

        collection_dst_dir = os.path.join(instance_data_dir, collection_name)
        mkdir_p(collection_dst_dir)

        # Remove and copy configuration in case it has changed
        # (don't symlink because Solr 5.5+ doesn't like those)
        collection_conf_dst_dir = os.path.join(collection_dst_dir, "conf")
        if os.path.lexists(collection_conf_dst_dir):
            l.debug("Removing old collection configuration in '%s'..." %
                    collection_conf_dst_dir)
            if os.path.islink(collection_conf_dst_dir):
                # Might still be a link from older Solr versions
                os.unlink(collection_conf_dst_dir)
            else:
                shutil.rmtree(collection_conf_dst_dir)

        l.info("Copying '%s' to '%s'..." %
               (collection_conf_src_dir, collection_conf_dst_dir))
        shutil.copytree(collection_conf_src_dir,
                        collection_conf_dst_dir,
                        symlinks=False)

        l.info("Updating core.properties for collection '%s'..." %
               collection_name)
        core_properties_path = os.path.join(collection_dst_dir,
                                            "core.properties")
        with open(core_properties_path, 'w') as core_properties_file:
            core_properties_file.write(
                """
#
# This file is autogenerated. Don't bother editing it!
#

name=%(collection_name)s
instanceDir=%(instance_dir)s
""" % {
                    "collection_name": collection_name,
                    "instance_dir": collection_dst_dir,
                })

    l.info("Symlinking shard configuration...")
    config_items_to_symlink = [
        "contexts",
        "etc",
        "modules",
        "resources",
        "solr.xml",
    ]
    for config_item in config_items_to_symlink:
        config_item_src_path = os.path.join(solr_home_dir, config_item)
        if not os.path.exists(config_item_src_path):
            raise McSolrRunException(
                "Expected configuration item '%s' does not exist" %
                config_item_src_path)

        # Recreate symlink just in case
        config_item_dst_path = os.path.join(instance_data_dir, config_item)
        if os.path.lexists(config_item_dst_path):
            if not os.path.islink(config_item_dst_path):
                raise McSolrRunException(
                    "Configuration item '%s' exists but is not a symlink." %
                    config_item_dst_path)
            os.unlink(config_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." %
               (config_item_src_path, config_item_dst_path))
        relative_symlink(config_item_src_path, config_item_dst_path)

    jetty_home_path = __jetty_home_path(dist_directory=dist_directory,
                                        solr_version=solr_version)

    l.info("Symlinking libraries and JARs...")
    # Each item is listed once; symlinking the same item twice only recreated
    # an identical symlink
    library_items_to_symlink = [
        "lib",
        "solr-webapp",
        "start.jar",
        "solr",
    ]
    for library_item in library_items_to_symlink:
        library_item_src_path = os.path.join(jetty_home_path, library_item)
        if not os.path.exists(library_item_src_path):
            raise McSolrRunException(
                "Expected library item '%s' does not exist" %
                library_item_src_path)

        # Recreate symlink just in case
        library_item_dst_path = os.path.join(instance_data_dir, library_item)
        if os.path.lexists(library_item_dst_path):
            if not os.path.islink(library_item_dst_path):
                raise McSolrRunException(
                    "Library item '%s' exists but is not a symlink." %
                    library_item_dst_path)
            os.unlink(library_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." %
               (library_item_src_path, library_item_dst_path))
        relative_symlink(library_item_src_path, library_item_dst_path)

    log4j_properties_path = os.path.join(solr_home_dir, "resources",
                                         "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        raise McSolrRunException("log4j.properties at '%s' was not found." %
                                 log4j_properties_path)

    start_jar_path = os.path.join(jetty_home_path, "start.jar")
    if not os.path.isfile(start_jar_path):
        raise McSolrRunException("start.jar at '%s' was not found." %
                                 start_jar_path)

    solr_webapp_path = os.path.abspath(
        os.path.join(jetty_home_path, "solr-webapp"))
    if not os.path.isdir(solr_webapp_path):
        raise McSolrRunException("Solr webapp dir at '%s' was not found." %
                                 solr_webapp_path)

    if not hostname_resolves(hostname):
        raise McSolrRunException("Hostname '%s' does not resolve." % hostname)

    if tcp_port_is_open(port=port):
        raise McSolrRunException("Port %d is already open on this machine." %
                                 port)

    __raise_if_old_shards_exist()

    args = ["java"]
    l.info("Starting Solr instance on %s, port %d..." % (hostname, port))

    if jvm_heap_size is not None:
        args += ["-Xmx%s" % jvm_heap_size]
    args += jvm_opts
    # noinspection SpellCheckingInspection
    args += [
        "-server",
        "-Djava.util.logging.config.file=file://" +
        os.path.abspath(log4j_properties_path),
        "-Djetty.base=%s" % instance_data_dir,
        "-Djetty.home=%s" % instance_data_dir,
        "-Djetty.port=%d" % port,
        "-Dsolr.solr.home=%s" % instance_data_dir,
        "-Dsolr.data.dir=%s" % instance_data_dir,
        "-Dhost=%s" % hostname,
        "-Dmediacloud.luceneMatchVersion=%s" % MC_SOLR_LUCENEMATCHVERSION,

        # write heap dump to data directory on OOM errors
        "-XX:+HeapDumpOnOutOfMemoryError",
        "-XX:HeapDumpPath=%s" % instance_data_dir,

        # needed for resolving paths to JARs in solrconfig.xml
        "-Dmediacloud.solr_dist_dir=%s" % solr_path,
        "-Dmediacloud.solr_webapp_dir=%s" % solr_webapp_path,
    ]
    args += start_jar_args
    args += [
        "-jar",
        start_jar_path,
        "--module=http",
    ]

    l.debug("Running command: %s" % ' '.join(args))

    process = subprocess.Popen(args)
    global __solr_pid
    __solr_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    # SIGTERM is handled differently for whatever reason
    signal.signal(signal.SIGTERM, __kill_solr_process)
    atexit.register(__kill_solr_process)

    l.info("Solr PID: %d" % __solr_pid)

    l.info("Solr is starting on port %d, will be available shortly..." % port)
    if wait_for_tcp_port_to_open(port=port, retries=connect_timeout):
        l.info("Solr is running on port %d!" % port)
    else:
        # Don't claim that Solr is running when its port never opened; keep
        # the process around though, as Solr might just be slow to start
        l.warning("Solr's port %d did not open after %d retries." %
                  (port, connect_timeout))

    # Keep the parent process alive for as long as Solr is supposed to run
    while True:
        time.sleep(1)
Beispiel #9
0
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list.

        The text is stripped of surrounding whitespace and, if the text length limit is positive, truncated to that
        limit before being sent to the annotator.

        :param text: Text to annotate.
        :return: Decoded JSON annotation which has passed self._fetched_annotation_is_valid().
        :raise McJSONAnnotationFetcherException: if the text is empty, the request timed out (HTTP 408), the
            annotator failed to process the text (HTTP 500), or the annotator returned an empty response. Other
            failures are reported through fatal_error().
        """

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all; trimming has to happen before the emptiness check or
        # else whitespace-only text would still get sent to the annotator
        text = text.strip()

        if len(text) == 0:
            # Empty text might happen with some stories, so raise a regular exception instead of treating it as a
            # fatal error
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info("Annotating %d characters of text..." % len(text))

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it."
                    % (
                        text_length,
                        self.__TEXT_LENGTH_LIMIT,
                    ))
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                "Unable to create annotator request for text '%s': %s" % (
                    text,
                    str(ex),
                ))

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        url = request.url()
        uri = furl(url)
        # Check the raw values: str(None) would be a truthy "None", and int(None) would raise a TypeError before
        # the assertion gets a chance to report the problem
        assert uri.host, f"URL hostname is not set for URL {url}"
        assert uri.port, f"API URL port is not set for URL {url}"
        hostname = str(uri.host)
        port = int(uri.port)

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                "Annotator service at {url} didn't come up in {timeout} seconds, exiting..."
                .format(
                    url=url,
                    timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
                ))

        log.debug("Sending request to %s..." % request.url())
        response = ua.request(request)
        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning("Request failed: %s" % response.decoded_content())

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    "The request timed out, giving up; text length: %d; text: %s"
                    % (
                        len(text),
                        text,
                    ))

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error("User agent error: %s: %s" % (
                    response.status_line(),
                    results_string,
                ))

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error('%s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        'Annotator service was unable to process the download: %s'
                        % results_string)

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error('Unknown HTTP response: %s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                "Annotator returned nothing for text: %s" % text)

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error("Unable to parse JSON response: %s\nJSON string: %s" %
                        (
                            str(ex),
                            results_string,
                        ))
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                "Unable to determine whether response is valid: %s\nJSON string: %s"
                % (str(ex), results_string))
        if not response_is_valid:
            fatal_error("Annotator response is invalid for JSON string: %s" %
                        results_string)

        log.info("Done annotating %d characters of text." % len(text))

        return results
Beispiel #10
0
def run_zookeeper(dist_directory: str = MC_DIST_DIR,
                  listen: str = MC_ZOOKEEPER_LISTEN,
                  port: int = MC_ZOOKEEPER_PORT,
                  data_dir: str = MC_SOLR_BASE_DATA_DIR,
                  zookeeper_version: str = MC_ZOOKEEPER_VERSION,
                  solr_version: str = MC_SOLR_VERSION) -> None:
    """Run ZooKeeper, install if needed too.

    Writes "zoo.cfg" into ZooKeeper's data directory, starts "zkServer.sh start-foreground" as a child process,
    waits for the port to open, uploads the initial Solr collection configurations and then sleeps forever; the
    function does not return normally.

    :param dist_directory: Directory with the ZooKeeper / Solr distributions.
    :param listen: Address written to "zoo.cfg" as "clientPortAddress".
    :param port: Port written to "zoo.cfg" as "clientPort"; must not be open already.
    :param data_dir: Base data directory (resolved under the root path; must exist); ZooKeeper's data goes into its
        "mediacloud-cluster-zookeeper" subdirectory.
    :param zookeeper_version: ZooKeeper version to run.
    :param solr_version: Solr version whose collection configurations get uploaded.
    :raise McZooKeeperRunException: if the port is already open, "zkServer.sh" or "log4j.properties" is missing, or
        ZooKeeper's port doesn't open in time.
    """
    if not __zookeeper_is_installed():
        log.info("ZooKeeper is not installed, installing...")
        __install_zookeeper()

    data_dir = resolve_absolute_path_under_mc_root(path=data_dir,
                                                   must_exist=True)

    zookeeper_data_dir = os.path.join(data_dir, "mediacloud-cluster-zookeeper")
    if not os.path.isdir(zookeeper_data_dir):
        log.info("Creating data directory at %s..." % zookeeper_data_dir)
        mkdir_p(zookeeper_data_dir)

    if tcp_port_is_open(port=port):
        raise McZooKeeperRunException(
            "Port %d is already open on this machine." % port)

    zookeeper_path = __zookeeper_path(dist_directory=dist_directory,
                                      zookeeper_version=zookeeper_version)

    zkserver_path = os.path.join(zookeeper_path, "bin", "zkServer.sh")
    if not os.path.isfile(zkserver_path):
        raise McZooKeeperRunException("zkServer.sh at '%s' was not found." %
                                      zkserver_path)

    log4j_properties_path = os.path.join(zookeeper_path, "conf",
                                         "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        raise McZooKeeperRunException(
            "log4j.properties at '%s' was not found." % log4j_properties_path)

    zoo_cnf_path = os.path.join(zookeeper_data_dir, "zoo.cfg")
    log.info("Creating zoo.cfg in '%s'..." % zoo_cnf_path)

    with open(zoo_cnf_path, 'w') as zoo_cnf:
        zoo_cnf.write("""
#
# This file is autogenerated. Please do not modify it!
#

clientPortAddress=%(listen)s
clientPort=%(port)d
dataDir=%(data_dir)s

# Must be between zkClientTimeout / 2 and zkClientTimeout / 20
tickTime=30000

initLimit=10
syncLimit=10
            """ % {
            "listen": listen,
            "port": port,
            "data_dir": zookeeper_data_dir,
        })

    zookeeper_env = os.environ.copy()
    # Data directory serves as configuration dir too
    zookeeper_env["ZOOCFGDIR"] = zookeeper_data_dir
    zookeeper_env["ZOOCFG"] = "zoo.cfg"
    zookeeper_env["ZOO_LOG_DIR"] = zookeeper_data_dir
    zookeeper_env["SERVER_JVMFLAGS"] = "-Dlog4j.configuration=file://" + os.path.abspath(log4j_properties_path)

    args = [zkserver_path, "start-foreground"]

    log.info("Starting ZooKeeper on %s:%d..." % (listen, port))
    log.debug("Running command: %s" % str(args))
    log.debug("Environment variables: %s" % str(zookeeper_env))

    process = subprocess.Popen(args, env=zookeeper_env)
    global __zookeeper_pid
    __zookeeper_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    # SIGTERM is handled differently for whatever reason
    signal.signal(signal.SIGTERM, __kill_zookeeper_process)
    atexit.register(__kill_zookeeper_process)

    log.info("ZooKeeper PID: %d" % __zookeeper_pid)

    log.info("Waiting for ZooKeeper to start at port %d..." % port)
    zookeeper_started = wait_for_tcp_port_to_open(
        port=port, retries=MC_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_started:
        raise McZooKeeperRunException(
            "Unable to connect to ZooKeeper at port %d" % port)

    log.info(
        "Uploading initial Solr collection configurations to ZooKeeper...")
    update_zookeeper_solr_configuration(zookeeper_host="localhost",
                                        zookeeper_port=port,
                                        dist_directory=dist_directory,
                                        solr_version=solr_version)

    log.info("ZooKeeper is ready on port %d!" % port)
    # Keep the parent process alive for as long as ZooKeeper is supposed to run
    while True:
        time.sleep(1)
Beispiel #11
0
def run_zookeeper(dist_directory: str = MC_DIST_DIR,
                  listen: str = MC_ZOOKEEPER_LISTEN,
                  port: int = MC_ZOOKEEPER_PORT,
                  data_dir: str = MC_SOLR_BASE_DATA_DIR,
                  zookeeper_version: str = MC_ZOOKEEPER_VERSION,
                  solr_version: str = MC_SOLR_VERSION) -> None:
    """Run ZooKeeper in the foreground, installing it first if needed.

    Writes a fresh "zoo.cfg" into the ZooKeeper data directory, starts
    "zkServer.sh start-foreground" as a child process, waits for its TCP port
    to open, uploads the initial Solr collection configuration and then sleeps
    forever; this function does not return normally.

    :param dist_directory: Directory that the ZooKeeper / Solr distributions are looked up in.
    :param listen: Address written to zoo.cfg as "clientPortAddress".
    :param port: Port written to zoo.cfg as "clientPort"; must not be open yet.
    :param data_dir: Base data directory (resolved under the Media Cloud root); ZooKeeper's own data
                     directory "mediacloud-cluster-zookeeper" is created under it.
    :param zookeeper_version: ZooKeeper version used to locate the distribution.
    :param solr_version: Solr version passed on when uploading the collection configuration.
    :raises McZooKeeperRunException: if the port is already open, if zkServer.sh or log4j.properties is
                                     missing, or if ZooKeeper's port doesn't open in time.
    """
    # NOTE(review): dist_directory / zookeeper_version are not forwarded to the
    # two install helpers below, so they presumably fall back to their own
    # defaults -- confirm against the helpers' signatures.
    if not __zookeeper_is_installed():
        log.info("ZooKeeper is not installed, installing...")
        __install_zookeeper()

    data_dir = resolve_absolute_path_under_mc_root(path=data_dir, must_exist=True)

    zookeeper_data_dir = os.path.join(data_dir, "mediacloud-cluster-zookeeper")
    if not os.path.isdir(zookeeper_data_dir):
        log.info("Creating data directory at %s..." % zookeeper_data_dir)
        mkdir_p(zookeeper_data_dir)

    # Refuse to start if something is already listening on the port
    if tcp_port_is_open(port=port):
        raise McZooKeeperRunException("Port %d is already open on this machine." % port)

    zookeeper_path = __zookeeper_path(dist_directory=dist_directory, zookeeper_version=zookeeper_version)

    zkserver_path = os.path.join(zookeeper_path, "bin", "zkServer.sh")
    if not os.path.isfile(zkserver_path):
        raise McZooKeeperRunException("zkServer.sh at '%s' was not found." % zkserver_path)

    log4j_properties_path = os.path.join(zookeeper_path, "conf", "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # The path has to be interpolated, otherwise the message contains a literal '%s'
        raise McZooKeeperRunException("log4j.properties at '%s' was not found." % log4j_properties_path)

    # zoo.cfg is regenerated on every run
    zoo_cnf_path = os.path.join(zookeeper_data_dir, "zoo.cfg")
    log.info("Creating zoo.cfg in '%s'..." % zoo_cnf_path)

    with open(zoo_cnf_path, 'w') as zoo_cnf:
        zoo_cnf.write("""
#
# This file is autogenerated. Please do not modify it!
#

clientPortAddress=%(listen)s
clientPort=%(port)d
dataDir=%(data_dir)s

# Must be between zkClientTimeout / 2 and zkClientTimeout / 20
tickTime=30000

initLimit=10
syncLimit=10
            """ % {
            "listen": listen,
            "port": port,
            "data_dir": zookeeper_data_dir,
        })

    # The child inherits the current environment plus the ZooKeeper-specific settings
    zookeeper_env = os.environ.copy()
    zookeeper_env["ZOOCFGDIR"] = zookeeper_data_dir  # Serves as configuration dir too
    zookeeper_env["ZOOCFG"] = "zoo.cfg"
    zookeeper_env["ZOO_LOG_DIR"] = zookeeper_data_dir
    zookeeper_env["SERVER_JVMFLAGS"] = "-Dlog4j.configuration=file://" + os.path.abspath(log4j_properties_path)

    args = [
        zkserver_path,
        "start-foreground"
    ]

    log.info("Starting ZooKeeper on %s:%d..." % (listen, port))
    log.debug("Running command: %s" % str(args))
    log.debug("Environment variables: %s" % str(zookeeper_env))

    process = subprocess.Popen(args, env=zookeeper_env)

    # Remember the child's PID in a module-level variable (presumably read by
    # __kill_zookeeper_process(), which is registered below)
    global __zookeeper_pid
    __zookeeper_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    signal.signal(signal.SIGTERM, __kill_zookeeper_process)  # SIGTERM is handled differently for whatever reason
    atexit.register(__kill_zookeeper_process)

    log.info("ZooKeeper PID: %d" % __zookeeper_pid)

    log.info("Waiting for ZooKeeper to start at port %d..." % port)
    zookeeper_started = wait_for_tcp_port_to_open(port=port, retries=MC_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_started:
        raise McZooKeeperRunException("Unable to connect to ZooKeeper at port %d" % port)

    log.info("Uploading initial Solr collection configurations to ZooKeeper...")
    update_zookeeper_solr_configuration(zookeeper_host="localhost",
                                        zookeeper_port=port,
                                        dist_directory=dist_directory,
                                        solr_version=solr_version)

    log.info("ZooKeeper is ready on port %d!" % port)

    # Keep the parent process alive for as long as the child is running
    while True:
        time.sleep(1)
Beispiel #12
0
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list.

        The text is trimmed (and truncated to the configured length limit, if
        one is set) before it is sent to the annotator. The request is retried
        on client-side (connection) errors; the response is then decoded as
        JSON and validated with _fetched_annotation_is_valid().

        Problems that look like programming / infrastructure errors are
        reported via fatal_error() (which, judging by the comments in this
        method, is expected to bring the whole application down); problems
        with a specific text are raised as exceptions instead.

        :param text: Text to annotate.
        :return: Decoded JSON annotation.
        :raises McJSONAnnotationFetcherException: if the text is empty after trimming, the request timed
                                                  out, the annotator failed to process the text (HTTP 500),
                                                  or the annotator returned nothing.
        """

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        # The emptiness check has to come after trimming; otherwise whitespace-only text would still be sent
        if len(text) == 0:
            # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info(f"Annotating {len(text)} characters of text...")

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    f"Text length ({text_length}) has exceeded the request text length limit"
                    f"({self.__TEXT_LENGTH_LIMIT}) so I will truncate it.")
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                f"Unable to create annotator request for text '{text}': {ex}")

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        uri = furl(request.url())

        # Validate the raw values before converting them: str(None) is the
        # truthy string "None" and int(None) raises TypeError, so checking the
        # converted values would never report a missing hostname / port
        assert uri.host, f"URL hostname is not set for URL {request.url()}"
        assert uri.port, f"API URL port is not set for URL {request.url()}"
        hostname = str(uri.host)
        port = int(uri.port)

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
                f"exiting...")

        log.debug(f"Sending request to {request.url()}...")

        # Try requesting a few times because sometimes it throws a connection error, e.g.:
        #
        #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
        #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
        #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
        #   'Connection reset by peer'))
        #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
        #   ConnectionResetError(104, 'Connection reset by peer'))
        response = None
        retries = 60
        sleep_between_retries = 1
        for retry in range(1, retries + 1):

            if retry > 1:
                log.warning(f"Retrying ({retry} / {retries})...")

            response = ua.request(request)

            if response.is_success():
                break
            else:
                if response.error_is_client_side():
                    # Only client-side errors are retried here
                    log.error(
                        f"Request failed on the client side: {response.decoded_content()}"
                    )
                    time.sleep(sleep_between_retries)
                else:
                    # Server-side errors are handled below, without retrying
                    break

        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning(f"Request failed: {response.decoded_content()}")

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    f"The request timed out, giving up; text length: {len(text)}; text: {text}"
                )

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error(
                    f"User agent error: {response.status_line()}: {results_string}"
                )

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error(f'{response.status_line()}: {results_string}')

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        f'Annotator service was unable to process the download: {results_string}'
                    )

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error(
                        f'Unknown HTTP response: {response.status_line()}: {results_string}'
                    )

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                f"Annotator returned nothing for text: {text}")

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error(
                f"Unable to parse JSON response: {ex}\nJSON string: {results_string}"
            )
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}"
            )
        if not response_is_valid:
            fatal_error(
                f"Annotator response is invalid for JSON string: {results_string}"
            )

        log.info(f"Done annotating {len(text)} characters of text.")

        return results
Beispiel #13
0
def __run_solr(
    port: int,
    instance_data_dir: str,
    hostname: str = None,
    jvm_heap_size: str = None,
    start_jar_args: List[str] = None,
    jvm_opts: List[str] = None,
    connect_timeout: int = 120,
    dist_directory: str = MC_DIST_DIR,
    solr_version: str = MC_SOLR_VERSION,
) -> None:
    """Run Solr instance.

    Installs Solr if needed, refreshes the collection configuration and the
    symlinks in the instance data directory, starts Solr (Jetty's start.jar)
    as a child "java" process, waits for its port and then sleeps forever;
    this function does not return normally.

    :param port: Port for Solr to listen on; must not be open yet.
    :param instance_data_dir: Instance data directory; created if it doesn't exist.
    :param hostname: Hostname passed to Solr as "-Dhost"; defaults to this machine's FQDN.
    :param jvm_heap_size: Value for the JVM's "-Xmx" option; the option is omitted if None.
    :param start_jar_args: Extra arguments placed right before "-jar start.jar".
    :param jvm_opts: JVM options; defaults to MC_SOLR_STANDALONE_JVM_OPTS.
    :param connect_timeout: Number of retries while waiting for Solr's port to open.
    :param dist_directory: Directory that the Solr distribution is looked up in.
    :param solr_version: Solr version used to locate the distribution.
    :raises Exception: if an expected directory / file is missing, if a symlink destination exists but
                       is not a symlink, if the hostname doesn't resolve or if the port is already open.
    """
    # Resolve the default lazily; a "hostname=fqdn()" default would be
    # evaluated only once, when the module gets imported
    if hostname is None:
        hostname = fqdn()

    if jvm_opts is None:
        jvm_opts = MC_SOLR_STANDALONE_JVM_OPTS

    if start_jar_args is None:
        start_jar_args = []

    # Check for / install the very same distribution that gets run below
    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        l.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    solr_home_dir = __solr_home_path(solr_home_dir=MC_SOLR_HOME_DIR)
    if not os.path.isdir(solr_home_dir):
        raise Exception("Solr home directory '%s' does not exist." % solr_home_dir)

    solr_path = __solr_path(dist_directory=dist_directory, solr_version=solr_version)

    if not os.path.isdir(instance_data_dir):
        l.info("Creating data directory at %s..." % instance_data_dir)
        mkdir_p(instance_data_dir)

    l.info("Updating collections at %s..." % instance_data_dir)
    collections = __collections(solr_home_dir=solr_home_dir)
    for collection_name, collection_path in sorted(collections.items()):
        l.info("Updating collection '%s'..." % collection_name)

        collection_conf_src_dir = os.path.join(collection_path, "conf")
        if not os.path.isdir(collection_conf_src_dir):
            raise Exception(
                "Configuration for collection '%s' at %s does not exist" % (collection_name, collection_conf_src_dir)
            )

        collection_dst_dir = os.path.join(instance_data_dir, collection_name)
        mkdir_p(collection_dst_dir)

        # Remove and copy configuration in case it has changed
        # (don't symlink because Solr 5.5+ doesn't like those)
        collection_conf_dst_dir = os.path.join(collection_dst_dir, "conf")
        if os.path.lexists(collection_conf_dst_dir):
            l.debug("Removing old collection configuration in '%s'..." % collection_conf_dst_dir)
            if os.path.islink(collection_conf_dst_dir):
                # Might still be a link from older Solr versions
                os.unlink(collection_conf_dst_dir)
            else:
                shutil.rmtree(collection_conf_dst_dir)

        l.info("Copying '%s' to '%s'..." % (collection_conf_src_dir, collection_conf_dst_dir))
        shutil.copytree(collection_conf_src_dir, collection_conf_dst_dir, symlinks=False)

        l.info("Updating core.properties for collection '%s'..." % collection_name)
        core_properties_path = os.path.join(collection_dst_dir, "core.properties")
        with open(core_properties_path, "w") as core_properties_file:
            core_properties_file.write(
                """
#
# This file is autogenerated. Don't bother editing it!
#

name=%(collection_name)s
instanceDir=%(instance_dir)s
"""
                % {"collection_name": collection_name, "instance_dir": collection_dst_dir}
            )

    l.info("Symlinking shard configuration...")
    config_items_to_symlink = ["contexts", "etc", "modules", "resources", "solr.xml"]
    for config_item in config_items_to_symlink:
        config_item_src_path = os.path.join(solr_home_dir, config_item)
        if not os.path.exists(config_item_src_path):
            raise Exception("Expected configuration item '%s' does not exist" % config_item_src_path)

        # Recreate symlink just in case
        config_item_dst_path = os.path.join(instance_data_dir, config_item)
        if os.path.lexists(config_item_dst_path):
            if not os.path.islink(config_item_dst_path):
                raise Exception("Configuration item '%s' exists but is not a symlink." % config_item_dst_path)
            os.unlink(config_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." % (config_item_src_path, config_item_dst_path))
        relative_symlink(config_item_src_path, config_item_dst_path)

    jetty_home_path = __jetty_home_path(dist_directory=dist_directory, solr_version=solr_version)

    l.info("Symlinking libraries and JARs...")
    # Every item is listed once; a duplicate entry would only make the loop
    # remove and recreate the same symlink a second time
    library_items_to_symlink = ["lib", "solr-webapp", "start.jar", "solr"]
    for library_item in library_items_to_symlink:
        library_item_src_path = os.path.join(jetty_home_path, library_item)
        if not os.path.exists(library_item_src_path):
            raise Exception("Expected library item '%s' does not exist" % library_item_src_path)

        # Recreate symlink just in case
        library_item_dst_path = os.path.join(instance_data_dir, library_item)
        if os.path.lexists(library_item_dst_path):
            if not os.path.islink(library_item_dst_path):
                raise Exception("Library item '%s' exists but is not a symlink." % library_item_dst_path)
            os.unlink(library_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." % (library_item_src_path, library_item_dst_path))
        relative_symlink(library_item_src_path, library_item_dst_path)

    log4j_properties_path = os.path.join(solr_home_dir, "resources", "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # The path has to be interpolated, otherwise the message contains a literal '%s'
        raise Exception("log4j.properties at '%s' was not found." % log4j_properties_path)

    start_jar_path = os.path.join(jetty_home_path, "start.jar")
    if not os.path.isfile(start_jar_path):
        raise Exception("start.jar at '%s' was not found." % start_jar_path)

    solr_webapp_path = os.path.abspath(os.path.join(jetty_home_path, "solr-webapp"))
    if not os.path.isdir(solr_webapp_path):
        raise Exception("Solr webapp dir at '%s' was not found." % solr_webapp_path)

    if not hostname_resolves(hostname):
        raise Exception("Hostname '%s' does not resolve." % hostname)

    if tcp_port_is_open(port=port):
        raise Exception("Port %d is already open on this machine." % port)

    __raise_if_old_shards_exist()

    args = ["java"]
    l.info("Starting Solr instance on %s, port %d..." % (hostname, port))

    if jvm_heap_size is not None:
        args += ["-Xmx%s" % jvm_heap_size]
    args += jvm_opts
    # noinspection SpellCheckingInspection
    args += [
        "-server",
        "-Djava.util.logging.config.file=file://" + os.path.abspath(log4j_properties_path),
        "-Djetty.base=%s" % instance_data_dir,
        "-Djetty.home=%s" % instance_data_dir,
        "-Djetty.port=%d" % port,
        "-Dsolr.solr.home=%s" % instance_data_dir,
        "-Dsolr.data.dir=%s" % instance_data_dir,
        "-Dhost=%s" % hostname,
        "-Dmediacloud.luceneMatchVersion=%s" % MC_SOLR_LUCENEMATCHVERSION,
        # write heap dump to data directory on OOM errors
        "-XX:+HeapDumpOnOutOfMemoryError",
        "-XX:HeapDumpPath=%s" % instance_data_dir,
        # needed for resolving paths to JARs in solrconfig.xml
        "-Dmediacloud.solr_dist_dir=%s" % solr_path,
        "-Dmediacloud.solr_webapp_dir=%s" % solr_webapp_path,
    ]
    args += start_jar_args
    args += ["-jar", start_jar_path, "--module=http"]

    l.debug("Running command: %s" % " ".join(args))

    process = subprocess.Popen(args)

    # Remember the child's PID in a module-level variable (presumably read by
    # __kill_solr_process(), which is registered below)
    global __solr_pid
    __solr_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    signal.signal(signal.SIGTERM, __kill_solr_process)  # SIGTERM is handled differently for whatever reason
    atexit.register(__kill_solr_process)

    l.info("Solr PID: %d" % __solr_pid)

    l.info("Solr is starting on port %d, will be available shortly..." % port)
    # NOTE(review): the return value is ignored, so "Solr is running" gets
    # logged even if the port never opened -- confirm whether that's intended
    wait_for_tcp_port_to_open(port=port, retries=connect_timeout)

    l.info("Solr is running on port %d!" % port)

    # Keep the parent process alive for as long as the child is running
    while True:
        time.sleep(1)
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.

    The HTML is POSTed as JSON to the extractor API URL taken from the configuration; the request is attempted up to
    EXTRACT_RETRIES times. If the extractor's port doesn't open in time, fatal_error() is called instead of raising.

    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    :raises McExtractArticleFromPageException: if none of the extraction attempts succeeded.
    """
    content = decode_object_from_bytes_if_needed(content)

    if not config:
        config = CommonConfig()

    ua = UserAgent()
    api_url = config.extractor_api_url()

    # Wait up to a minute for extraction to finish
    ua.set_timeout(EXTRACT_TIMEOUT)

    # Wait for the extractor's HTTP port to become open as the service might be still starting up somewhere
    api_uri = furl(api_url)

    # Validate the raw values before converting them: str(None) is the truthy string "None" and int(None) raises
    # TypeError, so checking the converted values would never report a missing hostname / port
    assert api_uri.host, f"API URL hostname is not set for URL {api_url}"
    assert api_uri.port, f"API URL port is not set for URL {api_url}"
    api_url_hostname = str(api_uri.host)
    api_url_port = int(api_uri.port)

    if not wait_for_tcp_port_to_open(
            port=api_url_port,
            hostname=api_url_hostname,
            retries=EXTRACTOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever:
        #
        # 1) If the extractor service didn't come up in a given time, it won't
        #    suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job
        #    and should fail one way or another; exit(1) is just one of the
        #    ways how it can fail
        # 3) If it's some production code that needs something to get
        #    extracted, and if we were to throw an exception instead of doing
        #    exit(1), the caller might treat this exception as a failure to
        #    extract this one specific input HTML file, and so it might
        #    mis-extract a bunch of stories that way (making it hard for us to
        #    spot the problem and time-consuming to fix it later (e.g. there
        #    would be a need to manually re-extract a million of stories))
        #
        # A better solution instead of exit(1) might be to throw different
        # kinds of exceptions and handle them appropriately in the caller, but
        # with the Perl-Python codebase that's a bit hard to do.
        fatal_error(
            "Extractor service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=api_url,
                timeout=EXTRACTOR_SERVICE_TIMEOUT,
            )
        )

    request_json = encode_json({'html': content})

    http_request = Request(method='POST', url=api_url)
    http_request.set_content_type('application/json; charset=utf-8')
    http_request.set_content(request_json)

    # Try extracting multiple times
    #
    # UserAgent's set_timing() would only retry on retryable HTTP status codes and doesn't retry on connection errors by
    # default as such retries might have side effects, e.g. an API getting called multiple times. So, we retry
    # extracting the content a couple of times manually.
    http_response = None
    extraction_succeeded = False
    for retry in range(EXTRACT_RETRIES):

        if retry > 0:
            log.warning(f"Retrying #{retry + 1}...")

        http_response = ua.request(http_request)
        if http_response.is_success():
            extraction_succeeded = True
            break
        else:
            log.error(f"Extraction attempt {retry + 1} failed: {http_response.decoded_content()}")

    if not extraction_succeeded:
        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters; failed; last error: {http_response.decoded_content()}"
        )

    response = http_response.decoded_json()

    # Make sure that the response has both of the keys promised in the docstring
    assert 'extracted_html' in response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in response, "Response is expected to have 'extractor_version' key."

    return response