Ejemplo n.º 1
0
def test_tcp_port_is_open():
    """Check that tcp_port_is_open() follows the state of a local listening socket.

    The port obtained from random_unused_port() is expected to be reported as not open before anything
    listens on it, as open while a socket is listening on it, and as not open again after the socket
    has been closed.
    """
    random_port = random_unused_port()
    assert tcp_port_is_open(random_port) is False

    # Open port
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind(('localhost', random_port))
        s.listen()
        assert tcp_port_is_open(random_port) is True
    finally:
        # Close port even if bind() / listen() / the assertion above fails, so the socket is never leaked
        s.close()

    assert tcp_port_is_open(random_port) is False
Ejemplo n.º 2
0
def optimize_solr_index(host: str = "localhost", port: int = MC_SOLR_STANDALONE_PORT, collections: List[str] = None):
    """Optimize collection indexes.

    In SolrCloud cluster, optimization command run on one of the shards will trigger optimization on all of them.

    Arguments:
    host - hostname of the Solr instance to send optimize requests to
    port - port of the Solr instance to send optimize requests to
    collections - names of collections to optimize; if None, the keys returned by __collections() are used

    Raises:
    Exception - if tcp_port_is_open() reports host:port as not open, or if an optimize request fails with URLError
    """

    if collections is None:
        collections = __collections().keys()

    l.debug("Solr collections to reindex: %s" % ", ".join(collections))

    if not tcp_port_is_open(hostname=host, port=port):
        raise Exception("Solr is not running on %s:%d." % (host, port))

    l.info("Optimizing indexes on %s:%d..." % (host, port))

    for collection_name in sorted(collections):
        l.info("Optimizing collection's '%s' index on %s:%d..." % (collection_name, host, port))

        url = "http://%(host)s:%(port)d/solr/%(collection_name)s/update?optimize=true" % {
            "host": host,
            "port": port,
            "collection_name": collection_name,
        }
        l.debug("Requesting URL %s..." % url)

        try:
            # The response body is not needed; close the response right away instead of leaving the
            # connection open until it gets garbage collected
            with urlopen(url):
                pass
        except URLError as e:
            # Chain the original error so that its traceback is not lost
            raise Exception(
                "Unable to optimize collection '%s' index on %s:%d: %s" % (collection_name, host, port, e.reason)
            ) from e

    l.info("Optimized indexes on %s:%d." % (host, port))
Ejemplo n.º 3
0
def reload_solr_shard(shard_num: int, host: str = "localhost", starting_port: int = MC_SOLR_CLUSTER_STARTING_PORT):
    """Reload Solr shard after ZooKeeper configuration change.

    Sends a core admin "RELOAD" request for every collection returned by __collections() to the shard.

    Arguments:
    shard_num - number of the shard to reload (1 or greater)
    host - hostname that the shard is running on
    starting_port - passed to __shard_port() together with shard_num to work out the shard's port

    Raises:
    Exception - if shard_num is less than 1, if tcp_port_is_open() reports the shard's port as not open, or if
                a reload request fails with URLError
    """
    if shard_num < 1:
        raise Exception("Shard number must be 1 or greater.")

    shard_port = __shard_port(shard_num=shard_num, starting_port=starting_port)

    if not tcp_port_is_open(hostname=host, port=shard_port):
        raise Exception("Shard %d is not running on %s:%d." % (shard_num, host, shard_port))

    l.info("Reloading shard %d on %s:%d..." % (shard_num, host, shard_port))

    collections = __collections()
    l.debug("Solr collections: %s" % collections)

    # Only collection names are needed here; the values of the mapping are not used for reloading
    for collection_name in sorted(collections):
        l.info("Reloading collection '%s' on shard %d on %s:%d..." % (collection_name, shard_num, host, shard_port))
        url = "http://%(host)s:%(port)d/solr/admin/cores?action=RELOAD&core=%(collection_name)s" % {
            "host": host,
            "port": shard_port,
            "collection_name": collection_name,
        }
        l.debug("Requesting URL %s..." % url)

        try:
            # The response body is not needed; close the response right away instead of leaving the
            # connection open until it gets garbage collected
            with urlopen(url):
                pass
        except URLError as e:
            # Chain the original error so that its traceback is not lost
            raise Exception(
                "Unable to reload shard %d on %s:%d: %s" % (shard_num, host, shard_port, e.reason)
            ) from e

    l.info("Reloaded shard %d on %s:%d." % (shard_num, host, shard_port))
Ejemplo n.º 4
0
def run_solr_standalone(
    hostname: str = fqdn(),
    port: int = MC_SOLR_STANDALONE_PORT,
    base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
    dist_directory: str = MC_DIST_DIR,
    solr_version: str = MC_SOLR_VERSION,
    jvm_heap_size: str = MC_SOLR_STANDALONE_JVM_HEAP_SIZE,
):
    """Run standalone instance of Solr, installing Solr first if it is not installed yet.

    Arguments:
    hostname - hostname passed on to __run_solr()
    port - port to run the instance on; must not be open already
    base_data_dir - base data directory; resolved with resolve_absolute_path_under_mc_root() and must exist
    dist_directory - distribution directory used for the installation check / installation / run
    solr_version - Solr version used for the installation check / installation / run
    jvm_heap_size - JVM heap size passed on to __run_solr()

    Raises:
    Exception - if tcp_port_is_open() reports the port as already open
    """
    # Both the installation helpers and __run_solr() take the same pair of arguments
    installation = {"dist_directory": dist_directory, "solr_version": solr_version}

    solr_installed = __solr_is_installed(**installation)
    if not solr_installed:
        l.info("Solr is not installed, installing...")
        __install_solr(**installation)

    data_root = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)
    instance_dir = __standalone_data_dir(base_data_dir=data_root)

    port_taken = tcp_port_is_open(port=port)
    if port_taken:
        raise Exception("Port %d is already open on this machine." % port)

    l.info("Starting standalone Solr instance on port %d..." % port)
    __run_solr(
        hostname=hostname,
        port=port,
        instance_data_dir=instance_dir,
        jvm_heap_size=jvm_heap_size,
        jvm_opts=MC_SOLR_STANDALONE_JVM_OPTS,
        connect_timeout=MC_SOLR_STANDALONE_CONNECT_RETRIES,
        **installation
    )
Ejemplo n.º 5
0
    def stop(self):
        """Stop the webserver.

        Logs a warning and returns without doing anything else if the server's port is not open or if no
        server process is recorded on this object. Otherwise sends SIGKILL to every PID that is still marked
        as active, joins and terminates the server process, and waits for the port to close.

        Raises:
        McHashServerException - if the port is still open after waiting for it to close
        """

        if not tcp_port_is_open(port=self.__port):
            log.warning("Port %d is not open." % self.__port)
            return

        if self.__http_server_thread is None:
            log.warning("HTTP server process is None.")
            return

        log.info('Stopping test web server %s:%d' % (self.__host, self.__port,))

        # HTTP server itself is running in a fork, and it creates forks for every request which, at the point of killing
        # the server, might be in various states. So, we just SIGKILL all those PIDs in the most gruesome way.
        #
        # The lock is held through a context manager so that it gets released even if something other than the
        # OSError handled below is raised while killing the PIDs.
        with self.__http_server_active_pids_lock:
            for pid, value in self.__http_server_active_pids.items():
                if value is True:
                    log.debug("Killing PID %d" % pid)
                    try:
                        os.kill(pid, signal.SIGKILL)
                        self.__http_server_active_pids[pid] = False
                    except OSError as ex:
                        log.error("Unable to kill PID %d: %s" % (pid, str(ex),))

        self.__http_server_thread.join()
        self.__http_server_thread.terminate()
        self.__http_server_thread = None

        if not wait_for_tcp_port_to_close(port=self.__port, retries=20, delay=0.1):
            raise McHashServerException("Port %d is still open." % self.__port)
Ejemplo n.º 6
0
    def start(self, delay: int = 0):
        """Start the webserver in a separate daemon process.

        When delay is 0, also waits for the server's port to open before returning; with any other delay the
        wait is skipped.

        Arguments:
        delay - number of seconds to delay before starting server (handed over to the server process)

        Raises:
        McHashServerException - if the port is already open before starting, or (only when delay is 0) if the
                                port did not open after starting
        """
        port = self.__port

        if tcp_port_is_open(port=port):
            raise McHashServerException("Port %d is already open." % port)

        log.info('Starting test web server %s:%d' % (self.__host, port,))
        log.debug('Pages: %s' % str(self.__pages))

        server_args = (
            self.__host,
            port,
            self.__pages,
            self.__http_server_active_pids,
            self.__http_server_active_pids_lock,
            delay,
        )

        # "threading.Thread()" doesn't work with Perl callers
        server_process = multiprocessing.Process(target=self.__start_http_server, args=server_args)
        self.__http_server_thread = server_process
        server_process.daemon = True
        server_process.start()

        if delay != 0:
            # Server was asked to start late, so don't wait for the port here
            return

        port_opened = wait_for_tcp_port_to_open(port=port, retries=20, delay=0.1)
        if not port_opened:
            raise McHashServerException("Port %d is not open." % port)
Ejemplo n.º 7
0
    def __init__(self, port: int, pages: dict):
        """HTTP server's constructor; sets the server up without starting it.

        Arguments:
        port - port to start server on (0 to use the first port at or above START_RANDOM_PORT that
               tcp_port_is_open() does not report as open)
        pages - dict describing pages to serve; its keys get decoded from bytes if needed

        """

        self.__http_server_thread = None
        self.__host = '127.0.0.1'

        if len(pages) == 0:
            log.warning("Pages dictionary is empty.")

        if port == 0:
            # Probe ports upwards from START_RANDOM_PORT until one of them is not open
            candidate_port = START_RANDOM_PORT
            while True:
                if not tcp_port_is_open(candidate_port):
                    break
                candidate_port += 1
            port = candidate_port

        self.__port = port

        # MC_REWRITE_TO_PYTHON: Decode page keys from bytes
        decoded_pages = {}
        for page_key, page_value in pages.items():
            decoded_pages[decode_object_from_bytes_if_needed(page_key)] = page_value

        self.__pages = decoded_pages

        self.__http_server_active_pids = multiprocessing.Manager().dict()
        self.__http_server_active_pids_lock = multiprocessing.Lock()
Ejemplo n.º 8
0
def __run_solr_zkcli(
    zkcli_args: List[str],
    zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
    zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
    dist_directory: str = MC_DIST_DIR,
    solr_version: str = MC_SOLR_VERSION,
) -> None:
    """Run Solr's zkcli.sh helper script.

    The script itself is not executed; instead, the "org.apache.solr.cloud.ZkCLI" Java class is run in the
    foreground with "java", pointed at the given ZooKeeper host and port.

    Arguments:
    zkcli_args - extra arguments appended after "-zkhost <host>:<port>"
    zookeeper_host - ZooKeeper hostname
    zookeeper_port - ZooKeeper port
    dist_directory - distribution directory used to locate Solr and Jetty home paths
    solr_version - Solr version used to locate Solr and Jetty home paths

    Raises:
    Exception - if log4j.properties can't be found in any of the expected paths, or if tcp_port_is_open()
                reports ZooKeeper's host:port as not open
    """
    solr_path = __solr_path(dist_directory=dist_directory, solr_version=solr_version)

    # Resolved once; used both to locate log4j.properties and to build the Java classpath
    jetty_home_path = __jetty_home_path(dist_directory=dist_directory, solr_version=solr_version)

    log4j_properties_expected_paths = [
        # Solr 4.6
        os.path.join(jetty_home_path, "cloud-scripts", "log4j.properties"),
        # Solr 4.10+
        os.path.join(jetty_home_path, "scripts", "cloud-scripts", "log4j.properties"),
    ]

    # First expected path that exists as a file, or None if none of them do
    log4j_properties_path = next(
        (expected_path for expected_path in log4j_properties_expected_paths if os.path.isfile(expected_path)),
        None,
    )

    if log4j_properties_path is None:
        raise Exception(
            "Unable to find log4j.properties file for zkcli.sh script in paths: %s"
            % str(log4j_properties_expected_paths)
        )

    if not tcp_port_is_open(hostname=zookeeper_host, port=zookeeper_port):
        raise Exception("ZooKeeper is not running at %s:%d." % (zookeeper_host, zookeeper_port))

    zkhost = "%s:%d" % (zookeeper_host, zookeeper_port)

    java_classpath_dirs = [
        # Solr 4
        os.path.join(solr_path, "dist", "*"),
        os.path.join(jetty_home_path, "solr-webapp", "webapp", "WEB-INF", "lib", "*"),
        os.path.join(jetty_home_path, "lib", "ext", "*"),
    ]

    args = [
        "java",
        "-classpath",
        ":".join(java_classpath_dirs),
        "-Dlog4j.configuration=file://" + os.path.abspath(log4j_properties_path),
        "org.apache.solr.cloud.ZkCLI",
        "-zkhost",
        zkhost,
    ] + zkcli_args

    run_command_in_foreground(args)
Ejemplo n.º 9
0
def _fetch_url(
        db: DatabaseHandler,
        url: str,
        network_down_host: str = DEFAULT_NETWORK_DOWN_HOST,
        network_down_port: int = DEFAULT_NETWORK_DOWN_PORT,
        network_down_timeout: int = DEFAULT_NETWORK_DOWN_TIMEOUT,
        domain_timeout: typing.Optional[int] = None) -> FetchLinkResponse:
    """Fetch a url and return the content.

    URLs with a binary extension are not fetched; a dummy "bypassed" response is returned for them. URLs that
    are not HTTP URLs are not fetched either; a dummy 400 response is returned for them right away.

    If fetching the url results in a 400 error, check whether the network_down_host is accessible.  If so,
    return the errored response.  Otherwise, wait network_down_timeout seconds and try again.

    NOTE(review): an earlier version of this docstring said that McGetException is caught here; no exception
    handler is present in this function, so confirm where (if anywhere) that happens.

    Arguments:
    db - db handle
    url - url to fetch
    network_down_host - host to check if network is down on error
    network_down_port - port to check if network is down on error
    network_down_timeout - seconds to wait if the network is down
    domain_timeout - value to pass to ThrottledUserAgent()

    Returns:
    Response object
    """
    if mediawords.tm.stories.url_has_binary_extension(url):
        return _make_dummy_bypassed_response(url)

    if not mediawords.util.url.is_http_url(url):
        # A non-HTTP URL can never start succeeding, so return the dummy 400 response right away instead of
        # entering the retry loop below, where it would be retried for as long as the network-down host
        # stays unreachable.
        return FetchLinkResponse(
            url=url,
            is_success=False,
            code=HTTPStatus.BAD_REQUEST.value,
            message=HTTPStatus.BAD_REQUEST.phrase,
            content='bad url',
            last_requested_url=None,
        )

    while True:
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)
        ua_response = ua.get_follow_http_html_redirects(url)
        response = FetchLinkResponse.from_useragent_response(
            url, ua_response)

        if response.is_success:
            return response

        if response.code == HTTPStatus.BAD_REQUEST.value and not tcp_port_is_open(
                port=network_down_port, hostname=network_down_host):
            log.warning(
                "Response failed with %s and network is down.  Waiting to retry ..."
                % (url, ))
            time.sleep(network_down_timeout)
        else:
            return response
Ejemplo n.º 10
0
def _fetch_url(
        db: DatabaseHandler,
        url: str,
        network_down_host: str = DEFAULT_NETWORK_DOWN_HOST,
        network_down_port: int = DEFAULT_NETWORK_DOWN_PORT,
        network_down_timeout: int = DEFAULT_NETWORK_DOWN_TIMEOUT,
        domain_timeout: typing.Optional[int] = None) -> FetchLinkResponse:
    """Fetch a url and return the content.

    URLs with a binary extension are not fetched; a dummy "bypassed" response is returned for them. URLs that
    are not HTTP URLs are not fetched either; a dummy 400 response is returned for them right away.

    If fetching the url results in a 400 error, check whether the network_down_host is accessible.  If so,
    return the errored response.  Otherwise, wait network_down_timeout seconds and try again.

    NOTE(review): an earlier version of this docstring said that McGetException is caught here; no exception
    handler is present in this function, so confirm where (if anywhere) that happens.

    Arguments:
    db - db handle
    url - url to fetch
    network_down_host - host to check if network is down on error
    network_down_port - port to check if network is down on error
    network_down_timeout - seconds to wait if the network is down
    domain_timeout - value to pass to ThrottledUserAgent()

    Returns:
    Response object
    """
    if mediawords.tm.stories.url_has_binary_extension(url):
        return _make_dummy_bypassed_response(url)

    if not mediawords.util.url.is_http_url(url):
        # A non-HTTP URL can never start succeeding, so return the dummy 400 response right away instead of
        # entering the retry loop below, where it would be retried for as long as the network-down host
        # stays unreachable.
        return FetchLinkResponse(
            url=url,
            is_success=False,
            code=HTTPStatus.BAD_REQUEST.value,
            message=HTTPStatus.BAD_REQUEST.phrase,
            content='bad url',
            last_requested_url=None,
        )

    while True:
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)
        ua_response = ua.get_follow_http_html_redirects(url)
        response = FetchLinkResponse.from_useragent_response(url, ua_response)

        if response.is_success:
            return response

        if response.code == HTTPStatus.BAD_REQUEST.value and not tcp_port_is_open(port=network_down_port,
                                                                                  hostname=network_down_host):
            log.warning("Response failed with %s and network is down.  Waiting to retry ..." % (url,))
            time.sleep(network_down_timeout)
        else:
            return response
Ejemplo n.º 11
0
def update_zookeeper_solr_configuration(
        zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
        zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
        dist_directory: str = MC_DIST_DIR,
        solr_version: str = MC_SOLR_VERSION) -> None:
    """Update Solr's configuration on ZooKeeper.

    For every collection returned by __collections(), copies the collection's "conf" directory to a temporary
    directory, uploads it with zkcli's "upconfig" command and then links it to the collection with "linkconfig".

    Arguments:
    zookeeper_host - ZooKeeper hostname
    zookeeper_port - ZooKeeper port
    dist_directory - distribution directory used for the installation check / installation / zkcli
    solr_version - Solr version used for the installation check / installation / zkcli

    Raises:
    McSolrRunException - if tcp_port_is_open() reports ZooKeeper's host:port as not open
    """
    # Check / install the very same Solr distribution that zkcli will be run from below
    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        log.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    if not tcp_port_is_open(hostname=zookeeper_host, port=zookeeper_port):
        raise McSolrRunException("ZooKeeper is not running at %s:%d." %
                                 (zookeeper_host, zookeeper_port))

    collections = __collections()
    log.debug("Solr collections: %s" % collections)

    log.info("Uploading Solr collection configurations to ZooKeeper...")
    for collection_name, collection_path in sorted(collections.items()):
        collection_conf_path = os.path.join(collection_path, "conf")

        # Copy configuration because ZooKeeper's uploader doesn't like symlinks
        log.info(
            "Copying collection's '%s' configuration to a temporary directory..."
            % collection_name)
        temp_root_dir = tempfile.mkdtemp()
        try:
            collection_conf_temp_dir = os.path.join(temp_root_dir,
                                                    collection_name)
            shutil.copytree(collection_conf_path, collection_conf_temp_dir)

            log.info("Uploading collection's '%s' configuration at '%s'..." %
                     (collection_name, collection_conf_temp_dir))
            __run_solr_zkcli(zkcli_args=[
                "-cmd", "upconfig", "-confdir", collection_conf_temp_dir,
                "-confname", collection_name
            ],
                             zookeeper_host=zookeeper_host,
                             zookeeper_port=zookeeper_port,
                             dist_directory=dist_directory,
                             solr_version=solr_version)
        finally:
            # Temporary copy is only needed for the upload; remove it so that copies don't pile up
            shutil.rmtree(temp_root_dir, ignore_errors=True)

        log.info("Linking collection's '%s' configuration..." %
                 collection_name)
        __run_solr_zkcli(zkcli_args=[
            "-cmd", "linkconfig", "-collection", collection_name, "-confname",
            collection_name
        ],
                         zookeeper_host=zookeeper_host,
                         zookeeper_port=zookeeper_port,
                         dist_directory=dist_directory,
                         solr_version=solr_version)

    log.info("Uploaded Solr collection configurations to ZooKeeper.")
Ejemplo n.º 12
0
def test_random_port() -> None:
    """Test assigning a random port where port = 0.

    Starts three servers with port=0 and checks that each of them ends up on an open port at or above
    START_RANDOM_PORT and serves its page. All servers that got started are stopped at the end.
    """

    hss = []
    try:
        for _ in range(3):
            hs = HashServer(port=0, pages={'/foo': 'bar'})
            assert hs is not None

            hs.start()
            # Track the server right after starting it so that it gets stopped even if an assertion below fails
            hss.append(hs)

            assert hs.port() >= START_RANDOM_PORT
            assert tcp_port_is_open(hs.port())
            assert str(requests.get(hs.page_url('/foo')).text) == 'bar'
    finally:
        for hs in hss:
            hs.stop()
Ejemplo n.º 13
0
def update_zookeeper_solr_configuration(
    zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
    zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
    dist_directory: str = MC_DIST_DIR,
    solr_version: str = MC_SOLR_VERSION,
) -> None:
    """Update Solr's configuration on ZooKeeper.

    For every collection returned by __collections(), copies the collection's "conf" directory to a temporary
    directory, uploads it with zkcli's "upconfig" command and then links it to the collection with "linkconfig".

    Arguments:
    zookeeper_host - ZooKeeper hostname
    zookeeper_port - ZooKeeper port
    dist_directory - distribution directory used for the installation check / installation / zkcli
    solr_version - Solr version used for the installation check / installation / zkcli

    Raises:
    Exception - if tcp_port_is_open() reports ZooKeeper's host:port as not open
    """
    # Check / install the very same Solr distribution that zkcli will be run from below
    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        l.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    if not tcp_port_is_open(hostname=zookeeper_host, port=zookeeper_port):
        raise Exception("ZooKeeper is not running at %s:%d." % (zookeeper_host, zookeeper_port))

    collections = __collections()
    l.debug("Solr collections: %s" % collections)

    l.info("Uploading Solr collection configurations to ZooKeeper...")
    for collection_name, collection_path in sorted(collections.items()):
        collection_conf_path = os.path.join(collection_path, "conf")

        # Copy configuration because ZooKeeper's uploader doesn't like symlinks
        l.info("Copying collection's '%s' configuration to a temporary directory..." % collection_name)
        temp_root_dir = tempfile.mkdtemp()
        try:
            collection_conf_temp_dir = os.path.join(temp_root_dir, collection_name)
            shutil.copytree(collection_conf_path, collection_conf_temp_dir)

            l.info("Uploading collection's '%s' configuration at '%s'..." % (collection_name, collection_conf_temp_dir))
            __run_solr_zkcli(
                zkcli_args=["-cmd", "upconfig", "-confdir", collection_conf_temp_dir, "-confname", collection_name],
                zookeeper_host=zookeeper_host,
                zookeeper_port=zookeeper_port,
                dist_directory=dist_directory,
                solr_version=solr_version,
            )
        finally:
            # Temporary copy is only needed for the upload; remove it so that copies don't pile up
            shutil.rmtree(temp_root_dir, ignore_errors=True)

        l.info("Linking collection's '%s' configuration..." % collection_name)
        __run_solr_zkcli(
            zkcli_args=["-cmd", "linkconfig", "-collection", collection_name, "-confname", collection_name],
            zookeeper_host=zookeeper_host,
            zookeeper_port=zookeeper_port,
            dist_directory=dist_directory,
            solr_version=solr_version,
        )

    l.info("Uploaded Solr collection configurations to ZooKeeper.")
Ejemplo n.º 14
0
def upgrade_lucene_standalone_index(
    base_data_dir: str = MC_SOLR_BASE_DATA_DIR, dist_directory: str = MC_DIST_DIR, solr_version: str = MC_SOLR_VERSION
):
    """Upgrade the standalone instance's Lucene index with __upgrade_lucene_index().

    Arguments:
    base_data_dir - base data directory; resolved with resolve_absolute_path_under_mc_root() and must exist
    dist_directory - distribution directory passed on to __upgrade_lucene_index()
    solr_version - Solr version passed on to __upgrade_lucene_index()

    Raises:
    Exception - if tcp_port_is_open() reports MC_SOLR_STANDALONE_PORT as open
    """

    data_root = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    # Refuse to go on while something is listening on the standalone instance's port
    l.info("Making sure standalone instance isn't running...")
    standalone_port = MC_SOLR_STANDALONE_PORT
    instance_running = tcp_port_is_open(port=standalone_port)
    if instance_running:
        raise Exception("Solr standalone instance is running on port %d." % standalone_port)
    l.info("Made sure standalone instance isn't running.")

    l.info("Upgrading standalone instance indexes...")
    upgrade_kwargs = {
        "instance_data_dir": __standalone_data_dir(base_data_dir=data_root),
        "dist_directory": dist_directory,
        "solr_version": solr_version,
    }
    __upgrade_lucene_index(**upgrade_kwargs)
    l.info("Upgraded standalone instance indexes...")
Ejemplo n.º 15
0
def upgrade_lucene_standalone_index(base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                                    dist_directory: str = MC_DIST_DIR,
                                    solr_version: str = MC_SOLR_VERSION):
    """Upgrade the standalone instance's Lucene index with __upgrade_lucene_index().

    Arguments:
    base_data_dir - base data directory; resolved with resolve_absolute_path_under_mc_root() and must exist
    dist_directory - distribution directory passed on to __upgrade_lucene_index()
    solr_version - Solr version passed on to __upgrade_lucene_index()

    Raises:
    McSolrRunException - if tcp_port_is_open() reports MC_SOLR_STANDALONE_PORT as open
    """

    data_root = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    # Refuse to go on while something is listening on the standalone instance's port
    log.info("Making sure standalone instance isn't running...")
    standalone_port = MC_SOLR_STANDALONE_PORT
    instance_running = tcp_port_is_open(port=standalone_port)
    if instance_running:
        raise McSolrRunException("Solr standalone instance is running on port %d." % standalone_port)
    log.info("Made sure standalone instance isn't running.")

    log.info("Upgrading standalone instance indexes...")
    upgrade_kwargs = {
        "instance_data_dir": __standalone_data_dir(base_data_dir=data_root),
        "dist_directory": dist_directory,
        "solr_version": solr_version,
    }
    __upgrade_lucene_index(**upgrade_kwargs)
    log.info("Upgraded standalone instance indexes...")
Ejemplo n.º 16
0
def __run_solr_zkcli(zkcli_args: List[str],
                     zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
                     zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
                     dist_directory: str = MC_DIST_DIR,
                     solr_version: str = MC_SOLR_VERSION) -> None:
    """Run Solr's zkcli.sh helper script.

    The script itself is not executed; instead, the "org.apache.solr.cloud.ZkCLI" Java class is run in the
    foreground with "java", pointed at the given ZooKeeper host and port.

    Arguments:
    zkcli_args - extra arguments appended after "-zkhost <host>:<port>"
    zookeeper_host - ZooKeeper hostname
    zookeeper_port - ZooKeeper port
    dist_directory - distribution directory used to locate Solr and Jetty home paths
    solr_version - Solr version used to locate Solr and Jetty home paths

    Raises:
    McSolrRunException - if log4j.properties does not exist at the expected path, or if tcp_port_is_open()
                         reports ZooKeeper's host:port as not open
    """
    solr_path = __solr_path(dist_directory=dist_directory,
                            solr_version=solr_version)

    # Resolved once; used both to locate log4j.properties and to build the Java classpath
    jetty_home_path = __jetty_home_path(dist_directory=dist_directory,
                                        solr_version=solr_version)
    log4j_properties_path = os.path.join(jetty_home_path, "scripts",
                                         "cloud-scripts", "log4j.properties")

    if not os.path.isfile(log4j_properties_path):
        raise McSolrRunException(
            "Unable to find log4j.properties file for zkcli.sh script at path: %s"
            % log4j_properties_path)

    if not tcp_port_is_open(hostname=zookeeper_host, port=zookeeper_port):
        raise McSolrRunException("ZooKeeper is not running at %s:%d." %
                                 (zookeeper_host, zookeeper_port))

    zkhost = "%s:%d" % (zookeeper_host, zookeeper_port)

    java_classpath_dirs = [
        os.path.join(solr_path, "dist", "*"),
        os.path.join(jetty_home_path, "solr-webapp", "webapp", "WEB-INF",
                     "lib", "*"),
        os.path.join(jetty_home_path, "lib", "ext", "*"),
    ]

    args = [
        "java", "-classpath", ":".join(java_classpath_dirs),
        "-Dlog4j.configuration=file://" +
        os.path.abspath(log4j_properties_path), "org.apache.solr.cloud.ZkCLI",
        "-zkhost", zkhost
    ] + zkcli_args

    run_command_in_foreground(args)
Ejemplo n.º 17
0
def upgrade_lucene_shards_indexes(base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                                  dist_directory: str = MC_DIST_DIR,
                                  solr_version: str = MC_SOLR_VERSION):
    """Upgrade Lucene indexes of all shards with __upgrade_lucene_index().

    The number of shards is guessed by counting consecutive shard data directories (shard 1, 2, ...) that
    exist under the base data directory.

    Arguments:
    base_data_dir - base data directory; resolved with resolve_absolute_path_under_mc_root() and must exist
    dist_directory - distribution directory passed on to __upgrade_lucene_index()
    solr_version - Solr version passed on to __upgrade_lucene_index()

    Raises:
    McSolrRunException - if fewer than 2 shard data directories are found, or if tcp_port_is_open() reports
                         the port of any of the shards as open
    """

    data_root = resolve_absolute_path_under_mc_root(path=base_data_dir,
                                                    must_exist=True)

    # Try to guess shard count from how many shards are in data directory
    l.info("Looking for shards...")
    shard_count = 0
    while os.path.isdir(__shard_data_dir(shard_num=shard_count + 1,
                                         base_data_dir=data_root)):
        shard_count += 1

    if shard_count < 2:
        raise McSolrRunException("Found less than 2 shards.")
    l.info("Found %d shards." % shard_count)

    shard_numbers = range(1, shard_count + 1)

    # Check every shard's port first so that nothing gets upgraded if any of the shards is up
    l.info("Making sure shards aren't running...")
    for shard_num in shard_numbers:
        shard_port = __shard_port(shard_num=shard_num,
                                  starting_port=MC_SOLR_CLUSTER_STARTING_PORT)
        shard_running = tcp_port_is_open(port=shard_port)
        if shard_running:
            raise McSolrRunException("Solr shard %d is running on port %d." %
                                     (shard_num, shard_port))
    l.info("Made sure shards aren't running.")

    l.info("Upgrading shard indexes...")
    for shard_num in shard_numbers:
        __upgrade_lucene_index(
            instance_data_dir=__shard_data_dir(shard_num=shard_num,
                                               base_data_dir=data_root),
            dist_directory=dist_directory,
            solr_version=solr_version)
    l.info("Upgraded shard indexes.")
Ejemplo n.º 18
0
    def stop(self):
        """Stop the webserver.

        Logs a warning and returns without doing anything else if the server's port is not open or if no
        server process is recorded on this object. Otherwise sends SIGKILL to every PID that is still marked
        as active, joins and terminates the server process, and waits for the port to close.

        Raises:
        McHashServerException - if the port is still open after waiting for it to close
        """

        if not tcp_port_is_open(port=self.__port):
            log.warning("Port %d is not open." % self.__port)
            return

        if self.__http_server_thread is None:
            log.warning("HTTP server process is None.")
            return

        log.debug('Stopping test web server %s:%d' % (
            self.__host,
            self.__port,
        ))

        # HTTP server itself is running in a fork, and it creates forks for every request which, at the point of killing
        # the server, might be in various states. So, we just SIGKILL all those PIDs in the most gruesome way.
        #
        # The lock is held through a context manager so that it gets released even if something other than the
        # OSError handled below is raised while killing the PIDs.
        with self.__http_server_active_pids_lock:
            for pid, value in self.__http_server_active_pids.items():
                if value is True:
                    log.debug("Killing PID %d" % pid)
                    try:
                        os.kill(pid, signal.SIGKILL)
                        self.__http_server_active_pids[pid] = False
                    except OSError as ex:
                        log.error("Unable to kill PID %d: %s" % (
                            pid,
                            str(ex),
                        ))

        self.__http_server_thread.join()
        self.__http_server_thread.terminate()
        self.__http_server_thread = None

        if not wait_for_tcp_port_to_close(
                port=self.__port, retries=20, delay=0.1):
            raise McHashServerException("Port %d is still open." % self.__port)
Ejemplo n.º 19
0
def optimize_solr_index(host: str = "localhost",
                        port: int = MC_SOLR_STANDALONE_PORT,
                        collections: List[str] = None):
    """Optimize collection indexes.

    In SolrCloud cluster, optimization command run on one of the shards will trigger optimization on all of them.

    Arguments:
    host - hostname of the Solr instance to send optimize requests to
    port - port of the Solr instance to send optimize requests to
    collections - names of collections to optimize; if None, the keys returned by __collections() are used

    Raises:
    McSolrRunException - if tcp_port_is_open() reports host:port as not open, or if an optimize request fails
                         with URLError
    """

    if collections is None:
        collections = __collections().keys()

    l.debug("Solr collections to reindex: %s" % ', '.join(collections))

    if not tcp_port_is_open(hostname=host, port=port):
        raise McSolrRunException("Solr is not running on %s:%d." %
                                 (host, port))

    l.info("Optimizing indexes on %s:%d..." % (host, port))

    for collection_name in sorted(collections):
        l.info("Optimizing collection's '%s' index on %s:%d..." %
               (collection_name, host, port))

        url = "http://%(host)s:%(port)d/solr/%(collection_name)s/update?optimize=true" % {
            "host": host,
            "port": port,
            "collection_name": collection_name,
        }
        l.debug("Requesting URL %s..." % url)

        try:
            # The response body is not needed; close the response right away instead of leaving the
            # connection open until it gets garbage collected
            with urlopen(url):
                pass
        except URLError as e:
            # Chain the original error so that its traceback is not lost
            raise McSolrRunException(
                "Unable to optimize collection '%s' index on %s:%d: %s" %
                (collection_name, host, port, e.reason)) from e

    l.info("Optimized indexes on %s:%d." % (host, port))
Ejemplo n.º 20
0
def upgrade_lucene_shards_indexes(
    base_data_dir: str = MC_SOLR_BASE_DATA_DIR, dist_directory: str = MC_DIST_DIR, solr_version: str = MC_SOLR_VERSION
):
    """Upgrade Lucene indexes of all shards with __upgrade_lucene_index().

    The number of shards is guessed by counting consecutive shard data directories (shard 1, 2, ...) that
    exist under the base data directory.

    Arguments:
    base_data_dir - base data directory; resolved with resolve_absolute_path_under_mc_root() and must exist
    dist_directory - distribution directory passed on to __upgrade_lucene_index()
    solr_version - Solr version passed on to __upgrade_lucene_index()

    Raises:
    Exception - if fewer than 2 shard data directories are found, or if tcp_port_is_open() reports the port
                of any of the shards as open
    """

    data_root = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    # Try to guess shard count from how many shards are in data directory
    l.info("Looking for shards...")
    shard_count = 0
    while os.path.isdir(__shard_data_dir(shard_num=shard_count + 1, base_data_dir=data_root)):
        shard_count += 1

    if shard_count < 2:
        raise Exception("Found less than 2 shards.")
    l.info("Found %d shards." % shard_count)

    shard_numbers = range(1, shard_count + 1)

    # Check every shard's port first so that nothing gets upgraded if any of the shards is up
    l.info("Making sure shards aren't running...")
    for shard_num in shard_numbers:
        shard_port = __shard_port(shard_num=shard_num, starting_port=MC_SOLR_CLUSTER_STARTING_PORT)
        shard_running = tcp_port_is_open(port=shard_port)
        if shard_running:
            raise Exception("Solr shard %d is running on port %d." % (shard_num, shard_port))
    l.info("Made sure shards aren't running.")

    l.info("Upgrading shard indexes...")
    for shard_num in shard_numbers:
        __upgrade_lucene_index(
            instance_data_dir=__shard_data_dir(shard_num=shard_num, base_data_dir=data_root),
            dist_directory=dist_directory,
            solr_version=solr_version,
        )
    l.info("Upgraded shard indexes.")
Ejemplo n.º 21
0
def __run_solr(port: int,
               instance_data_dir: str,
               hostname: str = fqdn(),
               jvm_heap_size: str = None,
               start_jar_args: List[str] = None,
               jvm_opts: List[str] = None,
               connect_timeout: int = 120,
               dist_directory: str = MC_DIST_DIR,
               solr_version: str = MC_SOLR_VERSION) -> None:
    """Prepare an instance data directory and run a Solr instance from it.

    Steps, in order: install Solr if it is not installed; create the instance
    data directory if missing; copy every collection's "conf" directory and
    write its core.properties; symlink shared configuration and Jetty library
    items into the instance directory; validate required files, hostname and
    port; start the JVM; install handlers that kill the child on SIGTERM and
    at exit; wait for the port; then sleep in an endless loop.

    :param port: port passed to Jetty ("-Djetty.port"); must not be open already
    :param instance_data_dir: directory used as Jetty base/home, Solr home and Solr data dir
    :param hostname: value passed as "-Dhost"; must resolve
    :param jvm_heap_size: if not None, passed to the JVM as "-Xmx<value>"
    :param start_jar_args: extra arguments placed right before "-jar start.jar"
    :param jvm_opts: JVM options; defaults to MC_SOLR_STANDALONE_JVM_OPTS
    :param connect_timeout: passed as "retries" to wait_for_tcp_port_to_open()
    :param dist_directory: forwarded to __solr_path() and __jetty_home_path()
    :param solr_version: forwarded to __solr_path() and __jetty_home_path()
    :raises McSolrRunException: if a required directory / file is missing, an
        item that should be a symlink is not one, the hostname does not
        resolve, or the port is already open
    """
    if jvm_opts is None:
        jvm_opts = MC_SOLR_STANDALONE_JVM_OPTS

    if start_jar_args is None:
        start_jar_args = []

    if not __solr_is_installed():
        l.info("Solr is not installed, installing...")
        __install_solr()

    solr_home_dir = __solr_home_path(solr_home_dir=MC_SOLR_HOME_DIR)
    if not os.path.isdir(solr_home_dir):
        raise McSolrRunException("Solr home directory '%s' does not exist." %
                                 solr_home_dir)

    solr_path = __solr_path(dist_directory=dist_directory,
                            solr_version=solr_version)

    if not os.path.isdir(instance_data_dir):
        l.info("Creating data directory at %s..." % instance_data_dir)
        mkdir_p(instance_data_dir)

    l.info("Updating collections at %s..." % instance_data_dir)
    collections = __collections(solr_home_dir=solr_home_dir)
    for collection_name, collection_path in sorted(collections.items()):
        l.info("Updating collection '%s'..." % collection_name)

        collection_conf_src_dir = os.path.join(collection_path, "conf")
        if not os.path.isdir(collection_conf_src_dir):
            raise McSolrRunException(
                "Configuration for collection '%s' at %s does not exist" %
                (collection_name, collection_conf_src_dir))

        collection_dst_dir = os.path.join(instance_data_dir, collection_name)
        mkdir_p(collection_dst_dir)

        # Remove and copy configuration in case it has changed
        # (don't symlink because Solr 5.5+ doesn't like those)
        collection_conf_dst_dir = os.path.join(collection_dst_dir, "conf")
        if os.path.lexists(collection_conf_dst_dir):
            l.debug("Removing old collection configuration in '%s'..." %
                    collection_conf_dst_dir)
            if os.path.islink(collection_conf_dst_dir):
                # Might still be a link from older Solr versions
                os.unlink(collection_conf_dst_dir)
            else:
                shutil.rmtree(collection_conf_dst_dir)

        l.info("Copying '%s' to '%s'..." %
               (collection_conf_src_dir, collection_conf_dst_dir))
        shutil.copytree(collection_conf_src_dir,
                        collection_conf_dst_dir,
                        symlinks=False)

        l.info("Updating core.properties for collection '%s'..." %
               collection_name)
        core_properties_path = os.path.join(collection_dst_dir,
                                            "core.properties")
        with open(core_properties_path, 'w') as core_properties_file:
            core_properties_file.write(
                """
#
# This file is autogenerated. Don't bother editing it!
#

name=%(collection_name)s
instanceDir=%(instance_dir)s
""" % {
                    "collection_name": collection_name,
                    "instance_dir": collection_dst_dir,
                })

    l.info("Symlinking shard configuration...")
    config_items_to_symlink = [
        "contexts",
        "etc",
        "modules",
        "resources",
        "solr.xml",
    ]
    for config_item in config_items_to_symlink:
        config_item_src_path = os.path.join(solr_home_dir, config_item)
        if not os.path.exists(config_item_src_path):
            raise McSolrRunException(
                "Expected configuration item '%s' does not exist" %
                config_item_src_path)

        # Recreate symlink just in case
        config_item_dst_path = os.path.join(instance_data_dir, config_item)
        if os.path.lexists(config_item_dst_path):
            # Only ever delete symlinks; a real file / directory here is unexpected
            if not os.path.islink(config_item_dst_path):
                raise McSolrRunException(
                    "Configuration item '%s' exists but is not a symlink." %
                    config_item_dst_path)
            os.unlink(config_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." %
               (config_item_src_path, config_item_dst_path))
        relative_symlink(config_item_src_path, config_item_dst_path)

    jetty_home_path = __jetty_home_path(dist_directory=dist_directory,
                                        solr_version=solr_version)

    l.info("Symlinking libraries and JARs...")
    # "solr-webapp" used to be listed twice, which only made the loop unlink
    # and recreate the very same symlink a second time
    library_items_to_symlink = [
        "lib",
        "solr-webapp",
        "start.jar",
        "solr",
    ]
    for library_item in library_items_to_symlink:
        library_item_src_path = os.path.join(jetty_home_path, library_item)
        if not os.path.exists(library_item_src_path):
            raise McSolrRunException(
                "Expected library item '%s' does not exist" %
                library_item_src_path)

        # Recreate symlink just in case
        library_item_dst_path = os.path.join(instance_data_dir, library_item)
        if os.path.lexists(library_item_dst_path):
            if not os.path.islink(library_item_dst_path):
                raise McSolrRunException(
                    "Library item '%s' exists but is not a symlink." %
                    library_item_dst_path)
            os.unlink(library_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." %
               (library_item_src_path, library_item_dst_path))
        relative_symlink(library_item_src_path, library_item_dst_path)

    log4j_properties_path = os.path.join(solr_home_dir, "resources",
                                         "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # The path has to be interpolated, otherwise the message contains a literal '%s'
        raise McSolrRunException("log4j.properties at '%s' was not found." %
                                 log4j_properties_path)

    start_jar_path = os.path.join(jetty_home_path, "start.jar")
    if not os.path.isfile(start_jar_path):
        raise McSolrRunException("start.jar at '%s' was not found." %
                                 start_jar_path)

    solr_webapp_path = os.path.abspath(
        os.path.join(jetty_home_path, "solr-webapp"))
    if not os.path.isdir(solr_webapp_path):
        raise McSolrRunException("Solr webapp dir at '%s' was not found." %
                                 solr_webapp_path)

    if not hostname_resolves(hostname):
        raise McSolrRunException("Hostname '%s' does not resolve." % hostname)

    if tcp_port_is_open(port=port):
        raise McSolrRunException("Port %d is already open on this machine." %
                                 port)

    __raise_if_old_shards_exist()

    args = ["java"]
    l.info("Starting Solr instance on %s, port %d..." % (hostname, port))

    if jvm_heap_size is not None:
        args += ["-Xmx%s" % jvm_heap_size]
    args += jvm_opts
    # noinspection SpellCheckingInspection
    args += [
        "-server",
        "-Djava.util.logging.config.file=file://" +
        os.path.abspath(log4j_properties_path),
        "-Djetty.base=%s" % instance_data_dir,
        "-Djetty.home=%s" % instance_data_dir,
        "-Djetty.port=%d" % port,
        "-Dsolr.solr.home=%s" % instance_data_dir,
        "-Dsolr.data.dir=%s" % instance_data_dir,
        "-Dhost=%s" % hostname,
        "-Dmediacloud.luceneMatchVersion=%s" % MC_SOLR_LUCENEMATCHVERSION,

        # write heap dump to data directory on OOM errors
        "-XX:+HeapDumpOnOutOfMemoryError",
        "-XX:HeapDumpPath=%s" % instance_data_dir,

        # needed for resolving paths to JARs in solrconfig.xml
        "-Dmediacloud.solr_dist_dir=%s" % solr_path,
        "-Dmediacloud.solr_webapp_dir=%s" % solr_webapp_path,
    ]
    args += start_jar_args
    args += [
        "-jar",
        start_jar_path,
        "--module=http",
    ]

    l.debug("Running command: %s" % ' '.join(args))

    process = subprocess.Popen(args)
    global __solr_pid
    __solr_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    # SIGTERM is handled differently for whatever reason
    signal.signal(signal.SIGTERM, __kill_solr_process)
    atexit.register(__kill_solr_process)

    l.info("Solr PID: %d" % __solr_pid)

    l.info("Solr is starting on port %d, will be available shortly..." % port)
    # NOTE(review): the result of the wait is not checked, so the message
    # below is logged even if the port never opened -- confirm this is intended
    wait_for_tcp_port_to_open(port=port, retries=connect_timeout)

    l.info("Solr is running on port %d!" % port)
    # Stay in the foreground for as long as the child process is supposed to run
    while True:
        time.sleep(1)
Ejemplo n.º 22
0
def test_random_unused_port():
    """A port reported as unused must not be open for TCP connections."""
    candidate_port = random_unused_port()
    port_is_open = tcp_port_is_open(candidate_port)
    assert port_is_open is False
Ejemplo n.º 23
0
def test_http_hash_server_multiple_clients():
    """Test running hash server with multiple clients.

    Five requests are fired concurrently at one HashServer: three static
    pages, one unregistered path and one callback that responds only after
    the client timeout. The slow request must time out without preventing
    the remaining requests from being answered.
    """

    port = random_unused_port()

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_timeout(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "\r\n"
        r += "And now we wait"
        # Sleep well past the 2 second client timeout used below
        time.sleep(10)
        return str.encode(r)

    # Expected bodies, defined once so that the served pages and the
    # assertions cannot drift apart
    page_a_text = '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.'
    page_b_text = '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.'
    page_c_text = '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.'

    # '/does-not-exist' is intentionally not registered; it is requested
    # below and expected to yield a 404
    pages = {
        '/a': page_a_text,
        '/timeout': {
            'callback': __callback_timeout
        },
        '/b': page_b_text,
        '/c': page_c_text,
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    # Stop the server even if an assertion or a request below fails, so a
    # failing test does not leave the server running
    try:
        assert tcp_port_is_open(port=port)

        base_url = 'http://localhost:%d' % port

        session = FuturesSession(max_workers=10)

        # Issue all requests before collecting any result
        future_a = session.get('%s/a' % base_url, timeout=2)
        future_timeout = session.get('%s/timeout' % base_url, timeout=2)
        future_404 = session.get('%s/does-not-exist' % base_url, timeout=2)
        future_b = session.get('%s/b' % base_url, timeout=2)
        future_c = session.get('%s/c' % base_url, timeout=2)

        response_a = future_a.result()

        with pytest.raises(requests.Timeout):
            future_timeout.result()

        response_404 = future_404.result()
        response_b = future_b.result()
        response_c = future_c.result()

        assert response_b.status_code == 200
        assert response_b.text == page_b_text

        assert response_c.status_code == 200
        assert response_c.text == page_c_text

        assert response_404.status_code == 404

        assert response_a.status_code == 200
        assert response_a.text == page_a_text
    finally:
        hs.stop()
Ejemplo n.º 24
0
def run_zookeeper(dist_directory: str = MC_DIST_DIR,
                  listen: str = MC_ZOOKEEPER_LISTEN,
                  port: int = MC_ZOOKEEPER_PORT,
                  data_dir: str = MC_SOLR_BASE_DATA_DIR,
                  zookeeper_version: str = MC_ZOOKEEPER_VERSION,
                  solr_version: str = MC_SOLR_VERSION) -> None:
    """Run ZooKeeper, install if needed too.

    Writes a fresh zoo.cfg into the ZooKeeper data directory, starts
    zkServer.sh in the foreground, installs handlers that kill the child on
    SIGTERM and at exit, waits for the port to open, uploads the Solr
    collection configurations and then sleeps in an endless loop.

    :param dist_directory: forwarded to __zookeeper_path() and update_zookeeper_solr_configuration()
    :param listen: written to zoo.cfg as "clientPortAddress"
    :param port: written to zoo.cfg as "clientPort"; must not be open already
    :param data_dir: base data directory (must exist); ZooKeeper uses its
        "mediacloud-cluster-zookeeper" subdirectory, created if missing
    :param zookeeper_version: forwarded to __zookeeper_path()
    :param solr_version: forwarded to update_zookeeper_solr_configuration()
    :raises McZooKeeperRunException: if the port is already open, zkServer.sh
        or log4j.properties is missing, or the port did not open in time
    """
    if not __zookeeper_is_installed():
        log.info("ZooKeeper is not installed, installing...")
        __install_zookeeper()

    data_dir = resolve_absolute_path_under_mc_root(path=data_dir,
                                                   must_exist=True)

    zookeeper_data_dir = os.path.join(data_dir, "mediacloud-cluster-zookeeper")
    if not os.path.isdir(zookeeper_data_dir):
        log.info("Creating data directory at %s..." % zookeeper_data_dir)
        mkdir_p(zookeeper_data_dir)

    if tcp_port_is_open(port=port):
        raise McZooKeeperRunException(
            "Port %d is already open on this machine." % port)

    zookeeper_path = __zookeeper_path(dist_directory=dist_directory,
                                      zookeeper_version=zookeeper_version)

    zkserver_path = os.path.join(zookeeper_path, "bin", "zkServer.sh")
    if not os.path.isfile(zkserver_path):
        raise McZooKeeperRunException("zkServer.sh at '%s' was not found." %
                                      zkserver_path)

    log4j_properties_path = os.path.join(zookeeper_path, "conf",
                                         "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # The path has to be interpolated, otherwise the message contains a literal '%s'
        raise McZooKeeperRunException(
            "log4j.properties at '%s' was not found." % log4j_properties_path)

    zoo_cnf_path = os.path.join(zookeeper_data_dir, "zoo.cfg")
    log.info("Creating zoo.cfg in '%s'..." % zoo_cnf_path)

    # The configuration file is rewritten on every run
    with open(zoo_cnf_path, 'w') as zoo_cnf:
        zoo_cnf.write("""
#
# This file is autogenerated. Please do not modify it!
#

clientPortAddress=%(listen)s
clientPort=%(port)d
dataDir=%(data_dir)s

# Must be between zkClientTimeout / 2 and zkClientTimeout / 20
tickTime=30000

initLimit=10
syncLimit=10
            """ % {
            "listen": listen,
            "port": port,
            "data_dir": zookeeper_data_dir,
        })

    zookeeper_env = os.environ.copy()
    # Data directory serves as configuration dir too
    zookeeper_env["ZOOCFGDIR"] = zookeeper_data_dir
    zookeeper_env["ZOOCFG"] = "zoo.cfg"
    zookeeper_env["ZOO_LOG_DIR"] = zookeeper_data_dir
    zookeeper_env["SERVER_JVMFLAGS"] = (
        "-Dlog4j.configuration=file://" +
        os.path.abspath(log4j_properties_path))

    args = [zkserver_path, "start-foreground"]

    log.info("Starting ZooKeeper on %s:%d..." % (listen, port))
    log.debug("Running command: %s" % str(args))
    log.debug("Environment variables: %s" % str(zookeeper_env))

    process = subprocess.Popen(args, env=zookeeper_env)
    global __zookeeper_pid
    __zookeeper_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    # SIGTERM is handled differently for whatever reason
    signal.signal(signal.SIGTERM, __kill_zookeeper_process)
    atexit.register(__kill_zookeeper_process)

    log.info("ZooKeeper PID: %d" % __zookeeper_pid)

    log.info("Waiting for ZooKeeper to start at port %d..." % port)
    zookeeper_started = wait_for_tcp_port_to_open(
        port=port, retries=MC_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_started:
        raise McZooKeeperRunException(
            "Unable to connect to ZooKeeper at port %d" % port)

    log.info(
        "Uploading initial Solr collection configurations to ZooKeeper...")
    update_zookeeper_solr_configuration(zookeeper_host="localhost",
                                        zookeeper_port=port,
                                        dist_directory=dist_directory,
                                        solr_version=solr_version)

    log.info("ZooKeeper is ready on port %d!" % port)
    # Stay in the foreground for as long as the child process is supposed to run
    while True:
        time.sleep(1)
Ejemplo n.º 25
0
def test_random_unused_port():
    """A port reported as unused must not be open for TCP connections."""
    candidate_port = random_unused_port()
    port_is_open = tcp_port_is_open(candidate_port)
    assert port_is_open is False
Ejemplo n.º 26
0
def __run_solr(
    port: int,
    instance_data_dir: str,
    hostname: str = fqdn(),
    jvm_heap_size: str = None,
    start_jar_args: List[str] = None,
    jvm_opts: List[str] = None,
    connect_timeout: int = 120,
    dist_directory: str = MC_DIST_DIR,
    solr_version: str = MC_SOLR_VERSION,
) -> None:
    """Prepare an instance data directory and run a Solr instance from it.

    Steps, in order: install Solr if it is not installed; create the instance
    data directory if missing; copy every collection's "conf" directory and
    write its core.properties; symlink shared configuration and Jetty library
    items into the instance directory; validate required files, hostname and
    port; start the JVM; install handlers that kill the child on SIGTERM and
    at exit; wait for the port; then sleep in an endless loop.

    :param port: port passed to Jetty ("-Djetty.port"); must not be open already
    :param instance_data_dir: directory used as Jetty base/home, Solr home and Solr data dir
    :param hostname: value passed as "-Dhost"; must resolve
    :param jvm_heap_size: if not None, passed to the JVM as "-Xmx<value>"
    :param start_jar_args: extra arguments placed right before "-jar start.jar"
    :param jvm_opts: JVM options; defaults to MC_SOLR_STANDALONE_JVM_OPTS
    :param connect_timeout: passed as "retries" to wait_for_tcp_port_to_open()
    :param dist_directory: forwarded to __solr_path() and __jetty_home_path()
    :param solr_version: forwarded to __solr_path() and __jetty_home_path()
    :raises Exception: if a required directory / file is missing, an item that
        should be a symlink is not one, the hostname does not resolve, or the
        port is already open
    """
    if jvm_opts is None:
        jvm_opts = MC_SOLR_STANDALONE_JVM_OPTS

    if start_jar_args is None:
        start_jar_args = []

    if not __solr_is_installed():
        l.info("Solr is not installed, installing...")
        __install_solr()

    solr_home_dir = __solr_home_path(solr_home_dir=MC_SOLR_HOME_DIR)
    if not os.path.isdir(solr_home_dir):
        raise Exception("Solr home directory '%s' does not exist." % solr_home_dir)

    solr_path = __solr_path(dist_directory=dist_directory, solr_version=solr_version)

    if not os.path.isdir(instance_data_dir):
        l.info("Creating data directory at %s..." % instance_data_dir)
        mkdir_p(instance_data_dir)

    l.info("Updating collections at %s..." % instance_data_dir)
    collections = __collections(solr_home_dir=solr_home_dir)
    for collection_name, collection_path in sorted(collections.items()):
        l.info("Updating collection '%s'..." % collection_name)

        collection_conf_src_dir = os.path.join(collection_path, "conf")
        if not os.path.isdir(collection_conf_src_dir):
            raise Exception(
                "Configuration for collection '%s' at %s does not exist" % (collection_name, collection_conf_src_dir)
            )

        collection_dst_dir = os.path.join(instance_data_dir, collection_name)
        mkdir_p(collection_dst_dir)

        # Remove and copy configuration in case it has changed
        # (don't symlink because Solr 5.5+ doesn't like those)
        collection_conf_dst_dir = os.path.join(collection_dst_dir, "conf")
        if os.path.lexists(collection_conf_dst_dir):
            l.debug("Removing old collection configuration in '%s'..." % collection_conf_dst_dir)
            if os.path.islink(collection_conf_dst_dir):
                # Might still be a link from older Solr versions
                os.unlink(collection_conf_dst_dir)
            else:
                shutil.rmtree(collection_conf_dst_dir)

        l.info("Copying '%s' to '%s'..." % (collection_conf_src_dir, collection_conf_dst_dir))
        shutil.copytree(collection_conf_src_dir, collection_conf_dst_dir, symlinks=False)

        l.info("Updating core.properties for collection '%s'..." % collection_name)
        core_properties_path = os.path.join(collection_dst_dir, "core.properties")
        with open(core_properties_path, "w") as core_properties_file:
            core_properties_file.write(
                """
#
# This file is autogenerated. Don't bother editing it!
#

name=%(collection_name)s
instanceDir=%(instance_dir)s
"""
                % {"collection_name": collection_name, "instance_dir": collection_dst_dir}
            )

    l.info("Symlinking shard configuration...")
    config_items_to_symlink = ["contexts", "etc", "modules", "resources", "solr.xml"]
    for config_item in config_items_to_symlink:
        config_item_src_path = os.path.join(solr_home_dir, config_item)
        if not os.path.exists(config_item_src_path):
            raise Exception("Expected configuration item '%s' does not exist" % config_item_src_path)

        # Recreate symlink just in case
        config_item_dst_path = os.path.join(instance_data_dir, config_item)
        if os.path.lexists(config_item_dst_path):
            # Only ever delete symlinks; a real file / directory here is unexpected
            if not os.path.islink(config_item_dst_path):
                raise Exception("Configuration item '%s' exists but is not a symlink." % config_item_dst_path)
            os.unlink(config_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." % (config_item_src_path, config_item_dst_path))
        relative_symlink(config_item_src_path, config_item_dst_path)

    jetty_home_path = __jetty_home_path(dist_directory=dist_directory, solr_version=solr_version)

    l.info("Symlinking libraries and JARs...")
    # "solr-webapp" used to be listed twice, which only made the loop unlink
    # and recreate the very same symlink a second time
    library_items_to_symlink = ["lib", "solr-webapp", "start.jar", "solr"]
    for library_item in library_items_to_symlink:
        library_item_src_path = os.path.join(jetty_home_path, library_item)
        if not os.path.exists(library_item_src_path):
            raise Exception("Expected library item '%s' does not exist" % library_item_src_path)

        # Recreate symlink just in case
        library_item_dst_path = os.path.join(instance_data_dir, library_item)
        if os.path.lexists(library_item_dst_path):
            if not os.path.islink(library_item_dst_path):
                raise Exception("Library item '%s' exists but is not a symlink." % library_item_dst_path)
            os.unlink(library_item_dst_path)

        l.info("Symlinking '%s' to '%s'..." % (library_item_src_path, library_item_dst_path))
        relative_symlink(library_item_src_path, library_item_dst_path)

    log4j_properties_path = os.path.join(solr_home_dir, "resources", "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # The path has to be interpolated, otherwise the message contains a literal '%s'
        raise Exception("log4j.properties at '%s' was not found." % log4j_properties_path)

    start_jar_path = os.path.join(jetty_home_path, "start.jar")
    if not os.path.isfile(start_jar_path):
        raise Exception("start.jar at '%s' was not found." % start_jar_path)

    solr_webapp_path = os.path.abspath(os.path.join(jetty_home_path, "solr-webapp"))
    if not os.path.isdir(solr_webapp_path):
        raise Exception("Solr webapp dir at '%s' was not found." % solr_webapp_path)

    if not hostname_resolves(hostname):
        raise Exception("Hostname '%s' does not resolve." % hostname)

    if tcp_port_is_open(port=port):
        raise Exception("Port %d is already open on this machine." % port)

    __raise_if_old_shards_exist()

    args = ["java"]
    l.info("Starting Solr instance on %s, port %d..." % (hostname, port))

    if jvm_heap_size is not None:
        args += ["-Xmx%s" % jvm_heap_size]
    args += jvm_opts
    # noinspection SpellCheckingInspection
    args += [
        "-server",
        "-Djava.util.logging.config.file=file://" + os.path.abspath(log4j_properties_path),
        "-Djetty.base=%s" % instance_data_dir,
        "-Djetty.home=%s" % instance_data_dir,
        "-Djetty.port=%d" % port,
        "-Dsolr.solr.home=%s" % instance_data_dir,
        "-Dsolr.data.dir=%s" % instance_data_dir,
        "-Dhost=%s" % hostname,
        "-Dmediacloud.luceneMatchVersion=%s" % MC_SOLR_LUCENEMATCHVERSION,
        # write heap dump to data directory on OOM errors
        "-XX:+HeapDumpOnOutOfMemoryError",
        "-XX:HeapDumpPath=%s" % instance_data_dir,
        # needed for resolving paths to JARs in solrconfig.xml
        "-Dmediacloud.solr_dist_dir=%s" % solr_path,
        "-Dmediacloud.solr_webapp_dir=%s" % solr_webapp_path,
    ]
    args += start_jar_args
    args += ["-jar", start_jar_path, "--module=http"]

    l.debug("Running command: %s" % " ".join(args))

    process = subprocess.Popen(args)
    global __solr_pid
    __solr_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    signal.signal(signal.SIGTERM, __kill_solr_process)  # SIGTERM is handled differently for whatever reason
    atexit.register(__kill_solr_process)

    l.info("Solr PID: %d" % __solr_pid)

    l.info("Solr is starting on port %d, will be available shortly..." % port)
    # NOTE(review): the result of the wait is not checked, so the message
    # below is logged even if the port never opened -- confirm this is intended
    wait_for_tcp_port_to_open(port=port, retries=connect_timeout)

    l.info("Solr is running on port %d!" % port)
    # Stay in the foreground for as long as the child process is supposed to run
    while True:
        time.sleep(1)
Ejemplo n.º 27
0
def run_zookeeper(dist_directory: str = MC_DIST_DIR,
                  listen: str = MC_ZOOKEEPER_LISTEN,
                  port: int = MC_ZOOKEEPER_PORT,
                  data_dir: str = MC_SOLR_BASE_DATA_DIR,
                  zookeeper_version: str = MC_ZOOKEEPER_VERSION,
                  solr_version: str = MC_SOLR_VERSION) -> None:
    """Run ZooKeeper, install if needed too.

    Writes a fresh zoo.cfg into the ZooKeeper data directory, starts
    zkServer.sh in the foreground, installs handlers that kill the child on
    SIGTERM and at exit, waits for the port to open, uploads the Solr
    collection configurations and then sleeps in an endless loop.

    :param dist_directory: forwarded to __zookeeper_path() and update_zookeeper_solr_configuration()
    :param listen: written to zoo.cfg as "clientPortAddress"
    :param port: written to zoo.cfg as "clientPort"; must not be open already
    :param data_dir: base data directory (must exist); ZooKeeper uses its
        "mediacloud-cluster-zookeeper" subdirectory, created if missing
    :param zookeeper_version: forwarded to __zookeeper_path()
    :param solr_version: forwarded to update_zookeeper_solr_configuration()
    :raises McZooKeeperRunException: if the port is already open, zkServer.sh
        or log4j.properties is missing, or the port did not open in time
    """
    if not __zookeeper_is_installed():
        log.info("ZooKeeper is not installed, installing...")
        __install_zookeeper()

    data_dir = resolve_absolute_path_under_mc_root(path=data_dir, must_exist=True)

    zookeeper_data_dir = os.path.join(data_dir, "mediacloud-cluster-zookeeper")
    if not os.path.isdir(zookeeper_data_dir):
        log.info("Creating data directory at %s..." % zookeeper_data_dir)
        mkdir_p(zookeeper_data_dir)

    if tcp_port_is_open(port=port):
        raise McZooKeeperRunException("Port %d is already open on this machine." % port)

    zookeeper_path = __zookeeper_path(dist_directory=dist_directory, zookeeper_version=zookeeper_version)

    zkserver_path = os.path.join(zookeeper_path, "bin", "zkServer.sh")
    if not os.path.isfile(zkserver_path):
        raise McZooKeeperRunException("zkServer.sh at '%s' was not found." % zkserver_path)

    log4j_properties_path = os.path.join(zookeeper_path, "conf", "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # The path has to be interpolated, otherwise the message contains a literal '%s'
        raise McZooKeeperRunException("log4j.properties at '%s' was not found." % log4j_properties_path)

    zoo_cnf_path = os.path.join(zookeeper_data_dir, "zoo.cfg")
    log.info("Creating zoo.cfg in '%s'..." % zoo_cnf_path)

    # The configuration file is rewritten on every run
    with open(zoo_cnf_path, 'w') as zoo_cnf:
        zoo_cnf.write("""
#
# This file is autogenerated. Please do not modify it!
#

clientPortAddress=%(listen)s
clientPort=%(port)d
dataDir=%(data_dir)s

# Must be between zkClientTimeout / 2 and zkClientTimeout / 20
tickTime=30000

initLimit=10
syncLimit=10
            """ % {
            "listen": listen,
            "port": port,
            "data_dir": zookeeper_data_dir,
        })

    zookeeper_env = os.environ.copy()
    zookeeper_env["ZOOCFGDIR"] = zookeeper_data_dir  # Serves as configuration dir too
    zookeeper_env["ZOOCFG"] = "zoo.cfg"
    zookeeper_env["ZOO_LOG_DIR"] = zookeeper_data_dir
    zookeeper_env["SERVER_JVMFLAGS"] = "-Dlog4j.configuration=file://" + os.path.abspath(log4j_properties_path)

    args = [
        zkserver_path,
        "start-foreground"
    ]

    log.info("Starting ZooKeeper on %s:%d..." % (listen, port))
    log.debug("Running command: %s" % str(args))
    log.debug("Environment variables: %s" % str(zookeeper_env))

    process = subprocess.Popen(args, env=zookeeper_env)
    global __zookeeper_pid
    __zookeeper_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    signal.signal(signal.SIGTERM, __kill_zookeeper_process)  # SIGTERM is handled differently for whatever reason
    atexit.register(__kill_zookeeper_process)

    log.info("ZooKeeper PID: %d" % __zookeeper_pid)

    log.info("Waiting for ZooKeeper to start at port %d..." % port)
    zookeeper_started = wait_for_tcp_port_to_open(port=port, retries=MC_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_started:
        raise McZooKeeperRunException("Unable to connect to ZooKeeper at port %d" % port)

    log.info("Uploading initial Solr collection configurations to ZooKeeper...")
    update_zookeeper_solr_configuration(zookeeper_host="localhost",
                                        zookeeper_port=port,
                                        dist_directory=dist_directory,
                                        solr_version=solr_version)

    log.info("ZooKeeper is ready on port %d!" % port)
    # Stay in the foreground for as long as the child process is supposed to run
    while True:
        time.sleep(1)
Ejemplo n.º 28
0
def test_http_hash_server():
    """End-to-end test of HashServer over real HTTP requests.

    Starts a HashServer on a random unused port and verifies, via `requests`:
    static pages, redirects, path normalization, callbacks (GET with query
    parameters and cookies, a cookie-setting redirect, POST data), custom
    status codes, HTTP basic auth, and `page_url()`.

    The server is always stopped at the end, even when an assertion fails,
    so that a failing check does not leave the server running.
    """
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        """Return a raw HTTP response (bytes) echoing request details as JSON."""
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        """Return a raw 302 response (as str) that sets a cookie and redirects."""
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        """Return a raw HTTP response (bytes) echoing the request body as JSON."""
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

    # NOTE(review): paths, keys and values below mix str and bytes; presumably
    # this is deliberate, to check that HashServer accepts both -- confirm
    # before normalizing them.
    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {
            b'redirect': b'/bar'
        },
        '/localhost': {
            'redirect': "http://localhost:%d/" % port
        },
        b'/127-foo': {
            b'redirect': "http://127.0.0.1:%d/foo" % port
        },
        '/auth': {
            b'auth': b'foo:bar',
            b'content': b"foo bar \xf0\x90\x28\xbc"
        },
        '/404': {
            b'content': b'not found',
            b'http_status_code': 404
        },
        '/callback': {
            b'callback': __simple_callback
        },

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {
            'callback': __callback_cookie_redirect
        },

        # POST data
        '/callback_post': {
            'callback': __callback_post
        },
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    # Everything after start() runs under try/finally so that the server is
    # stopped even if one of the assertions or requests below fails.
    try:
        assert tcp_port_is_open(port=port)

        # Static pages and redirects
        assert str(requests.get('%s/' % base_url).text) == 'home'
        assert str(requests.get('%s/foo' % base_url).text) == 'foo'
        assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
        assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
        assert str(requests.get('%s/localhost' % base_url).text) == 'home'
        assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

        # Path normalization
        assert str(requests.get('%s//' % base_url).text) == 'home'
        assert str(requests.get('%s///' % base_url).text) == 'home'
        assert str(requests.get('%s/something/../' % base_url).text) == 'home'
        assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
        assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
        assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
        assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
        assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
        assert str(requests.get('%s/foo' % base_url).text) == 'foo'
        assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
        assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
        assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
        assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

        # Callback receives method, URL, query parameters and cookies
        response_json = requests.get('%s/callback?a=b&c=d' % base_url,
                                     cookies={
                                         'cookie_name': 'cookie_value'
                                     }).json()
        assert response_json == {
            'name': 'callback',
            'method': 'GET',
            'url': 'http://%s:%d/callback?a=b&c=d' % (_fqdn(), port),
            'content-type': None,
            'params': {
                'a': 'b',
                'c': 'd',
            },
            'cookies': {
                'cookie_name': 'cookie_value',
            },
        }

        # Don't follow the redirect so that the 302 itself can be inspected
        response = requests.get('%s/callback_cookie_redirect' % base_url,
                                allow_redirects=False)
        assert response.status_code == 302
        assert response.headers['Location'] == '/check_cookie'

        # Custom HTTP status code
        response = requests.get("%s/404" % base_url)
        assert response.status_code == HTTPStatus.NOT_FOUND.value
        assert 'Not Found' in response.reason

        # HTTP basic auth: no credentials, wrong credentials, right credentials
        auth_url = "%s/auth" % base_url

        assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
        assert requests.get(auth_url,
                            auth=('foo',
                                  'foo')).status_code == HTTPStatus.UNAUTHORIZED

        response = requests.get(auth_url, auth=('foo', 'bar'))
        assert response.status_code == HTTPStatus.OK
        assert response.content == b"foo bar \xf0\x90\x28\xbc"

        # page_url() for a configured page and for a nonexistent one
        assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'),
                              url2='http://%s:%d/callback' % (_fqdn(), port))
        with pytest.raises(McHashServerException):
            hs.page_url('/does-not-exist')

        # POST body is passed through to the callback
        response_json = requests.post('%s/callback_post' % base_url,
                                      data='abc=def').json()
        assert response_json == {
            'name': 'callback_post',
            'post_data': 'abc=def',
        }
    finally:
        hs.stop()