Example #1
    def isolate(self):
        # Generate a random string, which will be the "namespace" of the
        # minicluster. This namespace will be used to add random suffixes to
        # container names so that they do not collide on the same docker
        # daemon.
        # TODO: Use this namespace.
        letters = string.ascii_lowercase
        rand_str = ''.join(random.choice(letters) for i in range(3))
        self._namespace = rand_str + '-'
        self.cli = Client(base_url="unix://var/run/docker.sock",
                          namespace=self._namespace)

        # Now we need to randomize the ports.
        # TODO: Fix race condition between the find_free_port() and the process
        # that will actually bind to that port.
        self.config["local_master_port"] = utils.find_free_port()
        self.config["local_zk_port"] = utils.find_free_port()
        self.config["local_cassandra_cql_port"] = utils.find_free_port()
        self.config["local_cassandra_thrift_port"] = utils.find_free_port()

        self.mesos_agent_ports = utils.randomize_ports(self.mesos_agent_ports)

        self.resmgr_ports = utils.randomize_ports(self.resmgr_ports)
        self.hostmgr_ports = utils.randomize_ports(self.hostmgr_ports)
        self.jobmgr_ports = utils.randomize_ports(self.jobmgr_ports)
        self.aurorabridge_ports = utils.randomize_ports(
            self.aurorabridge_ports)
        self.apiserver_ports = utils.randomize_ports(self.apiserver_ports)
        self.archiver_ports = utils.randomize_ports(self.archiver_ports)
        self.placement_ports = utils.randomize_ports(self.placement_ports)
        self.mockcqos_ports = utils.randomize_ports(self.mockcqos_ports)

        # TODO: Save those to local disk, or print them to stdout.
        return self._namespace
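Most of the examples on this page call a zero-argument find_free_port() whose definition is not shown. The usual implementation is a short helper that binds to port 0 so the OS assigns an unused port, reads it back, and closes the socket. A minimal sketch under that assumption, which also makes the TODO above concrete: the port is only guaranteed free while the probe socket is open, hence the race between find_free_port() and the process that eventually binds it.

import socket

def find_free_port():
    """Ask the OS for an unused TCP port by binding to port 0."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        port = s.getsockname()[1]
    # The probe socket is closed here, so the port is only *probably*
    # still free -- this is the race condition the TODO above refers to.
    return port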
Example #2
    def __init__(self,
                 take_ownership=True, # Tor dies when the Crawler does
                 torrc_config={"CookieAuth": "1"},
                 tor_log="/var/log/tor/tor.log",
                 tor_cell_log="/var/log/tor/tor_cell_seq.log",
                 control_port=9051,
                 socks_port=9050, 
                 run_in_xvfb=True,
                 tbb_path=join("/opt","tbb","tor-browser_en-US"),
                 tb_log_path=join(_log_dir,"firefox.log"),
                 tb_tor_cfg=USE_RUNNING_TOR,
                 page_load_timeout=20,
                 wait_on_page=5,
                 wait_after_closing_circuits=0,
                 restart_on_sketchy_exception=True,
                 additional_control_fields={},
                 db_handler=None):

        self.logger = setup_logging(_log_dir, "crawler")

        self.torrc_config = torrc_config
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(config=self.torrc_config,
                                                  take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()

        self.logger.info("Opening cell log stream...")
        self.cell_log = open(tor_cell_log, "rb")

        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()

        self.logger.info("Starting Tor Browser...")
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)

        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception

        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)
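Note that this crawler calls find_free_port(socks_port, control_port) with two arguments, a preferred starting port and a port to avoid, unlike the zero-argument helper sketched under Example #1 (the one-argument call in Example #12 fits the same shape). A hedged sketch of such a scanning variant; the project's real helper may differ:

import socket

def find_free_port(start_port, avoid_port=None):
    """Scan upward from start_port for a bindable port, skipping avoid_port."""
    port = start_port
    while True:
        if port != avoid_port:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                try:
                    s.bind(("127.0.0.1", port))
                    return port
                except OSError:
                    pass  # port in use; keep scanning
        port += 1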
Example #3
def main():
    videoPath = str(sys.argv[1])
    print("Your ip is: {}".format(utils.get_ip()))

    commands = []
    Collector_Receiving_Ports = []
    Collector_Sending_Ports = []

    # Generate needed random free ports
    producerPort = str(utils.get_ip()) + ":" + str(utils.find_free_port())
    for i in range(math.ceil(utils.N / 2)):
        Collector_Receiving_Ports.append(
            str(utils.get_ip()) + ":" + str(utils.find_free_port()))
        Collector_Sending_Ports.append(
            str(utils.get_ip()) + ":" + str(utils.find_free_port()))

    # Send Collector Ports to second computer
    try:
        ipPortConnection = str(utils.SENDER) + ":" + utils.CONNECTION_PORT
        senderSocket, senderContext = utils.configure_port(
            ipPortConnection, zmq.PUSH, "bind")
        data = pickle.dumps(Collector_Sending_Ports)
        senderSocket.send(data)
        print("Ports data has been sent...")
    except Exception:
        print("Machine 1 (Sender) ERROR IN SENDING CONNECTION DATA, " +
              "Try changing the CONNECTION_PORT in utils.py file")

    # Generate needed Processes
    # Generate Producer
    commands.append('python Producer.py {} {}'.format(videoPath, producerPort))

    # Generate N Consumers1
    for i in range(utils.N):
        commands.append('python Consumer1.py {} {}'.format(
            producerPort, Collector_Receiving_Ports[math.floor(i / 2)]))

    # Generate N / 2 Collector
    for i in range(math.ceil(utils.N / 2)):
        commands.append('python Collector.py {} {}'.format(
            Collector_Receiving_Ports[i], Collector_Sending_Ports[i]))

    # Run in parallel
    processes = [Popen(cmd, shell=True) for cmd in commands]
    for p in processes:
        p.wait()

    senderSocket.close()
    senderContext.destroy()
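utils.configure_port is not shown on this page; from the way it is used here and in Example #13, it presumably creates a pyzmq socket of the given type and binds or connects it to the given ip:port endpoint. A hypothetical sketch under that assumption:

import zmq

def configure_port(ip_port, socket_type, mode):
    """Create a ZMQ socket of socket_type and attach it to tcp://<ip_port>."""
    context = zmq.Context()
    sock = context.socket(socket_type)
    endpoint = "tcp://" + ip_port
    if mode == "bind":
        sock.bind(endpoint)
    else:
        sock.connect(endpoint)
    return sock, context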
Example #4
File: srs.py Project: bryfry/srs
async def srs(loop, port=None):

    # create app handler and closing event
    closing_time = asyncio.Event()
    closing_task = asyncio.create_task(closing_time.wait())
    closers.append(closing_time)
    handler = ClosingHandler(closing_time)

    # add handler to new application runner
    app = web.Application()
    app.add_routes([web.get("/", handler.default)])
    runner = web.AppRunner(app)
    await runner.setup()

    # start app on a specific tcp port
    if port is None:
        port = find_free_port()
    site = web.TCPSite(runner, "0.0.0.0", port)
    logging.info(f"{site.name} starting")
    try:
        await site.start()
    except OSError:
        logging.warning(f"port {port} already in use, trying a different one")
        await srs(loop)
        return  # the retry instance now owns the lifecycle; don't fall through

    # wait for closing event
    await closing_task
    logging.info(f"{site.name} closing")
    await runner.cleanup()

    loop.create_task(srs(loop, handler.next_port))
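The coroutine restarts itself on a fresh port via loop.create_task once the closing event fires. A minimal, hypothetical way to bootstrap it (the project's actual entry point is not shown here):

import asyncio

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.create_task(srs(loop))
    loop.run_forever()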
Example #5
File: srs.py Project: bryfry/srs
    async def default(self, request):
        self.exit_event.set()
        self.next_port = find_free_port()
        data = {
            "now": request.url.port,
            "next": self.next_port,
            "flag-slice": random_flag_index(),
        }
        return web.json_response(data)
Example #6
def test_run_container(channel, img, gpu):
    with TempDir() as tmp:
        args = Arguments(
            channel,
            img,
            tmp,
            None,
            False,
            "",
            gpu,
            True,
            False,
            False,
            "us-docker.pkg.dev/android-emulator-268719/images",
            False,
            False,
        )
        emu_docker.accept_licenses(args)
        devices = emu_docker.create_docker_image(args)
        assert devices
        for device in devices:
            port = find_free_port()

            # Launch this thing.
            container = device.launch({"5555/tcp": port})
            # Now we are going to inspect this thing.
            api_client = device.get_api_client()
            status = api_client.inspect_container(container.id)
            state = status["State"]
            assert state["Status"] == "running"

            # Acceptable states:
            # starting --> We are still launching
            # healthy --> Yay, we booted! Good to go..
            health = state["Health"]["Status"]
            while health == "starting":
                health = api_client.inspect_container(
                    container.id)["State"]["Health"]["Status"]

            assert health == "healthy"

            # Good, good.. From an internal perspective things look great.
            # Can we connect with adb from outside the container?
            adb = find_adb()

            # Erase knowledge of existing devices.
            subprocess.check_output([adb, "kill-server"])
            name = "localhost:{}".format(port)
            subprocess.check_output([adb, "connect", name])

            # Boot complete should be true..
            res = subprocess.check_output(
                [adb, "-s", name, "shell", "getprop", "dev.bootcomplete"])
            assert "1" in str(res)

            api_client.stop(container.id)
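The while health == "starting" loop above polls the docker API as fast as it can and never times out. A bounded variant with a sleep between polls, shown as a sketch rather than the project's code:

import time

def wait_until_healthy(api_client, container_id, timeout=300, interval=2):
    """Poll a container's docker health status until it leaves 'starting'."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        state = api_client.inspect_container(container_id)["State"]
        health = state["Health"]["Status"]
        if health != "starting":
            return health  # "healthy" or "unhealthy"
        time.sleep(interval)
    raise TimeoutError("container never left the 'starting' state")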
Example #7
    def __init__(self, zone, id, tracker, build_indexes):
        self.zone = zone
        self.id = id
        self.tracker = tracker
        self.build_indexes = build_indexes
        self.torrent_client_port = find_free_port()
        self.registry_port = find_free_port()
        self.port = find_free_port()
        self.config_file = 'test-{zone}.yaml'.format(zone=zone)
        self.name = 'kraken-agent-{id}-{zone}'.format(id=id, zone=zone)

        populate_config_template(
            'agent',
            self.config_file,
            trackers=yaml_list([self.tracker.addr]),
            build_indexes=yaml_list([bi.addr for bi in self.build_indexes]))

        self.volumes = create_volumes('agent', self.name)

        self.start()
Example #8
    def __init__(self,
                 zone,
                 id,
                 tracker,
                 build_indexes,
                 with_docker_socket=False):
        self.zone = zone
        self.id = id
        self.tracker = tracker
        self.build_indexes = build_indexes
        self.torrent_client_port = find_free_port()
        self.registry_port = find_free_port()
        self.port = find_free_port()
        self.config_file = 'test-{zone}.yaml'.format(zone=zone)
        self.name = 'kraken-agent-{id}-{zone}'.format(id=id, zone=zone)
        self.with_docker_socket = with_docker_socket

        populate_config_template('agent',
                                 self.config_file,
                                 trackers=yaml_list([self.tracker.addr]),
                                 build_indexes=yaml_list(
                                     [bi.addr for bi in self.build_indexes]))

        if self.with_docker_socket:
            # In addition to mounting the docker socket, avoid using the
            # local cache volume; otherwise the process would run as root and
            # create local cache files that are hard to clean up outside of
            # the container.
            self.volumes = create_volumes('agent',
                                          self.name,
                                          local_cache=False)
            self.volumes['/var/run/docker.sock'] = {
                'bind': '/var/run/docker.sock',
                'mode': 'rw',
            }
        else:
            self.volumes = create_volumes('agent', self.name)

        self.start()
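The {'bind': ..., 'mode': 'rw'} mapping used for /var/run/docker.sock is docker-py's volumes format, so create_volumes presumably returns a dict of the same shape (an assumption, since it is not shown). For reference, docker-py consumes such a dict like this:

import docker

client = docker.from_env()
# Host path -> {bind: container path, mode: rw|ro}. Mounting the docker
# socket lets the containerized process talk to the host's docker daemon.
volumes = {"/var/run/docker.sock": {"bind": "/var/run/docker.sock", "mode": "rw"}}
container = client.containers.run("alpine", "ls /var/run",
                                  volumes=volumes, detach=True)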
Example #9
    def __init__(self, zone, origin_cluster):
        self.zone = zone
        self.origin_cluster = origin_cluster
        self.port = find_free_port()
        self.config_file = 'test-{zone}.yaml'.format(zone=zone)
        self.name = 'kraken-tracker-{zone}'.format(zone=zone)

        populate_config_template(
            'tracker',
            self.config_file,
            origins=yaml_list([o.addr for o in self.origin_cluster.origins]))

        self.volumes = create_volumes('tracker', self.name)

        self.start()
Example #10
    def __init__(self, zone, origin_cluster, build_indexes):
        self.zone = zone
        self.origin_cluster = origin_cluster
        self.build_indexes = build_indexes
        self.port = find_free_port()
        self.config_file = 'test-{zone}.yaml'.format(zone=zone)
        self.name = 'kraken-proxy-{zone}'.format(zone=zone)

        populate_config_template(
            'proxy',
            self.config_file,
            build_indexes=yaml_list([bi.addr for bi in self.build_indexes]),
            origins=yaml_list([o.addr for o in self.origin_cluster.origins]))

        self.volumes = create_volumes('proxy', self.name)

        self.start()
Example #11
def test_run_container(channel, img):
    assert not "linux" in sys.platform
    assert docker.from_env().ping()
    with TempDir() as tmp:
        args = Arguments(channel, img, tmp, None, False, "")
        device = emu_docker.create_docker_image(args)
        port = find_free_port()

        # Launch this thing.
        device.launch(device.identity, port)
        # Now we are going to inspect this thing.
        api_client = device.get_api_client()
        status = api_client.inspect_container(device.container.id)
        state = status["State"]
        assert state["Status"] == "running"

        # Acceptable states:
        # starting --> We are still launching
        # healthy --> Yay, we booted! Good to go..
        health = state["Health"]["Status"]
        while health == "starting":
            health = api_client.inspect_container(
                device.container.id)["State"]["Health"]["Status"]

        assert health == "healthy"

        # Good, good.. From an internal perspective things look great.
        # Can we connect with adb from outside the container?
        adb = find_adb()

        # Erase knowledge of existing devices.
        subprocess.check_output([adb, "kill-server"])
        name = "localhost:{}".format(port)
        subprocess.check_output([adb, "connect", name])

        # Boot complete should be true..
        res = subprocess.check_output(
            [adb, "-s", name, "shell", "getprop", "dev.bootcomplete"])
        assert "1" in str(res)

        api_client.stop(device.container.id)
Example #12
    def __init__(self,
                 take_ownership=True, # Tor dies when the Sorter does
                 torrc_config={"ControlPort": "9051",
                               "CookieAuth": "1"},
                 socks_port=9050,
                 page_load_timeout=20,
                 max_tasks=10,
                 db_handler=None):

        self.logger = setup_logging(_log_dir, "sorter")
        self.db_handler = db_handler

        self.logger.info("Opening event loop for Sorter...")
        self.loop = asyncio.get_event_loop()
        self.max_tasks = max_tasks
        self.logger.info("Creating Sorter queue...")
        self.q = asyncio.Queue()

        # Start tor and create an aiohttp tor connector
        self.torrc_config = torrc_config
        self.socks_port = str(find_free_port(socks_port))
        self.torrc_config.update({"SocksPort": self.socks_port})
        self.logger.info("Starting tor process with config "
                         "{self.torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(config=self.torrc_config,
                                                  take_ownership=take_ownership)
        # Point the proxy at the SocksPort tor was actually configured with.
        onion_proxy = aiosocks.Socks5Addr('127.0.0.1', int(self.socks_port))
        conn = SocksConnector(proxy=onion_proxy, remote_resolve=True)

        # aiohttp's ClientSession does connection pooling and HTTP keep-alives
        # for us
        self.logger.info("Creating aiohttp ClientSession with our event loop "
                         "and tor proxy connector...")
        self.session = aiohttp.ClientSession(loop=self.loop, connector=conn)

        # Pretend we're Tor Browser in order to get rejected by less sites/WAFs
        u = "Mozilla/5.0 (Windows NT 6.1; rv:45.0) Gecko/20100101 Firefox/45.0"
        self.headers = {'user-agent': u}

        self.page_load_timeout = page_load_timeout
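A hedged usage sketch for the session built above: any request made through it is routed over the tor SOCKS connector and carries the spoofed headers. The coroutine name and URL handling are illustrative, not part of the original Sorter:

import aiohttp

async def fetch(session, url, headers):
    # Requests through this session go via the tor SOCKS proxy configured
    # by the SocksConnector above.
    async with session.get(url, headers=headers) as response:
        return await response.text()

Inside the Sorter this would be called roughly as await fetch(self.session, url, self.headers), presumably from one of the max_tasks workers draining self.q.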
Example #13
def main():
    outputPath = str(sys.argv[1])
    print("Your ip is: {}".format(utils.get_ip()))

    commands = []
    Collector_Sending_Ports = []

    # Generate needed random free ports
    finalCollectorPort = str(utils.get_ip()) + ":" + \
        str(utils.find_free_port())

    # Receive Collector Ports from first computer
    # try:
    ipPortConnection = str(utils.SENDER) + ":" + utils.CONNECTION_PORT
    receiverSocket, receiverContext = utils.configure_port(
        ipPortConnection, zmq.PULL, "connect")
    Collector_Sending_Ports = pickle.loads(receiverSocket.recv())
    print("Ports have been received from the sender's collector")
    # except Exception:
    #     print("Machine 2 (Receiver) ERROR IN RECEIVING CONNECTION DATA, " +
    #           "Try changing the CONNECTION_PORT in utils.py file")

    # Generate needed Processes
    # Generate N Consumers2
    for i in range(utils.N):
        commands.append('python Consumer2.py {} {}'.format(
            Collector_Sending_Ports[int(math.floor(i/2))], finalCollectorPort))

    # Generate Final Collector
    commands.append('python Final_Collector.py {} {}'.format(
        outputPath, finalCollectorPort))

    # Run in parallel
    processes = [Popen(cmd, shell=True) for cmd in commands]
    for p in processes:
        p.wait()

    receiverSocket.close()
    receiverContext.destroy()
Example #14
def main():
    omniscript_path = os.path.dirname(__file__)
    omnisci_server = None
    args = None
    port_default_value = -1

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    required = parser._action_groups.pop()
    optional = parser.add_argument_group("optional arguments")
    omnisci = parser.add_argument_group("omnisci")
    benchmark = parser.add_argument_group("benchmark")
    mysql = parser.add_argument_group("mysql")
    commits = parser.add_argument_group("commits")

    possible_tasks = ["build", "test", "benchmark"]
    benchmarks = ["ny_taxi", "santander", "census", "plasticc"]
    # Task
    required.add_argument(
        "-task",
        dest="task",
        required=True,
        help=
        f"Task for execute {possible_tasks}. Use , separator for multiple tasks",
    )

    # Environment
    required.add_argument("-en",
                          "--env_name",
                          dest="env_name",
                          help="Conda env name.")
    optional.add_argument(
        "-ec",
        "--env_check",
        dest="env_check",
        default=False,
        type=str_arg_to_bool,
        help="Check if env exists. If it exists don't recreate.",
    )
    optional.add_argument(
        "-s",
        "--save_env",
        dest="save_env",
        default=False,
        type=str_arg_to_bool,
        help="Save conda env after executing.",
    )
    optional.add_argument(
        "-r",
        "--report_path",
        dest="report_path",
        default=os.path.join(omniscript_path, ".."),
        help="Path to report file.",
    )
    optional.add_argument(
        "-ci",
        "--ci_requirements",
        dest="ci_requirements",
        default=os.path.join(omniscript_path, "ci_requirements.yml"),
        help="File with ci requirements for conda env.",
    )
    optional.add_argument(
        "-py",
        "--python_version",
        dest="python_version",
        default="3.7",
        help="File with ci requirements for conda env.",
    )
    # Ibis
    required.add_argument(
        "-i",
        "--ibis_path",
        dest="ibis_path",
        required=True,
        help="Path to ibis directory.",
    )
    # Ibis tests
    optional.add_argument(
        "-expression",
        dest="expression",
        default=" ",
        help=
        "Run tests which match the given substring test names and their parent "
        "classes. Example: 'test_other', while 'not test_method' matches those "
        "that don't contain 'test_method' in their names.",
    )
    # Omnisci server parameters
    omnisci.add_argument(
        "-executable",
        dest="executable",
        required=True,
        help="Path to omnisci_server executable.",
    )
    omnisci.add_argument(
        "--omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    omnisci.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    omnisci.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    omnisci.add_argument(
        "-database_name",
        dest="database_name",
        default="agent_test_ibis",
        help="Database name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-ipc_conn",
        dest="ipc_connection",
        default=True,
        type=str_arg_to_bool,
        help="Table name name to use in omniscidb server.",
    )
    # Benchmark parameters
    benchmark.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=benchmarks,
        help="Benchmark name.",
    )
    benchmark.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
    )
    benchmark.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=1,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    benchmark.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help=
        "Number of iterations to run every query. Best result is selected.",
    )
    benchmark.add_argument("-dnd",
                           default=False,
                           type=str_arg_to_bool,
                           help="Do not delete old table.")
    benchmark.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    benchmark.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help=
        "validate queries results (by comparison with Pandas queries results).",
    )
    benchmark.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    benchmark.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    benchmark.add_argument(
        "-pandas_mode",
        choices=["Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python"],
        default="Pandas",
        help="Specifies which version of Pandas to use: "
        "plain Pandas, Modin runing on Ray or on Dask",
    )
    benchmark.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help="Location where to keep Ray plasma store. "
        "It should have enough space to keep -ray_memory",
    )
    benchmark.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        help="Size of memory to allocate for Ray plasma store",
    )
    benchmark.add_argument(
        "-no_ml",
        default=False,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="specify the memory of your gpu, default 16. "
        "(This controls the lines to be used. Also work for CPU version. )",
        default=16,
    )
    # MySQL database parameters
    mysql.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    mysql.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    mysql.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    mysql.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    mysql.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Additional information
    commits.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for tests.",
    )
    commits.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for tests.",
    )

    try:
        args = parser.parse_args()

        os.environ["IBIS_TEST_OMNISCIDB_DATABASE"] = args.database_name
        os.environ["IBIS_TEST_DATA_DB"] = args.database_name
        os.environ["IBIS_TEST_OMNISCIDB_PORT"] = str(args.port)
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"

        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        required_tasks = args.task.split(",")
        tasks = {}
        for task in possible_tasks:
            tasks[task] = task in required_tasks

        if not any(tasks.values()):
            print(
                f"Only {list(tasks.keys())} tasks are supported; none of the requested tasks {required_tasks} matched"
            )
            sys.exit(1)

        if args.python_version not in ["3.7", "3.6"]:
            print(
                f"Only 3.7 and 3.6 python versions are supported, {args.python_version} is not supported"
            )
            sys.exit(1)

        ibis_requirements = os.path.join(
            args.ibis_path, "ci",
            f"requirements-{args.python_version}-dev.yml")
        requirements_file = "requirements.yml"

        conda_env = CondaEnvironment(args.env_name)

        print("PREPARING ENVIRONMENT")
        combinate_requirements(ibis_requirements, args.ci_requirements,
                               requirements_file)
        conda_env.create(args.env_check, requirements_file=requirements_file)

        if tasks["build"]:
            install_ibis_cmdline = [
                "python3", os.path.join("setup.py"), "install"
            ]

            print("IBIS INSTALLATION")
            conda_env.run(install_ibis_cmdline,
                          cwd=args.ibis_path,
                          print_output=False)

        if tasks["test"]:
            ibis_data_script = os.path.join(args.ibis_path, "ci", "datamgr.py")
            dataset_download_cmdline = [
                "python3", ibis_data_script, "download"
            ]
            dataset_import_cmdline = [
                "python3",
                ibis_data_script,
                "omniscidb",
                "-P",
                str(args.port),
                "--database",
                args.database_name,
            ]
            report_file_name = (
                f"report-{args.commit_ibis[:8]}-{args.commit_omnisci[:8]}.html"
            )
            if not os.path.isdir(args.report_path):
                os.makedirs(args.report_path)
            report_file_path = os.path.join(args.report_path, report_file_name)

            ibis_tests_cmdline = [
                "pytest",
                "-m",
                "omniscidb",
                "--disable-pytest-warnings",
                "-k",
                args.expression,
                f"--html={report_file_path}",
            ]

            print("STARTING OMNISCI SERVER")
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            print("PREPARING DATA")
            conda_env.run(dataset_download_cmdline)
            conda_env.run(dataset_import_cmdline)

            print("RUNNING TESTS")
            conda_env.run(ibis_tests_cmdline, cwd=args.ibis_path)

        if tasks["benchmark"]:
            # if not args.bench_name or args.bench_name not in benchmarks:
            #     print(
            #     f"Benchmark {args.bench_name} is not supported, only {benchmarks} are supported")
            # sys.exit(1)

            if not args.data_file:
                print(
                    "Parameter -data_file was received empty, but it is required for benchmarks"
                )
                sys.exit(1)

            benchmark_script_path = os.path.join(omniscript_path,
                                                 "run_ibis_benchmark.py")

            benchmark_cmd = ["python3", benchmark_script_path]

            possible_benchmark_args = [
                "bench_name",
                "data_file",
                "dfiles_num",
                "iterations",
                "dnd",
                "dni",
                "validation",
                "optimizer",
                "no_ibis",
                "pandas_mode",
                "ray_tmpdir",
                "ray_memory",
                "no_ml",
                "gpu_memory",
                "db_server",
                "db_port",
                "db_user",
                "db_pass",
                "db_name",
                "db_table_etl",
                "db_table_ml",
                "executable",
                "omnisci_cwd",
                "port",
                "http_port",
                "calcite_port",
                "user",
                "password",
                "ipc_connection",
                "database_name",
                "table",
                "commit_omnisci",
                "commit_ibis",
            ]
            args_dict = vars(args)
            args_dict["data_file"] = f"'{args_dict['data_file']}'"
            for arg_name in list(parser._option_string_actions.keys()):
                try:
                    pure_arg = re.sub(r"^--*", "", arg_name)
                    if pure_arg in possible_benchmark_args:
                        arg_value = args_dict[pure_arg]
                        if arg_value:
                            benchmark_cmd.extend([arg_name, str(arg_value)])
                except KeyError:
                    pass

            print(benchmark_cmd)

            conda_env.run(benchmark_cmd)

    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)

    finally:
        if omnisci_server:
            omnisci_server.terminate()
        if args and args.save_env is False:
            conda_env.remove()
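str_arg_to_bool, used throughout as an argparse type callback, is not defined on this page. A common implementation looks like the following; this is an assumption, not necessarily this project's version:

import argparse

def str_arg_to_bool(value):
    """argparse 'type' callback mapping yes/no-style strings to bool."""
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("Cannot recognize boolean value: %s" % value)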
Example #15
    def __init__(
            self,
            take_ownership=True,  # Tor dies when the Crawler does
            torrc_config={"CookieAuth": "1"},
            tor_log="/var/log/tor/tor.log",
            tor_cell_log="/var/log/tor/tor_cell_seq.log",
            control_port=9051,
            socks_port=9050,
            run_in_xvfb=True,
            tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
            tb_log_path=join(_log_dir, "firefox.log"),
            tb_tor_cfg=USE_RUNNING_TOR,
            page_load_timeout=20,
            wait_on_page=5,
            wait_after_closing_circuits=0,
            restart_on_sketchy_exception=True,
            additional_control_fields={},
            db_handler=None):

        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)

        self.torrc_config = torrc_config
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(
            config=self.torrc_config, take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()

        self.logger.info("Opening cell log stream...")
        self.cell_log = open(tor_cell_log, "rb")

        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()

        self.logger.info("Starting Tor Browser...")
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)

        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception

        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)
Example #16
def main():
    omniscript_path = os.path.dirname(__file__)
    args = None
    omnisci_server = None
    port_default_value = -1

    benchmarks = ["ny_taxi", "santander", "census", "plasticc"]

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=benchmarks,
        help="Benchmark name.",
        required=True,
    )
    required.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
        required=True,
    )
    optional.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=1,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    optional.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help=
        "Number of iterations to run every query. Best result is selected.",
    )
    optional.add_argument("-dnd",
                          default=False,
                          type=str_arg_to_bool,
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help=
        "validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    optional.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    optional.add_argument(
        "-pandas_mode",
        choices=["Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python"],
        default="Pandas",
        help=
        "Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask",
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help=
        "Location where to keep Ray plasma store. It should have enough space to keep -ray_memory",
    )
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        help="Size of memory to allocate for Ray plasma store",
    )
    optional.add_argument(
        "-no_ml",
        default=False,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help=
        "specify the memory of your gpu, default 16. (This controls the lines to be used. Also work for CPU version. )",
        default=16,
    )
    # MySQL database parameters
    optional.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Omnisci server parameters
    optional.add_argument(
        "-executable",
        dest="executable",
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-database_name",
        dest="database_name",
        default="omnisci",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    optional.add_argument(
        "-ipc_conn",
        dest="ipc_connection",
        default=True,
        type=str_arg_to_bool,
        help="Table name name to use in omniscidb server.",
    )
    # Additional information
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for benchmark.",
    )

    try:
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"
        omnisci_server_worker = None

        args = parser.parse_args()

        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        if args.bench_name == "ny_taxi":
            from taxi import run_benchmark
        elif args.bench_name == "santander":
            from santander import run_benchmark
        elif args.bench_name == "census":
            from census import run_benchmark
        elif args.bench_name == "plasticc":
            from plasticc import run_benchmark

        parameters = {
            "data_file": args.data_file,
            "dfiles_num": args.dfiles_num,
            "no_ml": args.no_ml,
            "no_ibis": args.no_ibis,
            "optimizer": args.optimizer,
            "pandas_mode": args.pandas_mode,
            "ray_tmpdir": args.ray_tmpdir,
            "ray_memory": args.ray_memory,
            "gpu_memory": args.gpu_memory,
        }

        if not args.no_ibis:
            if args.executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable for Ibis part"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                user=args.user,
                password=args.password,
            )

            parameters["database_name"] = args.database_name
            parameters["table"] = args.table
            parameters["dnd"] = args.dnd
            parameters["dni"] = args.dni
            parameters["validation"] = args.validation

        etl_results = []
        ml_results = []
        print(parameters)
        run_id = int(round(time.time()))
        for iter_num in range(1, args.iterations + 1):
            print(f"Iteration #{iter_num}")

            if not args.no_ibis:
                omnisci_server_worker = OmnisciServerWorker(omnisci_server)
                parameters["omnisci_server_worker"] = omnisci_server_worker
                parameters["ipc_connection"] = args.ipc_connection
                omnisci_server.launch()

            result = run_benchmark(parameters)

            if not args.no_ibis:
                omnisci_server.terminate()

            for backend_res in result["ETL"]:
                if backend_res:
                    backend_res["Iteration"] = iter_num
                    backend_res["run_id"] = run_id
                    etl_results.append(backend_res)
            for backend_res in result["ML"]:
                if backend_res:
                    backend_res["Iteration"] = iter_num
                    backend_res["run_id"] = run_id
                    ml_results.append(backend_res)

            # Reporting to MySQL database
            if args.db_user is not None:
                if iter_num == 1:
                    db = mysql.connector.connect(
                        host=args.db_server,
                        port=args.db_port,
                        user=args.db_user,
                        passwd=args.db_pass,
                        db=args.db_name,
                    )

                    reporting_init_fields = {
                        "OmnisciCommitHash": args.commit_omnisci,
                        "IbisCommitHash": args.commit_ibis
                    }

                    reporting_fields_benchmark_etl = {
                        x: "VARCHAR(500) NOT NULL"
                        for x in etl_results[0]
                    }
                    if len(etl_results) != 1:
                        reporting_fields_benchmark_etl.update({
                            x: "VARCHAR(500) NOT NULL"
                            for x in etl_results[1]
                        })

                    db_reporter_etl = DbReport(db, args.db_table_etl,
                                               reporting_fields_benchmark_etl,
                                               reporting_init_fields)

                    if len(ml_results) != 0:
                        reporting_fields_benchmark_ml = {
                            x: "VARCHAR(500) NOT NULL"
                            for x in ml_results[0]
                        }
                        if len(ml_results) != 1:
                            reporting_fields_benchmark_ml.update({
                                x: "VARCHAR(500) NOT NULL"
                                for x in ml_results[1]
                            })

                        db_reporter_ml = DbReport(
                            db, args.db_table_ml,
                            reporting_fields_benchmark_ml,
                            reporting_init_fields)

                for result_etl in etl_results:
                    db_reporter_etl.submit(result_etl)

                if len(ml_results) != 0:
                    for result_ml in ml_results:
                        db_reporter_ml.submit(result_ml)

    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
Example #17
    def __init__(self, name):
        self.name = name
        self.hostname = get_docker_bridge()
        self.port_rez = PortReservation()
        self.peer_port = find_free_port()
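The PortReservation used here hints at the standard mitigation for the race noted back in Example #1: keep the probe socket open, holding the port, until the consumer is ready to bind it. A sketch of what such a class might look like (kraken's actual implementation is not shown on this page):

import socket

class PortReservation(object):
    """Hold an OS-assigned port until release() hands it off."""

    def __init__(self):
        self._sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self._sock.bind(("", 0))
        self.port = self._sock.getsockname()[1]

    def release(self):
        # Close just before the real server binds, shrinking (though not
        # fully eliminating) the race window.
        self._sock.close()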
Example #18
    def __init__(self, zone):
        self.zone = zone
        self.port = find_free_port()
        self.start()