Example #1
def cli(dbname, host, port, timeout, interactive, version):
    """Cli entry point."""
    if version or dbname == 'version':
        print('datacli version: {}'.format(__version__))
        sys.exit(0)

    configure_logger()

    conn = PyDrill(host=host, port=port)
    if not conn.is_active():
        log.error('unable to reach Drill server')
        return 1

    cli = DataCli(conn, dbname, DataPrompt(), timeout=timeout)

    log.info('connected to Drillbit')
    while True:
        try:
            should_exit = cli.repl(interactive)
            if should_exit:
                break
        except KeyboardInterrupt:
            break  # Control-C pressed
        except EOFError:
            break  # Control-D pressed

    log.info('shutting down...')
    return 0
Example #2
    def get_pydrill_conn(self, _connection_conf):
        if not PYDRILL_AVAILABLE:
            raise ImproperlyConfigured('pydrill must be installed.')
        conn = PyDrill(**_connection_conf)

        class cursor:
            def __init__(self):
                self.data = []
                self.description = ''
                self.cursor_index = -1

            def execute(self, sql):
                self.data = conn.query(sql)
                return self.data

            def fetchone(self, *args, **kwargs):
                self.cursor_index += 1
                try:
                    self.description = list(self.data.rows[self.cursor_index].items())
                    return self.description
                except IndexError:
                    return

            def close(self):
                return

        conn.cursor = cursor
        conn.commit = lambda: None
        return conn
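A minimal usage sketch for the DB-API-style shim above, assuming the method is available as a plain function get_pydrill_conn(conf) (self dropped) and that a Drillbit is listening on the assumed host and port. The sample query is illustrative.

conn = get_pydrill_conn({'host': 'localhost', 'port': 8047})  # assumed values

cur = conn.cursor()  # instantiates the shim class defined above
cur.execute('SELECT * FROM cp.`employee.json` LIMIT 3')
row = cur.fetchone()
while row is not None:  # fetchone() returns None once the rows are exhausted
    print(row)
    row = cur.fetchone()
cur.close()
conn.commit()  # a no-op in the shim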
Example #3
class TestPydrill(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    def test_transport_host(self):
        assert self.drill.transport.host == 'localhost'

    def test_transport_port(self):
        assert self.drill.transport.port == 8047

    @responses.activate
    def test_is_active(self):
        responses.add(**{
            'method': responses.HEAD,
            'url': 'http://localhost:8047/',
            'status': 200,
            'content_type': 'application/json',
        })
        assert self.drill.is_active() == True

    @responses.activate
    def test_is_not_active_404(self):
        responses.add(**{
            'method': responses.HEAD,
            'url': 'http://localhost:8047/',
            'content_type': 'application/json',
            'status': 404,
        })
        assert self.drill.is_active() == False

    @responses.activate
    def test_is_not_active_500(self):
        responses.add(**{
            'method': responses.HEAD,
            'url': 'http://localhost:8047/',
            'content_type': 'application/json',
            'status': 500,
        })
        assert self.drill.is_active() == False

    @responses.activate
    def test_is_not_active_timeout(self):
        responses.add(**{
            'method': responses.HEAD,
            'url': 'http://localhost:8047/',
            'content_type': 'application/json',
            'status': 500,
        })
        try:
            self.drill.perform_request('HEAD', '/', params={'request_timeout': 0})
        except TransportError as e:
            assert e.status_code == e.args[0]
            assert e.error == e.args[1]
            assert e.info == e.args[2]
            assert str(e)
        else:
            assert False
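All of the mocked tests above share one pattern: register a canned HTTP response for Drill's REST endpoint, then call the client so no request ever leaves the process. A condensed, self-contained sketch of that pattern, using the same URL and status values as the tests above:

import responses
from pydrill.client import PyDrill

@responses.activate
def check_is_active_without_a_server():
    responses.add(method=responses.HEAD,
                  url='http://localhost:8047/',
                  status=200,
                  content_type='application/json')
    drill = PyDrill(host='localhost', port=8047)
    assert drill.is_active()  # answered by the mock, no Drillbit needed

check_is_active_without_a_server()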
Example #4
    def run_query(self, query, user):
        drillbit_host, drillbit_port = self.get_drillbit(
            self.configuration.get('host', None),
            self.configuration.get('port', None),
            self.configuration.get('zookeeper_path', None))

        user_auth = self.configuration.get('user_auth', None)
        if user_auth:
            session = requests.Session()  # Create a session object
            username, password = user_auth.split(':', 1)  # allow ':' in the password
            if not self.auth_drill(session, drillbit_host, drillbit_port,
                                   username, password):
                json_data = None
                error = 'Invalid credentials for Drill'
                return json_data, error
            connection = PyDrill(host=drillbit_host, port=drillbit_port,
                                 connection_class=DrillRequestsHttpConnection,
                                 drill_session=session)
        else:
            connection = PyDrill(host=drillbit_host, port=drillbit_port)

        if not connection.is_active():
            json_data = None
            error = 'Please run Drill first'
            return json_data, error

        annotation = self.get_annotation(query)

        try:
            result = None
            for q in self.strip_comments(query).split(';'):
                q = q.strip()
                if not q:
                    continue
                q = self.magic_helpers(q)
                q = annotation + q
                result = connection.query(q, timeout=600)
                print(result.rows)
                logger.info(result.rows)
            columns = []
            for col in result.columns:
                columns.append({'name': col,
                                'friendly_name': col,
                                'type': TYPE_STRING})
            rows = result.rows
            data = {'columns': columns, 'rows': rows}
            json_data = json.dumps(data, cls=JSONEncoder)
            error = None
        except TransportError as te:
            json_data = None
            error = drillbit_host + '\n' + te.error
        except Exception as ex:
            json_data = None
            error = drillbit_host + '\n' + str(ex)

        return json_data, error
Example #5
class TestPydrill(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    @responses.activate
    def test_plan_for_select_employee_mocked(self):
        sql = "SELECT * FROM cp.`employee.json` ORDER BY salary DESC LIMIT 1"

        responses.add(**{
            'method': responses.POST,
            'url': 'http://localhost:8047/query.json',
            'body': '{"queryType": "SQL","query": "explain plan for %(sql)s"}' % ({'sql': sql}),
            'status': 200,
            'content_type': 'application/json',
            'json': {},
        })

        result = self.drill.plan(sql=sql)
        assert result.response.status_code == 200
Example #6
    def connection(self):
        from pydrill.client import PyDrill

        if self._connection is None:
            self._connection = PyDrill(**self.params)

        return self._connection
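A self-contained sketch of the lazy, memoized connection pattern this property implements; the DrillSource class name and the sample params are assumptions for illustration, while the property body matches the example.

from pydrill.client import PyDrill

class DrillSource:
    def __init__(self, **params):
        self.params = params      # e.g. host='localhost', port=8047
        self._connection = None   # nothing is opened until first access

    @property
    def connection(self):
        # Build the client once, then hand back the cached instance thereafter.
        if self._connection is None:
            self._connection = PyDrill(**self.params)
        return self._connection

source = DrillSource(host='localhost', port=8047)
client = source.connection           # created on first access
assert client is source.connection   # cached on later accesses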
Example #7
def test_authentication_success(pydrill_url):

    responses.add(
        **{
            'method': responses.POST,
            'url': "{0}/{1}".format(pydrill_url, 'j_security_check'),
        })

    PyDrill(auth='user:password')
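The test above assumes a pydrill_url pytest fixture and an already-activated responses mock. A minimal sketch of what that fixture might look like; the URL value is an assumption based on the default Drill endpoint used throughout these examples.

import pytest

@pytest.fixture
def pydrill_url():
    # Drill's REST endpoint; 8047 is the default port seen in the other examples.
    return 'http://localhost:8047'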
Example #8
    def init_drill_connection(self):
        self.__drill = PyDrill(host='localhost')
        is_drill_active = self.__drill.is_active()

        if is_drill_active:
            print("Drill is active: %s" % is_drill_active)
        else:
            print('''
            Drill is not active. Start your server in a terminal using command:
            cd /Users/Aymeric/apache-drill-1.5.0
            bin/drill-embedded
            ''')
Example #9
def main():
    print("**init end2end**")

    Execution.getArgs()

    dir_data_file = Settings.data["TestSettings"]["dataDirectory"]

    # Create Table Drill -----------------------------------------
    drill = PyDrill(host="localhost", port=8047)
    createSchema.init_drill_schema(drill, dir_data_file)

    # All tests pass only with 100 MB
    csvFromLocalTest.main(drill, dir_data_file)

    csvFromS3Test.main(
        drill, dir_data_file
    )  # AttributeError: 'NoneType' object has no attribute '_cols'

    # vector::_M_range_check: __n
    # (which is 18446744073709551615) >= this->size() (which is 2)
    csvFromHdfsTest.main(
        drill, dir_data_file
    )

    parquetFromLocalTest.main(
        drill, dir_data_file
    )  # All tests pass only with 100 MB

    # All tests pass with 100 MB; with multiple files per table
    # they do not, because not all the files load correctly.
    parquetFromS3Test.main(
        drill, dir_data_file
    )

    parquetFromHdfsTest.main(
        drill, dir_data_file
    )  # Hangs while reading the data

    runTest.save_log()

    for i in range(0, len(Settings.memory_list)):
        print(
            Settings.memory_list[i].name
            + ":"
            + "   Start Mem: "
            + str(Settings.memory_list[i].start_mem)
            + "   End Mem: "
            + str(Settings.memory_list[i].end_mem)
            + "   Diff: "
            + str(Settings.memory_list[i].delta)
        )
Example #10
def init_drill():
    # Start Drill schema-----------------------------------------
    from pydrill.client import PyDrill

    drill = PyDrill(host="localhost", port=8047)
    createSchema.init_drill_schema(
        drill, Settings.data["TestSettings"]["dataDirectory"], bool_test=True)
    createSchema.init_drill_schema(
        drill,
        Settings.data["TestSettings"]["dataDirectory"],
        smiles_test=True,
        fileSchemaType=DataType.PARQUET)

    return drill
Example #11
class TestPydrill(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    @responses.activate
    def test_select_employee_mocked(self):
        sql = "SELECT * FROM cp.`employee.json` ORDER BY salary DESC LIMIT 1"
        expected_result = {
            "columns": ["employee_id", "full_name", "first_name", "last_name", "position_id", "position_title",
                        "store_id", "department_id", "birth_date", "hire_date", "salary", "supervisor_id",
                        "education_level", "marital_status", "gender", "management_role"],
            "rows": [{
                "hire_date": "1994-12-01 00:00:00.0",
                "birth_date": "1961-08-26",
                "department_id": "1",
                "store_id": "0",
                "education_level": "Graduate Degree",
                "first_name": "Sheri",
                "position_id": "1",
                "management_role": "Senior Management",
                "last_name": "Nowmer",
                "gender": "F",
                "position_title": "President",
                "marital_status": "S",
                "salary": "80000.0",
                "employee_id": "1",
                "supervisor_id": "0",
                "full_name": "Sheri Nowmer"
            }]
        }

        responses.add(**{
            'method': responses.POST,
            'url': 'http://localhost:8047/query.json',
            'body': '{"queryType": "SQL","query": "%(sql)s"}' % ({'sql': sql}),
            'status': 200,
            'content_type': 'application/json',
            'json': expected_result,
        })

        result = self.drill.query(sql=sql)

        assert result.response.status_code == 200
        assert result.data == expected_result
Example #12
class TestPydrill(unittest.TestCase):
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)

    @responses.activate
    def test_plan_for_select_employee_mocked(self):
        sql = "SELECT * FROM cp.`employee.json` ORDER BY salary DESC LIMIT 1"

        responses.add(**{
            'method': responses.POST,
            'url': 'http://localhost:8047/query.json',
            'body': '{"queryType": "SQL","query": "explain plan for %(sql)s"}' % ({'sql': sql}),
            'status': 200,
            'content_type': 'application/json',
            'json': {},
        })

        result = self.drill.plan(sql=sql)
        assert result.response.status_code == 200
Example #13
def main():
    print('**init performance test**')
    Execution.getArgs()

    dir_data_file = Settings.data['TestSettings']['dataDirectory']

    # Create Table Drill -----------------------------------------
    drill = PyDrill(host='localhost', port=8047)
    createSchema.init_drill_schema(drill, dir_data_file)

    jobId = 1

    if Settings.data['MysqlConnection']['connectEnabled']:
        from DataBase import mysqlDatabaseManager as msqldb
        jobId = msqldb.getJobId()

    for x in range(0, 10):
        performanceTest.main(drill, dir_data_file)
        runTest.save_log(job_id=jobId)
Example #14
def main(dask_client, bc):

    # Create Table Drill ------------------------------------------------

    from pydrill.client import PyDrill

    drill = PyDrill(host="localhost", port=8047)

    dir_data_lc = Settings.data["TestSettings"]["dataDirectory"]

    for x in range(5):

        # [numberOfFiles, type_nation, type_region, type_supplier,
        #  type_customer, type_lineitem, type_orders]

        run = []

        if x == 0:
            run = [1, "psv", "psv", "psv", "psv", "psv", "psv"]
        elif x == 1:
            run = [
                2, "parquet", "parquet", "parquet", "parquet", "parquet",
                "parquet"
            ]
        elif x == 2:
            run = [6, "parquet", "psv", "parquet", "psv", "parquet", "psv"]
        elif x == 3:
            run = [10, "psv", "parquet", "psv", "parquet", "psv", "parquet"]
        elif x == 4:
            run = [12, "psv", "psv", "parquet", "parquet", "psv", "parquet"]

        print("============================================================")
        print("Running " + str(x + 1) + ":")
        print("Número de Archivos: " + str(run[0]))
        print("Type of files for Nation: " + run[1])
        print("Type of files for Region: " + run[2])
        print("Type of files for Supplier: " + run[3])
        print("Type of files for Customer: " + run[4])
        print("Type of files for Lineitem: " + run[5])
        print("Type of files for Orders: " + run[6])
        print("============================================================")
        print("1")
        num_files = run[0]
        print("2")
        cs.init_drill_schema(drill, dir_data_lc, n_files=num_files)
        print("3")
        # Read Data TPCH-----------------------------------------------------
        nation_files = cs.get_filenames_table("nation", dir_data_lc, num_files,
                                              run[1])
        bc.create_table(
            "nation",
            nation_files,
            delimiter="|",
            dtype=cs.get_dtypes("nation"),
            names=cs.get_column_names("nation"),
        )

        region_files = cs.get_filenames_table("region", dir_data_lc, num_files,
                                              run[2])
        bc.create_table(
            "region",
            region_files,
            delimiter="|",
            dtype=cs.get_dtypes("region"),
            names=cs.get_column_names("region"),
        )

        supplier_files = cs.get_filenames_table("supplier", dir_data_lc,
                                                num_files, run[3])
        bc.create_table(
            "supplier",
            supplier_files,
            delimiter="|",
            dtype=cs.get_dtypes("supplier"),
            names=cs.get_column_names("supplier"),
        )

        customer_files = cs.get_filenames_table("customer", dir_data_lc,
                                                num_files, run[4])
        bc.create_table(
            "customer",
            customer_files,
            delimiter="|",
            dtype=cs.get_dtypes("customer"),
            names=cs.get_column_names("customer"),
        )

        lineitem_files = cs.get_filenames_table("lineitem", dir_data_lc,
                                                num_files, run[5])
        bc.create_table(
            "lineitem",
            lineitem_files,
            delimiter="|",
            dtype=cs.get_dtypes("lineitem"),
            names=cs.get_column_names("lineitem"),
        )

        orders_files = cs.get_filenames_table("orders", dir_data_lc, num_files,
                                              run[6])
        bc.create_table(
            "orders",
            orders_files,
            delimiter="|",
            dtype=cs.get_dtypes("orders"),
            names=cs.get_column_names("orders"),
        )

        # Run Query ------------------------------------------------------
        # Parameter indicating whether the result sets must be
        # ordered before comparing them
        worder = 1
        use_percentage = False
        acceptable_difference = 0.01
        queryType = "Load Data Test"

        print("==============================")
        print(queryType)
        print("==============================")

        queryId = "TEST_01"
        query = """select count(c_custkey) as c1, count(c_acctbal) as c2
                from customer"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_02"
        query = "select count(n_nationkey), count(n_regionkey) from nation"
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_03"
        query = "select count(s_suppkey), count(s_nationkey) from supplier"
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_04"
        query = """select count(c_custkey), sum(c_acctbal),
                    sum(c_acctbal)/count(c_acctbal), min(c_custkey),
                    max(c_nationkey),
                    (max(c_nationkey) + min(c_nationkey))/2 c_nationkey
                    from customer
                    where c_custkey < 100 group by c_nationkey"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_05"
        query = """select c.c_custkey, c.c_nationkey, n.n_regionkey
                from customer as c inner join nation as n
                on c.c_nationkey = n.n_nationkey
                where n.n_regionkey = 1 and c.c_custkey < 50"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_06"
        query = """select c_custkey, c_nationkey, c_acctbal
                from customer order by c_nationkey, c_acctbal"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            0,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_07"
        query = """select c_custkey + c_nationkey, c_acctbal
                from customer order by 1, 2"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            0,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_08"
        query = """select n1.n_nationkey as supp_nation,
                n2.n_nationkey as cust_nation,
                l.l_extendedprice * l.l_discount
                from supplier as s
                inner join lineitem as l on s.s_suppkey = l.l_suppkey
                inner join orders as o on o.o_orderkey = l.l_orderkey
                inner join customer as c on c.c_custkey = o.o_custkey
                inner join nation as n1 on s.s_nationkey = n1.n_nationkey
                inner join nation as n2 on c.c_nationkey = n2.n_nationkey
                where n1.n_nationkey = 1
                and n2.n_nationkey = 2 and o.o_orderkey < 10000"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_09"
        query = """select c_custkey, c_nationkey as nkey
                from customer where c_custkey < 0 and c_nationkey >= 30"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_10"
        query = """select sin(c_acctbal), cos(c_acctbal),
                    sin(c_acctbal), acos(c_acctbal),
                    ln(c_acctbal), tan(c_acctbal),
                    atan(c_acctbal), floor(c_acctbal),
                    c_acctbal
                from customer"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_11"
        query = """select n1.n_nationkey as n1key,
                n2.n_nationkey as n2key, n1.n_nationkey + n2.n_nationkey
                from nation as n1 full outer join nation as n2
                on n1.n_nationkey = n2.n_nationkey + 6
                where n1.n_nationkey < 10 and n1.n_nationkey > 5"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_12"
        query = """select count(n1.n_nationkey) as n1key,
                    count(n2.n_nationkey) as n2key, count(*) as cstar
                from nation as n1 full outer join nation as n2
                on n1.n_nationkey = n2.n_nationkey + 6"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_13"
        query = """select o_orderkey, o_custkey from orders
                where o_orderkey < 10 and o_orderkey >= 1"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_14"
        query = """select 100168549 - sum(o_orderkey)/count(o_orderkey),
                    56410984 / sum(o_totalprice),
                    (123 - 945/max(o_orderkey)) / (sum(81619/o_orderkey) /
                    count(81619/o_orderkey))
                from orders"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_15"
        query = """select o_orderkey, sum(o_totalprice)/count(o_orderstatus)
                from orders where o_custkey < 100
                group by o_orderstatus, o_orderkey"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_16"
        query = """select o_orderkey, o_orderstatus
                from orders where o_custkey < 10
                and o_orderstatus <> 'O'
                order by o_orderkey, o_orderstatus
                limit 50"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_17"
        query = """select count(o_orderstatus)
                from orders where o_orderstatus <> 'O'"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_18"
        query = """select count(o_orderkey), sum(o_orderkey), o_clerk
                from orders where o_custkey < 1000
                group by o_clerk, o_orderstatus"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_19"
        query = """select sum(o_orderkey)/count(o_orderkey)
                from orders group by o_orderstatus"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_20"
        query = """select count(o_shippriority), sum(o_totalprice)
                from orders group by o_shippriority"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_21"
        query = """with regionTemp as ( select r_regionkey,
                 r_name from region where r_regionkey > 2 ),
                nationTemp as(select n_nationkey, n_regionkey as fkey,
                n_name from nation where n_nationkey > 3
                order by n_nationkey)
                select regionTemp.r_name, nationTemp.n_name
                from regionTemp inner join nationTemp on
                regionTemp.r_regionkey = nationTemp.fkey"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_22"
        query = """select o.o_totalprice, l.l_partkey
                from orders as o
                left outer join lineitem as l on o.o_custkey = l.l_linenumber
                and l.l_suppkey = o.o_orderkey where l.l_linenumber < 1000"""
        runTest.run_query_performance(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_23"
        query = """select o.o_orderkey, o.o_totalprice,
                l.l_partkey, l.l_returnflag from lineitem as l
                inner join orders as o on o.o_orderkey = l.l_orderkey
                inner join customer as c on c.c_custkey = o.o_custkey
                where c.c_custkey < 1000"""
        runTest.run_query_performance(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_24"
        query = """select o.o_orderkey, o.o_totalprice,
                l.l_partkey, l.l_linestatus from orders as o
                full outer join lineitem as l on
                l.l_orderkey = o.o_orderkey where o.o_orderkey < 1000"""
        runTest.run_query_performance(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        runTest.save_log()
Example #15
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # raised below if Drill is down
from bson.json_util import dumps

print("Content-type: text/html\n\n")

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
      SELECT sum(case when t.attributes.Ambience.romantic='true' then 1 else 0 end) as Romantic,
      sum(case when t.attributes.Ambience.intimate='true' then 1 else 0 end) as Intimate,
      sum(case when t.attributes.Ambience.classy='true' then 1 else 0 end) as Classy,
      sum(case when t.attributes.Ambience.hipster='true' then 1 else 0 end) as Hipster,
      sum(case when t.attributes.Ambience.divey='true' then 1 else 0 end) as Divey,
      sum(case when t.attributes.Ambience.touristy='true' then 1 else 0 end) as Touristy,
      sum(case when t.attributes.Ambience.trendy='true' then 1 else 0 end) as Trendy,
      sum(case when t.attributes.Ambience.upscale='true' then 1 else 0 end) as Upscale,
      sum(case when t.attributes.Ambience.casual='true' then 1 else 0 end) as Casual
       from `mongo.274_BI`.`yelp_dataset` t where true=repeated_contains(categories,'American') and t.stars>3.5 and t.city='Phoenix'


''')

print(dumps(yelp_reviews))
Example #16
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # raised below if Drill is down
from bson.json_util import dumps

print("Content-type: text/html\n\n")

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

city = "Las Vegas"

yelp_reviews = drill.query('''
  select sum(case when t.attributes.`Price Range`=1 then 1 else 0 end) as One,
  sum(case when t.attributes.`Price Range`=2 then 1 else 0 end) as Two,
  sum(case when t.attributes.`Price Range`=3 then 1 else 0 end) as Three ,
  sum(case when t.attributes.`Price Range`=4 then 1 else 0 end) as Four from `mongo.274_BI`.`yelp_dataset` t
  where t.city='Phoenix' and true=repeated_contains(categories,'Restaurants')
  order by count(*) asc

''')

print(dumps(yelp_reviews))

# pandas dataframe
Example #17
# IMPORTS
########################################################################################################################
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Event
from flask import Flask
import os
from pydrill.client import PyDrill
from tb.modules.trending_brand_query_engine.query_engine import tweets_per_minute, sentiment_query_engine, last_message_form_kafka
from kafka import KafkaConsumer
########################################################################################################################


consumer = KafkaConsumer('beer', group_id='ui2')
drill1 = PyDrill(host='localhost', port=8047)
drill2 = PyDrill(host='localhost', port=8047)

server = Flask('my app')
server.secret_key = os.environ.get('secret_key', 'secret')

app = dash.Dash('streaming-wind-app', server=server,
                url_base_pathname='/BEER/',
                csrf_protect=False)

app.layout = html.Div([
    html.Div([
        html.H2("Trending-Brand: Beer consumption"),
    ], className='banner'),
    html.Div([
        html.Div([
Example #18
def main():
    print("**init end2end**")
    Execution.getArgs()
    nvmlInit()
    dir_data_file = Settings.data["TestSettings"]["dataDirectory"]
    nRals = Settings.data["RunSettings"]["nRals"]

    drill = "drill"
    spark = "spark"
    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if (Settings.execution_mode == ExecutionMode.FULL and compareResults
            == "true") or Settings.execution_mode == ExecutionMode.GENERATOR:

        # Create Table Drill -----------------------------------------
        from pydrill.client import PyDrill

        drill = PyDrill(host="localhost", port=8047)
        createSchema.init_drill_schema(
            drill,
            Settings.data["TestSettings"]["dataDirectory"],
            bool_test=True)
        createSchema.init_drill_schema(
            drill,
            Settings.data["TestSettings"]["dataDirectory"],
            smiles_test=True,
            fileSchemaType=DataType.PARQUET)

        # Create Table Spark -------------------------------------------------
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.appName("allE2ETest").getOrCreate()
        createSchema.init_spark_schema(
            spark, Settings.data["TestSettings"]["dataDirectory"])
        createSchema.init_spark_schema(
            spark,
            Settings.data["TestSettings"]["dataDirectory"],
            smiles_test=True,
            fileSchemaType=DataType.PARQUET)

    targetTestGroups = Settings.data["RunSettings"]["targetTestGroups"]

    # only innerJoinsTest will be with progress bar
    useProgressBar = False
    if "innerJoinsTest" in targetTestGroups:
        useProgressBar = True

    print("Using progress bar: ", useProgressBar)

    # Create Context For BlazingSQL
    bc, dask_client = init_context(useProgressBar=useProgressBar)

    runAllTests = (
        len(targetTestGroups) == 0
    )  # if targetTestGroups was empty the user wants to run all the tests

    if runAllTests or ("hiveFileTest" in targetTestGroups):
        hiveFileTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("aggregationsWithoutGroupByTest" in targetTestGroups):
        aggregationsWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                            bc, nRals)

    if runAllTests or ("coalesceTest" in targetTestGroups):
        coalesceTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("columnBasisTest" in targetTestGroups):
        columnBasisTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("commonTableExpressionsTest" in targetTestGroups):
        commonTableExpressionsTest.main(dask_client, drill, dir_data_file, bc,
                                        nRals)

    if runAllTests or ("countDistinctTest" in targetTestGroups):
        countDistinctTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("countWithoutGroupByTest" in targetTestGroups):
        countWithoutGroupByTest.main(dask_client, drill, dir_data_file, bc,
                                     nRals)

    if runAllTests or ("dateTest" in targetTestGroups):
        dateTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("timestampTest" in targetTestGroups):
        timestampTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("toTimestampTest" in targetTestGroups):
        toTimestampTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("dayOfWeekTest" in targetTestGroups):
        dayOfWeekTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("fullOuterJoinsTest" in targetTestGroups):
        fullOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("groupByTest" in targetTestGroups):
        groupByTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("GroupByWitoutAggregations" in targetTestGroups):
        GroupByWitoutAggregations.main(dask_client, drill, dir_data_file, bc,
                                       nRals)

    if runAllTests or ("innerJoinsTest" in targetTestGroups):
        innerJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("crossJoinsTest" in targetTestGroups):
        crossJoinsTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("leftOuterJoinsTest" in targetTestGroups):
        leftOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("nonEquiJoinsTest" in targetTestGroups):
        nonEquiJoinsTest.main(dask_client, drill, spark, dir_data_file, bc,
                              nRals)

    # loadDataTest.main(dask_client, bc) #check this

    if runAllTests or ("nestedQueriesTest" in targetTestGroups):
        nestedQueriesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("orderbyTest" in targetTestGroups):
        orderbyTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("predicatesWithNulls" in targetTestGroups):
        predicatesWithNulls.main(dask_client, drill, spark, dir_data_file, bc,
                                 nRals)

    if runAllTests or ("stringTests" in targetTestGroups):
        stringTests.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("tablesFromPandasTest" in targetTestGroups):
        tablesFromPandasTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unaryOpsTest" in targetTestGroups):
        unaryOpsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unifyTablesTest" in targetTestGroups):
        unifyTablesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unionTest" in targetTestGroups):
        unionTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("useLimitTest" in targetTestGroups):
        useLimitTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("whereClauseTest" in targetTestGroups):
        whereClauseTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("bindableAliasTest" in targetTestGroups):
        bindableAliasTest.main(dask_client, drill, spark, dir_data_file, bc,
                               nRals)

    if runAllTests or ("booleanTest" in targetTestGroups):
        booleanTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("caseTest" in targetTestGroups):
        caseTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("castTest" in targetTestGroups):
        castTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("concatTest" in targetTestGroups):
        concatTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("literalTest" in targetTestGroups):
        literalTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("dirTest" in targetTestGroups):
        dirTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    # HDFS is not working yet
    # fileSystemHdfsTest.main(dask_client, drill, dir_data_file, bc)

    # HDFS is not working yet
    # mixedFileSystemTest.main(dask_client, drill, dir_data_file, bc)

    if runAllTests or ("likeTest" in targetTestGroups):
        likeTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("substringTest" in targetTestGroups):
        substringTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("stringCaseTest" in targetTestGroups):
        stringCaseTest.main(dask_client, drill, spark, dir_data_file, bc,
                            nRals)

    if runAllTests or ("wildCardTest" in targetTestGroups):
        wildCardTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("tpchQueriesTest" in targetTestGroups):
        tpchQueriesTest.main(dask_client, drill, spark, dir_data_file, bc,
                             nRals)

    if runAllTests or ("roundTest" in targetTestGroups):
        roundTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("fileSystemLocalTest" in targetTestGroups):
        fileSystemLocalTest.main(dask_client, drill, spark, dir_data_file, bc,
                                 nRals)

    if runAllTests or ("messageValidationTest" in targetTestGroups):
        messageValidationTest.main(dask_client, drill, dir_data_file, bc,
                                   nRals)

    testsWithNulls = Settings.data["RunSettings"]["testsWithNulls"]
    if testsWithNulls != "true":
        if Settings.execution_mode != ExecutionMode.GPUCI:
            if runAllTests or ("fileSystemS3Test" in targetTestGroups):
                fileSystemS3Test.main(dask_client, drill, dir_data_file, bc,
                                      nRals)

            if runAllTests or ("fileSystemGSTest" in targetTestGroups):
                fileSystemGSTest.main(dask_client, drill, dir_data_file, bc,
                                      nRals)

    if runAllTests or ("loggingTest" in targetTestGroups):
        loggingTest.main(dask_client, dir_data_file, bc, nRals)

    # timestampdiffTest.main(dask_client, spark, dir_data_file, bc, nRals)

    #TODO re enable this test once we have the new version of dask
    # https://github.com/dask/distributed/issues/4645
    # https://github.com/rapidsai/cudf/issues/7773
    #if runAllTests or ("smilesTest" in targetTestGroups):
    #    smilesTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if testsWithNulls != "true":
        if runAllTests or ("jsonTest" in targetTestGroups):
            jsonTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("windowFunctionTest" in targetTestGroups):
        windowFunctionTest.main(dask_client, drill, spark, dir_data_file, bc,
                                nRals)

    if runAllTests or ("windowNoPartitionTest" in targetTestGroups):
        windowNoPartitionTest.main(dask_client, drill, spark, dir_data_file,
                                   bc, nRals)

    if testsWithNulls != "true":
        if runAllTests or ("concurrentTest" in targetTestGroups):
            concurrentTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if testsWithNulls == "true":
        if Settings.execution_mode != ExecutionMode.GPUCI:
            if runAllTests or ("tablesFromSQL" in targetTestGroups):
                tablesFromSQL.main(dask_client, drill, dir_data_file, bc,
                                   nRals)

    # WARNING!!! This test must be the last one to run -----------------------
    if runAllTests or ("configOptionsTest" in targetTestGroups):
        configOptionsTest.main(dask_client, drill, spark, dir_data_file, bc,
                               nRals)

    if Settings.execution_mode != ExecutionMode.GENERATOR:

        result, error_msgs = runTest.save_log(
            Settings.execution_mode == ExecutionMode.GPUCI)

        max_delta = 0  # avoid shadowing the built-in max()
        for i in range(0, len(Settings.memory_list)):
            if Settings.memory_list[i].delta > max_delta:
                max_delta = Settings.memory_list[i].delta

        print("MAX DELTA: " + str(max_delta))
        print("""***********************************************************
              ********************""")

        for i in range(0, len(Settings.memory_list)):
            print(Settings.memory_list[i].name + ":" + "   Start Mem: " +
                  str(Settings.memory_list[i].start_mem) + "   End Mem: " +
                  str(Settings.memory_list[i].end_mem) + "   Diff: " +
                  str(Settings.memory_list[i].delta))

        return result, error_msgs

    return True, []
Example #19
    nvmlInit()

    drill = "drill"  # None
    spark = "spark"

    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if ((Settings.execution_mode == ExecutionMode.FULL
         and compareResults == "true")
            or Settings.execution_mode == ExecutionMode.GENERATOR):
        # Create Table Drill ------------------------------------------------
        from pydrill.client import PyDrill

        drill = PyDrill(host="localhost", port=8047)
        cs.init_drill_schema(drill,
                             Settings.data["TestSettings"]["dataDirectory"])

        # Create Table Spark -------------------------------------------------
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.appName("timestampTest").getOrCreate()
        cs.init_spark_schema(spark,
                             Settings.data["TestSettings"]["dataDirectory"])

    # Create Context For BlazingSQL

    bc, dask_client = init_context()

    nRals = Settings.data["RunSettings"]["nRals"]
Example #20
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # raised below if Drill is down
from bson.json_util import dumps

print("Content-type: text/html\n\n")

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
  select avg(t.latitude) as latitude, avg(t.longitude) as longitude from `mongo.274_BI`.`yelp_dataset` t
  where true=repeated_contains(categories,'Fast Food') and t.stars>3.5 and t.city='Pittsburgh'
''')

print(dumps(yelp_reviews))

# pandas dataframe
Example #21
    def setUp(self):
        self.drill = PyDrill(host='localhost', port=8047)
Example #22
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # raised below if Drill is down
import cgi
import json
from bson.json_util import dumps

print("Content-type: application/json\n\n")

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')
city = cgi.FieldStorage().value
#print [city]

city1 = json.loads(city)

query = "select sum(case when t.attributes.Ambience.romantic='true' then 1 else 0 end) as Romantic," + "sum(case when t.attributes.Ambience.intimate='true' then 1 else 0 end) as Intimate," + "sum(case when t.attributes.Ambience.classy='true' then 1 else 0 end) as Classy," + "sum(case when t.attributes.Ambience.hipster='true' then 1 else 0 end) as Hipster," + "sum(case when t.attributes.Ambience.divey='true' then 1 else 0 end) as Divey," + "sum(case when t.attributes.Ambience.touristy='true' then 1 else 0 end) as Touristy," + "sum(case when t.attributes.Ambience.trendy='true' then 1 else 0 end) as Trendy," + "sum(case when t.attributes.Ambience.upscale='true' then 1 else 0 end) as Upscale," + "sum(case when t.attributes.Ambience.casual='true' then 1 else 0 end) as Casual " + "from `mongo.274_BI`.`yelp_dataset`t where  true=repeated_contains(categories,'" + city1[
    "data"] + "')and t.stars>3.5 and t.city='" + city1["data2"] + "'"
#print query

yelp_reviews = drill.query(query)

print(dumps(yelp_reviews))
Example #23
#!/usr/bin/env python

from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # raised below if Drill is down

drill = PyDrill(host='10.32.48.136',
                port=48047,
                auth="jlim:2019Jfm!",
                use_ssl=False,
                verify_certs=False)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

tenants = drill.query('''
  SELECT * FROM
  dfs.`/tsys/qa/internal/data/maprdb/tenants`
  LIMIT 5
''')

for result in tenants:
    print(result)
Example #24
def pydrill_instance():
    drill = PyDrill()
    return drill
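A hedged sketch of how this fixture could be consumed under pytest; the decorator and the test body are assumptions, only the fixture body comes from the example.

import pytest
from pydrill.client import PyDrill

@pytest.fixture
def pydrill_instance():
    drill = PyDrill()  # client defaults apply when no host/port are passed
    return drill

def test_fixture_returns_client(pydrill_instance):
    assert isinstance(pydrill_instance, PyDrill)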
Example #25
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # raised below if Drill is down
from bson.json_util import dumps

print("Content-type: text/html\n\n")

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
  SELECT city , count(*) totalreviews FROM
  `mongo.274_BI`.`yelp_dataset` where true=repeated_contains(categories,'Restaurants')
  group by city order by count(*) desc limit 5
''')

print(dumps(yelp_reviews))

# pandas dataframe
Example #26
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # raised below if Drill is down
from bson.json_util import dumps

print("Content-type: text/html\n\n")

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

city = "Las Vegas"

yelp_reviews = drill.query('''
  SELECT sum(case when t.attributes.Parking.garage='true' then 1 else 0 end) as Garage,
  sum(case when t.attributes.Parking.street='true' then 1 else 0 end) as Street,
  sum(case when t.attributes.Parking.validated='true' then 1 else 0 end) as Validated,
  sum(case when t.attributes.Parking.valet='true' then 1 else 0 end) as Valet,
  sum(case when t.attributes.Parking.lot='true' then 1 else 0 end) as Lot
   FROM `mongo.274_BI`.`yelp_dataset` t where t.city = 'Charlotte' and true=repeated_contains(categories,'Restaurants')

''')

print(dumps(yelp_reviews))

# pandas dataframe
Example #27
    def process_result(self, sql, queue, projvartocols, coltotemplates, res_dict=None):
        c = 0
        try:
            if not self.drill.is_active():
                try:
                    self.drill = PyDrill(host=self.host, port=self.port)
                except Exception as ex:
                    print("Exception while connecting to Drill for query processing", ex)
                    return 0
            try:
                results = self.drill.query(sql, timeout=1000)
            except Exception as ex:
                print("Exception while running query to Drill for query processing", ex)
                return 0

            for row in results:
                c += 1
                # if res_dict is not None:
                #     rowtxt = ",".join(list(row.values()))
                #     if rowtxt in res_dict:
                #         continue
                #     else:
                #         res_dict.append(rowtxt)

                res = {}
                skip = False
                for r in row:
                    if row[r] == 'null':
                        skip = True
                        break
                    if '_' in r and r[:r.find("_")] in projvartocols:
                        s = r[:r.find("_")]
                        if s in res:
                            val = res[s]
                            if 'http://' in row[r]:
                                res[s] = row[r]
                            else:
                                res[s] = val.replace('{' + r[r.find("_") + 1:] + '}', row[r].replace(" ", '_'))
                        else:
                            if 'http://' in r:
                                res[s] = r
                            else:
                                res[s] = coltotemplates[s].replace('{' + r[r.find("_") + 1:] + '}',
                                                                   row[r].replace(" ", '_'))
                    elif r in projvartocols and r in coltotemplates:
                        if 'http://' in row[r]:
                            res[r] = row[r]
                        else:
                            res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}', row[r].replace(" ", '_'))
                    else:
                        res[r] = row[r]

                if not skip:
                    queue.put(res)
                    # if 'keggCompoundId' in res:
                    #     print(res['keggCompoundId'])
            return c
        except Exception as e:
            print("Exception while processing drill results", e, sql)
            logger.error(sql)
            logger.error("Exception while processing results:" + str(e))
            import traceback
            traceback.print_stack()
            return c
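The core of process_result() above is how it rebuilds URIs from column values and templates. A self-contained sketch of that mapping step with made-up data; the dict shapes mirror what the method receives, and the example.org template is purely illustrative.

# Made-up inputs mirroring the shapes process_result() works with.
row = {'city': 'Los Angeles'}                                     # one Drill result row
projvartocols = {'city': 'name_city'}                             # query var -> column
coltotemplates = {'city': 'http://example.org/city/{name_city}'}  # URI template

res = {}
for r in row:
    if r in projvartocols and r in coltotemplates:
        if 'http://' in row[r]:
            res[r] = row[r]  # value is already a full URI, keep it as-is
        else:
            # Fill the template placeholder; spaces become '_' as in the code above.
            res[r] = coltotemplates[r].replace(
                '{' + projvartocols[r] + '}', row[r].replace(' ', '_'))
    else:
        res[r] = row[r]

print(res)  # {'city': 'http://example.org/city/Los_Angeles'}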
Example #28
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # raised below if Drill is down
from bson.json_util import dumps

print("Content-type: text/html\n\n")

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
  SELECT sum(case when t.attributes.`Alcohol`<>'none' then 1 else 0 end) as Alcohol,
  sum(case when t.attributes.`Good For Groups`='true' then 1 else 0 end) as `Good For Groups`,
  sum(case when t.attributes.`Accepts Credit Cards`='true' then 1 else 0 end) as `Accept Credit Cards` ,
  sum(case when t.attributes.`Wi-Fi`='free' then 1 else 0 end) as `Wi-Fi`,
   sum(case when t.attributes.`Take-out`='true' then 1 else 0 end) as `Take-Out`,
   sum(case when t.attributes.`Coat Check`='true' then 1 else 0 end) as `Coat Check`,
   sum(case when t.attributes.`Takes Reservations`='true' then 1 else 0 end) as `Takes Reservations`,
   sum(case when t.attributes.`Wheelchair Accessible`='true' then 1 else 0 end) as `Wheelchair Accessible`,
   sum(case when t.attributes.`Outdoor Seating`='true' then 1 else 0 end) as `Outdoor Seating`,
   sum(case when t.attributes.`Dogs Allowed`='true' then 1 else 0 end) as `Dogs Allowed`,
   sum(case when t.attributes.`Smoking`='outdoor' then 1 else 0 end) as `Smoking`,
   sum(case when t.attributes.`Waiter Service`='true' then 1 else 0 end) as `Waiter Service`,
   sum(case when t.attributes.`Caters`='true' then 1 else 0 end) as `Caters`,
   sum(case when t.attributes.`Has TV`='true' then 1 else 0 end) as `Has TV`,
Example #29
class DrillWrapper(object):

    def __init__(self, datasource, config, rdfmts, star):
        self.datasource = datasource
        self.rdfmts = rdfmts
        self.url = datasource.url
        self.params = datasource.params
        self.config = config
        self.drill = None
        self.df = None
        self.result = None
        self.star = star
        self.query = None
        self.prefixes = {}
        if ':' in self.url:
            self.host, self.port = self.url.split(':')
        else:
            self.host = self.url
            self.port = '8047'

        if len(self.datasource.mappings) == 0:
            self.mappings = self.config.load_mappings(self.datasource.mappingfiles, self.rdfmts)
        else:
            # self.mappings = self.config.mappings
            self.mappings = self.datasource.mappings

    def executeQuery(self, query, queue=Queue(), limit=-1, offset=0):
        """
        Entry point for query execution on CSV files.
        :param query: query string
        :return:
        """
        from time import time
        # start = time()
        # print("Start:", start)
        if len(self.mappings) == 0:
            print("Empty Mapping")
            queue.put('EOF')
            return []
        # querytxt = query
        self.query = qp.parse(query)
        self.prefixes = getPrefs(self.query.prefs)

        query_filters = [f for f in self.query.body.triples[0].triples if isinstance(f, Filter)]

        if limit > -1 or offset > -1:
            self.query.limit = limit
            self.query.offset = offset

        sqlquery, projvartocols, coltotemplates, filenametablename = self.translate(query_filters)
        # print(sqlquery)
        # totalres = 0
        if sqlquery is None or len(sqlquery) == 0:
            queue.put("EOF")
            return []
        try:
            start = time()
            try:
                self.drill = PyDrill(host=self.host, port=self.port)
            except Exception as ex:
                print("Exception while connecting to Drill", ex)
                queue.put("EOF")
                return
            if not self.drill.is_active():
                print('Exception: Please run Drill first')
                queue.put("EOF")
                return
            # print("Drill Initialization cost:", time() - start)
            logger.info("Drill Initialization cost:" + str(time() - start))
            start = time()
            if isinstance(sqlquery, list):
                sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0]
                # more than three subqueries: merge them into a single UNION query
                if len(sqlquery) > 3:
                    sqlquery = " UNION ".join(sqlquery)
            if isinstance(sqlquery, list):
                # three or fewer subqueries: run each one in its own process
                # logger.info(" UNION ".join(sqlquery))
                processqueues = []
                processes = []
                res_dict = []
                for sql in sqlquery:
                    # processquery = Queue()
                    # self.run_union(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict)
                    # print(sql)
                    processquery = Queue()
                    processqueues.append(processquery)
                    p = Process(target=self.run_union, args=(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict,))
                    p.start()
                    processes.append(p)

                while len(processqueues) > 0:
                    toremove = []
                    try:
                        for q in processqueues:
                            if q.get(False) == 'EOF':
                                toremove.append(q)
                        # every remaining queue had an item this round; stop the workers
                        for p in processes:
                            if p.is_alive():
                                p.terminate()
                    except Exception:
                        # Queue.Empty: some worker has not reported yet; retry
                        pass
                    for q in toremove:
                        processqueues.remove(q)
                logger.info("Done running:")
                sw = " UNION ".join(sqlquery)
                logger.info(sw)
            else:
                card = 0
                # page through the result set in fixed-size windows
                limit = 1000
                if offset == -1:
                    offset = 0
                logger.info(sqlquery)
                # print(sqlquery)
                while True:
                    query_copy = sqlquery + " LIMIT " + str(limit) + " OFFSET " + str(offset)
                    cardinality = self.process_result(query_copy, queue, projvartocols, coltotemplates)
                    card += cardinality
                    if cardinality < limit:
                        break

                    offset = offset + limit
            # print("Exec in Drill took:", time() - start)
            logger.info("Exec in Drill took:" + str(time() - start))
        except Exception as e:
            print("Exception", e)
        # print('End:', time(), "Total results:", totalres)
        # print("Drill finished after: ", (time()-start))
        queue.put("EOF")

    def run_union(self, sql, queue, projvartocols, coltotemplates, limit, processqueue, res_dict):

        card = 0
        # page through the result set in fixed-size windows
        limit = 1000
        offset = 0
        while True:
            query_copy = sql + " LIMIT " + str(limit) + " OFFSET " + str(offset)
            cardinality = self.process_result(query_copy, queue, projvartocols, coltotemplates, res_dict)
            card += cardinality
            if cardinality < limit:
                break

            offset = offset + limit

        processqueue.put("EOF")

    def process_result(self, sql, queue, projvartocols, coltotemplates, res_dict=None):
        c = 0
        try:
            if not self.drill.is_active():
                try:
                    self.drill = PyDrill(host=self.host, port=self.port)
                except Exception as ex:
                    print("Exception while connecting to Drill for query processing", ex)
                    return 0
            try:
                results = self.drill.query(sql, timeout=1000)
            except Exception as ex:
                print("Exception while running query to Drill for query processing", ex)
                return 0

            for row in results:
                c += 1
                # if res_dict is not None:
                #     rowtxt = ",".join(list(row.values()))
                #     if rowtxt in res_dict:
                #         continue
                #     else:
                #         res_dict.append(rowtxt)

                res = {}
                skip = False
                for r in row:
                    if row[r] == 'null':
                        skip = True
                        break
                    if '_' in r and r[:r.find("_")] in projvartocols:
                        s = r[:r.find("_")]
                        if s in res:
                            val = res[s]
                            if 'http://' in row[r]:
                                res[s] = row[r]
                            else:
                                res[s] = val.replace('{' + r[r.find("_") + 1:] + '}', row[r].replace(" ", '_'))
                        else:
                            if 'http://' in row[r]:
                                res[s] = row[r]
                            else:
                                res[s] = coltotemplates[s].replace('{' + r[r.find("_") + 1:] + '}',
                                                                   row[r].replace(" ", '_'))
                    elif r in projvartocols and r in coltotemplates:
                        if 'http://' in row[r]:
                            res[r] = row[r]
                        else:
                            res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}', row[r].replace(" ", '_'))
                    else:
                        res[r] = row[r]

                if not skip:
                    queue.put(res)
                    # if 'keggCompoundId' in res:
                    #     print(res['keggCompoundId'])
            return c
        except Exception as e:
            print("Exception while processing drill results", e, sql)
            logger.error(sql)
            logger.error("Exception while processing results: " + str(e))
            import traceback
            traceback.print_exc()  # print the exception traceback, not just the call stack
            return c

    def get_so_variables(self, triples, proj):
        tvars = []
        for t in triples:
            if not t.subject.constant:
                tvars.append(t.subject.name)
            # note: the projection check is commented out, so all object
            # variables are collected, projected or not
            if not t.theobject.constant:  # and t.theobject.name in proj:
                tvars.append(t.theobject.name)

        return tvars

    def getsqlfil(self, l, r, op, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias):
        if r is not None and '?' in r.name:
            var = r.name
            val = l.name
        else:
            var = l.name
            val = r.name
        # print(val)
        if '(' in var and ')' in var:
            var = var[var.find('(') + 1:var.find(')')]

        if len(var_pred_map) == 0 or var == self.star['triples'][0].subject.name:
            subjcol = subjmap.value
            splits = subjcol.split('{')
            coltotemplates[var[1:]] = subjcol
            column = []
            for sp in splits[1:]:
                column.append(sp[:sp.find('}')])

            if len(column) > 1:
                objfilters = []
                for col in column:
                    vcolumn = "`" + col + '`'

                    if '<' in val and '>' in val:
                        val = val.replace('<', '').replace('>', '')
                    if '"' not in val and "'" not in val:
                        val = "'" + val + "'"

                    if op == 'REGEX':
                        val = "LOWER('%" + val[1:-1] + "%')"
                        objectfilter = 'LOWER(' + tablealias + '.' + vcolumn + ") LIKE " + val
                    else:
                        objectfilter = tablealias + '.' + vcolumn + op + val
                    objfilters.append(objectfilter)

                return " AND ".join(objfilters)
            elif len(column) == 1:
                column = "`" + column[0] + '`'

                if '<' in val and '>' in val:
                    val = val.replace('<', '').replace('>', '').replace(splits[0], '')

                if '"' not in val and "'" not in val:
                    val = "'" + val + "'"

                if op == 'REGEX':
                    val = "LOWER('%" + val[1:-1] + "%')"
                    objectfilter = 'LOWER(' + tablealias + '.' + column + ") LIKE " + val
                else:
                    objectfilter = tablealias + '.' + column + op + val

                return objectfilter
        if var not in var_pred_map:
            return None

        p = var_pred_map[var]
        pmap, omap = predicate_object_map[p]
        if omap.objectt.resource_type == TripleMapType.TEMPLATE:
            coltotemplates[var[1:]] = omap.objectt.value
            splits = omap.objectt.value.split('{')
            column = []
            for sp in splits[1:]:
                column.append(sp[:sp.find('}')])
            val = val.replace(splits[0], "")

            if len(column) == 1:
                column = column[0]

        elif omap.objectt.resource_type == TripleMapType.REFERENCE:
            column = omap.objectt.value
        else:
            column = []
        if isinstance(column, list):
            if len(column) > 0:
                column = column[0]
        column = "`" + column + '`'

        if '<' in val and '>' in val:
            val = val.replace('<', '').replace('>', '')
        if '"' not in val and "'" not in val:
            val = "'" + val + "'"

        if op == 'REGEX':
            val = "LOWER('%" + val[1:-1] + "%')"
            objectfilter = 'LOWER(' + tablealias + '.' + column + ") LIKE " + val
        else:
            objectfilter = tablealias + '.' + column + op + val

        return objectfilter

    def get_Expression_value(self, exp, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias):

        left = exp.left
        right = exp.right
        op = exp.op

        if op in unaryFunctor:
            if isinstance(left, Expression) and isinstance(left.left, Argument):
                left = left.left
                fil = self.getsqlfil(left, right, op, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias)
                return fil

        elif op in binaryFunctor:
            if op == 'REGEX' and right.desc is not False:
                if isinstance(left, Expression):
                    if 'xsd:string' in left.op:
                        left = left.left
                        fil = self.getsqlfil(left, right, op, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias)
                        return fil
                    # else:
                    #     left = self.get_Expression_value(left, var_pred_map, subjmap,predicate_object_map, coltotemplates, tablealias)
                    #     right = self.get_Expression_value(right, var_pred_map, subjmap, predicate_object_map, coltotemplates,tablealias)
                else:
                    fil = self.getsqlfil(left, right, op, var_pred_map, subjmap, predicate_object_map,
                                         coltotemplates, tablealias)
                    return fil
                # return op + "(" + str(left) + "," + right.name + "," + right.desc + ")"
            else:
                return op + "(" + str(left) + "," + str(right) + ")"

        elif right is None:
            return op + str(left)

        else:
            if isinstance(left, Argument) and isinstance(right, Argument):
                fil = self.getsqlfil(left, right, op, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias)
                return fil
            if isinstance(left, Expression) and isinstance(right, Expression):
                leftexp = self.get_Expression_value(left, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias)
                rightexp = self.get_Expression_value(right, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias)
                if op == '||' or op == '|':
                    if leftexp is None or rightexp is None:
                        return None
                    return '(' + leftexp + ' OR ' + rightexp + ')'
                else:
                    if leftexp is None or rightexp is None:
                        return None
                    return '(' + leftexp + ' AND ' + rightexp + ')'
            # print(op, type(left), left, type(right), right)
            return "(" + str(exp.left) + " " + exp.op + " " + str(exp.right) + ")"

    def get_obj_filter(self, f, var_pred_map, subjmap, predicate_object_map, coltotemplates, tablealias):
        return self.get_Expression_value(f.expr, var_pred_map, subjmap, predicate_object_map, coltotemplates,
                                         tablealias)

    def makeJoin(self, mapping_preds, query_filters):

        coltotemplates = {}
        projections = {}
        projvartocol = {}
        objectfilters = []
        constfilters = []
        fromclauses = []
        database_name = ""
        i = 0
        tm_tablealias = {}
        subjects = {}

        projvartocols = {}
        query = ""

        for tm, predicate_object_map in mapping_preds.items():
            sparqlprojected = set(self.get_so_variables(self.star['triples'], [c.name for c in self.query.args]))
            tablealias = 'Ontario_' + str(i)
            if isinstance(predicate_object_map, SubjectMap):
                subjvar = self.star['triples'][0].subject.name
                var = subjvar
                var_pred_map = {subjvar: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}
                if predicate_object_map.subject.resource_type == TripleMapType.TEMPLATE:
                    coltotemplates[subjvar[1:]] = predicate_object_map.subject.value
                    splits = predicate_object_map.subject.value.split('{')
                    column = []
                    for sp in splits[1:]:
                        column.append(sp[:sp.find('}')])
                    if len(column) == 1:
                        column = column[0]
                elif predicate_object_map.subject.resource_type == TripleMapType.REFERENCE:
                    column = predicate_object_map.subject.value
                else:
                    column = []
                if isinstance(column, list):
                    j = 0
                    for col in column:
                        if '[*]' in col:
                            col = col.replace('[*]', '')
                            vcolumn = "`" + col + '`'
                            projections[var[1:] + '_Ontario_' + str(j)] = "FLATTEN(" + tablealias + "." + vcolumn + ") AS " + var[1:] + '_Ontario_' + str(j)
                        else:
                            vcolumn = "`" + col + '`'
                            projections[var[1:] + '_Ontario_' + str(j)] = tablealias + "." + vcolumn + " AS " + var[1:] + '_Ontario_' + str(j)
                        projvartocol.setdefault(var[1:], []).append(col)
                        objectfilters.append(tablealias + '.' + vcolumn + " is not null ")
                        objectfilters.append(tablealias + '.' + vcolumn + " <> '' ")
                        j += 1
                else:
                    col = column
                    if '[*]' in col:
                        col = col.replace('[*]', '')
                        column = "`" + col + '`'
                        projections[var[1:]] = "FLATTEN(" + tablealias + "." + column + ") AS `" + var[1:] + '`'
                    else:
                        column = "`" + column + '`'
                        projections[var[1:]] = tablealias + "." + column + " AS `" + var[1:] + '`'
                    projvartocol[var[1:]] = col
                    objectfilters.append(tablealias + '.' + column + " is not null ")
                    objectfilters.append(tablealias + '.' + column + " <> '' ")
            else:
                var_pred_map = {var: pred for pred, var in self.star['predicates'].items() if
                                pred in predicate_object_map}
                column = []
                for var in sparqlprojected:
                    if var not in var_pred_map:
                        continue
                    p = var_pred_map[var]
                    pmap, omap = predicate_object_map[p]
                    if omap.objectt.resource_type == TripleMapType.TEMPLATE:
                        coltotemplates[var[1:]] = omap.objectt.value
                        splits = omap.objectt.value.split('{')
                        column = []
                        for sp in splits[1:]:
                            column.append(sp[:sp.find('}')])
                        if len(column) == 1:
                            column = column[0]
                    elif omap.objectt.resource_type == TripleMapType.REFERENCE:
                        column = omap.objectt.value
                    else:
                        column = []
                    if isinstance(column, list):
                        j = 0
                        for col in column:
                            if '[*]' in col:
                                col = col.replace('[*]', '')
                                vcolumn = "`" + col + '`'
                                projections[var[1:] + '_Ontario_' + str(j)] = "FLATTEN(" + tablealias + "." + vcolumn + ") AS " + var[1:] + '_Ontario_' + str(j)
                            else:
                                vcolumn = "`" + col + '`'
                                projections[var[1:] + '_Ontario_' + str(j)] = tablealias + "." + vcolumn + " AS `" + var[1:] + '_Ontario_' + str(j) + '`'

                            projvartocol.setdefault(var[1:], []).append(col)
                            objectfilters.append(tablealias + '.' + vcolumn + " is not null ")
                            objectfilters.append(tablealias + '.' + vcolumn + " <> '' ")
                            j += 1
                    else:
                        col = column
                        if '[*]' in col:
                            col = col.replace('[*]', '')
                            column = "`" + col + '`'
                            projections[var[1:]] = "FLATTEN(" + tablealias + "." + column + ") AS `" + var[1:] + '`'
                        else:
                            column = "`" + column + '`'
                            projections[var[1:]] = tablealias + "." + column + " AS `" + var[1:] + '`'
                        projvartocol[var[1:]] = col
                        objectfilters.append(tablealias + '.' + column + " is not null ")
                        objectfilters.append(tablealias + '.' + column + " <> '' ")
            for f in query_filters:
                #if len(set(f.getVars()).intersection(list(var_pred_map.keys()))) == len(set(f.getVars())):
                fil = self.get_obj_filter(f, var_pred_map, self.mappings[tm].subject_map.subject, predicate_object_map, coltotemplates, tablealias)
                if fil is not None and len(fil) > 0:
                    constfilters.append(fil)
            tm_tablealias[tablealias] = tm

            triplemap = self.mappings[tm]
            subjects[tm] = triplemap.subject_map.subject

            logicalsource = triplemap.logical_source
            data_source = logicalsource.data_source
            # tablename = data_source.name
            # database_name = logicalsource.iterator  #TODO: this is not correct, only works for LSLOD-Custom experiment
            database_name = data_source.name
            if '/' in database_name:
                database_name = database_name.split('/')[-1]
            tablename = data_source.name
            # TODO: change the paths, this works only for LSLOD-experiment
            if self.datasource.dstype == DataSourceType.LOCAL_TSV:
                # fileext = 'dfs.`/data/tsv/' + database_name + '/' + tablename + '.tsv`'
                fileext = 'dfs.`/data/tsv/' + tablename + '`'
            elif self.datasource.dstype == DataSourceType.LOCAL_CSV:
                # fileext = 'dfs.`/data/csv/' + database_name + '/' + tablename + '.csv`'
                fileext = 'dfs.`/data/csv/' + tablename + '`'
            elif self.datasource.dstype == DataSourceType.LOCAL_JSON:
                # fileext = 'dfs.`/data/json/' + database_name + '/' + tablename + '.json`'
                fileext = 'dfs.`/data/json/' + tablename + '`'
            elif self.datasource.dstype == DataSourceType.HADOOP_TSV:
                # fileext = 'hdfs.`/user/kemele/data/tsv/' + database_name + '/' + tablename + '.tsv`'
                fileext = 'hdfs.`/user/kemele/data/tsv/' + tablename + '`'
            elif self.datasource.dstype == DataSourceType.HADOOP_CSV:
                # fileext = 'hdfs.`/user/kemele/data/csv/' + database_name + '/' + tablename + '.csv`'
                fileext = 'hdfs.`/user/kemele/data/csv/' + tablename + '`'
            elif self.datasource.dstype == DataSourceType.HADOOP_JSON:
                # fileext = 'hdfs.`/user/kemele/data/json/' + database_name + '/' + tablename + '.json`'
                fileext = 'hdfs.`/user/kemele/data/json/' + tablename + '`'
            else:
                fileext = ''

            fromclauses.append(fileext + ' ' + tablealias)
            i += 1

            for var, p in var_pred_map.items():

                if '?' not in var:
                    pmap, omap = predicate_object_map[p]

                    if omap.objectt.resource_type == TripleMapType.TEMPLATE:
                        # omap.objectt.value
                        splits = omap.objectt.value.split('{')
                        column = []
                        for sp in splits[1:]:
                            column.append(sp[:sp.find('}')])

                        var = var.replace(splits[0], '').replace('}', '')
                        if '<' in var and '>' in var:
                            var = var[1:-1]
                        var = "'" + var + "'"
                    elif omap.objectt.resource_type == TripleMapType.REFERENCE:
                        column = omap.objectt.value
                        if "'" not in var and '"' not in var:
                            var = "'" + var + "'"
                        if '"' in var:
                            var = "'" + var[1:-1] + "'"
                    else:
                        column = []
                    if isinstance(column, list):
                        j = 0
                        for col in column:
                            vcolumn = "`" + col + '`'
                            constfilters.append(tablealias + "." + vcolumn + " = " + var)
                            j += 1
                    else:
                        column = "`" + column + '`'
                        constfilters.append(tablealias + "." + column + " = " + var)

        subj = self.star['triples'][0].subject.name if not self.star['triples'][0].subject.constant else None
        invalidsubj = False
        if subj is not None:
            filtersadded = []
            for tm, subject in subjects.items():
                subjcol = subject.value
                tablealias = [v for v in tm_tablealias if tm_tablealias[v] == tm][0]
                splits = subjcol.split('{')
                coltotemplates[subj[1:]] = subjcol
                column = []
                for sp in splits[1:]:
                    column.append(sp[:sp.find('}')])

                if len(column) > 1:
                    j = 0
                    for col in column:
                        vcolumn = "`" + col + '`'
                        projections[subj[1:] + '_Ontario_' + str(j)] = tablealias + "." + vcolumn + " AS `" + subj[1:] + '_Ontario_' + str(j) + '`'
                        projvartocol.setdefault(subj[1:], []).append(col)
                        objectfilters.append(tablealias + '.' + vcolumn + " is not null ")
                        objectfilters.append(tablealias + '.' + vcolumn + " <> '' ")
                        j += 1
                elif len(column) == 1:
                    col = column[0]
                    column = "`" + col + '`'
                    projections[subj[1:]] = tablealias + "." + column + " AS `" + subj[1:] + '`'
                    projvartocol[subj[1:]] = col

                    objectfilters.append(tablealias + '.' + column + " is not null ")
                    objectfilters.append(tablealias + '.' + column + " <> '' ")
        else:
            subj = self.star['triples'][0].subject.name
            for tm, subject in subjects.items():
                subjcol = subject.value
                tablealias = [v for v in tm_tablealias if tm_tablealias[v] == tm][0]
                splits = subjcol.split('{')
                column = []
                for sp in splits[1:]:
                    column.append(sp[:sp.find('}')])

                if len(splits[0]) > 0 and splits[0] not in subj:
                    invalidsubj = True
                    break
                var = subj.replace(splits[0], '').replace('}', '')

                if '<' in var and '>' in var:
                    var = var[1:-1]
                var = "'" + var + "'"
                # if isinstance(column, list):
                j = 0
                for col in column:
                    vcolumn = "`" + col + '`'
                    constfilters.append(tablealias + "." + vcolumn + " = " + var)
                    j += 1
        if invalidsubj:
            mapping_preds = []

        if len(subjects) > 1:
            aliases = list(tm_tablealias.keys())
            raliases = aliases.copy()
            raliases.reverse()
            compared = []
            for a1 in aliases:
                for a2 in aliases:
                    if a1 + a2 in compared:
                        continue
                    if a1 == a2:
                        continue
                    compared.append(a1 + a2)
                    compared.append(a2 + a1)
                    subj1 = subjects[tm_tablealias[a1]].value
                    subj2 = subjects[tm_tablealias[a2]].value
                    column1 = None
                    column2 = None
                    splits = subj1.split('{')
                    for sp in splits:
                        if '}' in sp:
                            column1 = sp[:sp.find('}')]
                            break
                    splits = subj2.split('{')
                    for sp in splits:
                        if '}' in sp:
                            column2 = sp[:sp.find('}')]
                            break
                    column1 = '`' + column1 + '`'
                    column2 = '`' + column2 + '`'
                    if column1 == column2:
                        objectfilters.append(a1 + '.' + column1 + "=" + a2 + "." + column2)
        objectfilters.extend(constfilters)
        if len(mapping_preds) > 0:
            fromclause = "\n FROM " + ", ".join(list(set(fromclauses)))
            distinct = ""
            if self.query.distinct:
                distinct = "DISTINCT "
            projections = " SELECT  " + distinct + ", ".join(list(set(projections.values())))
            if len(objectfilters) > 0:
                whereclause = "\n WHERE " + "\n\t AND ".join(list(set(objectfilters)))
            else:
                whereclause = ""

            sqlquery = projections + " " + fromclause + " " + whereclause
            return sqlquery, projvartocol, coltotemplates, database_name

        return query, projvartocols, coltotemplates, database_name

    def makeunion(self, tounions, query_filters, subjectunions=False):

        coltotemplates = {}
        projvartocols = {}
        database_name = ""
        unions = []
        rdfmts = list(tounions.keys())
        rdfmts = list(reversed(sorted(rdfmts)))
        # print(rdfmts)
        for rdfmt in rdfmts:
            mappingpreds = tounions[rdfmt]
            if subjectunions:
                for tm, submaps in mappingpreds.items():
                    un, projvartocols, coltotemplates, database_name = self.makeJoin({tm: submaps}, query_filters)
                    if un is not None and len(un) > 0:
                        unions.append(un)
            else:
                un, projvartocols, coltotemplates, database_name = self.makeJoin(mappingpreds, query_filters)
                if un is not None and len(un) > 0:
                    unions.append(un)

        #query = " UNION ".join(unions)
        # print(query)
        return unions, projvartocols, coltotemplates, database_name

    def translate(self, query_filters):
        rdfmts = self.star['rdfmts']
        starpreds = list(self.star['predicates'].keys())
        star_preds = [p for p in starpreds if '?' not in p]
        if 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' in star_preds:
            star_preds.remove('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')

        touninon = {}
        completematch = {}

        if len(star_preds) == 0:
            subjectonly = False
            for tm, triplemap in self.mappings.items():
                for rdfmt in triplemap.subject_map.rdf_types:
                    if rdfmt in rdfmts:
                        if 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' in starpreds:
                            touninon.setdefault(rdfmt, {})[tm] = triplemap.subject_map
                            subjectonly = True
                        else:
                            touninon.setdefault(rdfmt, {})[tm] = triplemap.predicate_object_map
            if len(touninon) > 1 or subjectonly:
                return self.makeunion(touninon, query_filters, subjectonly)
            elif len(touninon) == 1:
                query, projvartocols, coltotemplates, database_name = self.makeJoin(touninon[list(touninon.keys())[0]],
                                                                                    query_filters)
                return query, projvartocols, coltotemplates, database_name
            else:
                return None, None, None, None
        else:
            mapping_preds = {tm: triplemap for tm, triplemap in self.mappings.items() for p in star_preds if p in triplemap.predicate_object_map}
            for tm, triplemap in mapping_preds.items():
                for rdfmt in triplemap.subject_map.rdf_types:
                    if rdfmt in rdfmts and len(set(star_preds).intersection(list(triplemap.predicate_object_map.keys()))) == len(set(star_preds)):
                        completematch[rdfmt] = {}
                        completematch[rdfmt][tm] = triplemap.predicate_object_map
                    if rdfmt in rdfmts and len(set(star_preds).intersection(list(triplemap.predicate_object_map.keys()))) > 0:
                        touninon.setdefault(rdfmt, {})[tm] = triplemap.predicate_object_map
            if len(completematch) > 0:
                if len(completematch) == 1:
                    query, projvartocols, coltotemplates, database_name = self.makeJoin(
                        touninon[list(touninon.keys())[0]], query_filters)
                    return query, projvartocols, coltotemplates, database_name
                else:
                    return self.makeunion(completematch, query_filters)
            elif len(touninon) > 1:
                return self.makeunion(touninon, query_filters)
            elif len(touninon) == 1:
                query, projvartocols, coltotemplates, database_name = self.makeJoin(touninon[list(touninon.keys())[0]],
                                                                                    query_filters)
                return query, projvartocols, coltotemplates, database_name
            else:
                return None, None, None, None
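
# A standalone sketch of the LIMIT/OFFSET pagination pattern that executeQuery
# and run_union above rely on: request fixed-size windows from Drill until one
# comes back short. fetch_all and page_size are illustrative names.
from pydrill.client import PyDrill

def fetch_all(drill, sql, page_size=1000):
    offset = 0
    while True:
        page = list(drill.query(sql + " LIMIT " + str(page_size) + " OFFSET " + str(offset), timeout=1000))
        for row in page:
            yield row
        if len(page) < page_size:
            break  # a short page means the result set is exhausted
        offset += page_size

# usage: for row in fetch_all(PyDrill(host='localhost', port=8047), "SELECT ..."): ...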
Exemple #32
0
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # needed for the is_active() check below (assumed location)
import json
from bson.json_util import dumps
from bson import json_util
#print "Content-type: application/json\n\n";
print "Content-type: text/html\n\n"
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

yelp_reviews = drill.query('''
  select count(*) as `count` from `mongo.274_BI`.`yelp_dataset` t where true=repeated_contains(categories,'Restaurants') and t.city='Charlotte'
''')

print dumps(yelp_reviews.rows)  # dump the row dicts; the Result object itself is not JSON-serializable

# pandas dataframe
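# The dangling "pandas dataframe" note above presumably refers to PyDrill's
# to_dataframe() helper (used in a later example); a minimal sketch:
df = yelp_reviews.to_dataframe()
print(df)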
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # needed for the is_active() check below (assumed location)
import json
from bson.json_util import dumps
from bson import json_util
#print "Content-type: application/json\n\n";
print "Content-type: text/html\n\n"
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient

drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

city = "Las Vegas"

yelp_reviews = drill.query('''
  select categories[0] as category, count(categories[0]) as number from `dfs.yelp`.`yelp_academic_dataset_business.json`
   where true=repeated_contains(categories,'Restaurants') and city='Charlotte' group by categories[0]
    order by count(categories[0]) desc limit 5

''')

print dumps(yelp_reviews.rows)  # dump the row dicts; the Result object itself is not JSON-serializable

# pandas dataframe
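# The `city` variable above is never used; the query hardcodes 'Charlotte'.
# A hedged sketch interpolating it instead (naive string formatting, no
# escaping -- acceptable for a trusted constant, not for user input):
sql = '''
  select categories[0] as category, count(categories[0]) as number
  from `dfs.yelp`.`yelp_academic_dataset_business.json`
  where true=repeated_contains(categories,'Restaurants') and city='{0}'
  group by categories[0] order by count(categories[0]) desc limit 5
'''.format(city)
city_reviews = drill.query(sql)
print dumps(city_reviews.rows)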
Exemple #34
0
    Execution.getArgs()
    
    nvmlInit()

    drill = "drill" #None
    spark = "spark"

    compareResults = True
    if 'compare_results' in Settings.data['RunSettings']:
        compareResults = Settings.data['RunSettings']['compare_results']

    # note: compare_results comes from the settings file as a string, hence the "true" comparison
    if (Settings.execution_mode == ExecutionMode.FULL and compareResults == "true") or Settings.execution_mode == ExecutionMode.GENERATOR:
        # Create Table Drill ------------------------------------------------------------------------------------------------------
        from pydrill.client import PyDrill
        drill = PyDrill(host = 'localhost', port = 8047)
        cs.init_drill_schema(drill, Settings.data['TestSettings']['dataDirectory'])

        # Create Table Spark ------------------------------------------------------------------------------------------------------
        spark = SparkSession.builder.appName("timestampTest").getOrCreate()
        cs.init_spark_schema(spark, Settings.data['TestSettings']['dataDirectory'])

    #Create Context For BlazingSQL
    
    bc, dask_client = init_context()

    nRals = Settings.data['RunSettings']['nRals']

    main(dask_client, drill, spark, Settings.data['TestSettings']['dataDirectory'], bc, nRals)
    
    if Settings.execution_mode != ExecutionMode.GENERATOR:
def get_column_types(query):

    drill = PyDrill(host='localhost', port=8049)
    data = drill.query(query)
    columns = data.columns
    types = {}

    formattedQuery = sqlparse.format(query,
                                     reindent=True,
                                     keyword_case='upper')
    print(formattedQuery)
    formattedQuery = formattedQuery.split('\n')
    inSelect = False
    inSubquery = False
    inFromClause = False
    fields = []

    fieldRegex = r'\s{7}\S'
    subqueryFieldPattern = r'\s{2,3}\S'
    subqueryField = ""

    fromClause = ""
    functionPattern = r'\s+(\S+)\('

    fieldCount = 0
    for line in formattedQuery:
        functionMatchObject = re.match(functionPattern, line)

        if line.startswith('SELECT'):
            inSelect = True
            line = line.replace('SELECT', '')

            line = line.strip()
            #remove trailing comma
            if len(line) > 0:
                if line[-1:] == ",":
                    line = line[:-1]
            fields.append(line)

        # If the line is a function, assign the correct return type
        elif inSelect and inFromClause == False and functionMatchObject:
            print("FieldCount: " + str(fieldCount) + " " + line)
            functionCandidate = functionMatchObject.group(1)
            functionCandidate = functionCandidate.upper()
            if functionCandidate in _BIG_INT_FUNCTIONS:
                types[columns[fieldCount]] = "bigint"

            elif functionCandidate in _INT_FUNCTIONS:
                types[columns[fieldCount]] = "integer"

            elif functionCandidate in _FLOAT_FUNCTIONS:
                types[columns[fieldCount]] = "float"

            else:
                types[columns[fieldCount]] = "varchar"

            fieldCount += 1
            continue

        # Case for a regular field
        elif inSelect == True and re.match(fieldRegex, line):
            line = line.strip()
            # remove trailing comma from field name
            if len(line) > 0:
                if line[-1:] == ",":
                    line = line[:-1]
            fields.append(line)

        elif inSelect == True and line.startswith('FROM'):
            inSelect = False
            inFromClause = True
            if inSubquery:
                fields.append(subqueryField)
                inSubquery = False
            else:
                fromClause = fromClause + " " + line.strip()

        elif inFromClause == True and (line.startswith('WHERE')
                                       or line.startswith('GROUP')
                                       or line.startswith('ORDER')
                                       or line.startswith('HAVING')):
            inFromClause = False
            inSelect = False

        elif re.match(subqueryFieldPattern,
                      line) and inSubquery == False and inFromClause == False:
            inSubquery = True
            subqueryField = line.strip()

        elif inSubquery == True and not line:
            # an empty line closes the current subquery field
            # (the original `line == False` test could never be true for a string)
            inSubquery = False
            fields.append(subqueryField)
            subqueryField = ""

        elif inSubquery == True:
            subqueryField = subqueryField + " " + line.strip()
            if line.endswith(','):
                inSubquery = False
                fields.append(subqueryField)
                subqueryField = ""

        elif inFromClause == True:
            fromClause = fromClause + " " + line.strip()

        fieldCount += 1

    typeQuery = "SELECT"
    fieldCount = 0
    aliasPattern = r'AS\s`?[a-zA-Z_][a-zA-Z0-9-_$` ]*$'
    for field in fields:
        if re.search(aliasPattern, field):
            field = re.sub(aliasPattern, '', field)

        if fieldCount > 0:
            typeQuery += ","
        typeQuery = typeQuery + " " + field + " AS " + columns[
            fieldCount] + ", typeof( " + field + ") AS " + columns[
                fieldCount] + "_type"
        fieldCount += 1

    typeQuery += fromClause
    typeQuery += " LIMIT 1"
    typeQuery = sqlparse.format(typeQuery, reindent=True, keyword_case='upper')

    print(typeQuery)
    fieldQueryResult = drill.query(typeQuery).to_dataframe()
    tempTypes = fieldQueryResult.T.to_dict()[0]

    for column in columns:
        if column not in types.keys():
            types[column] = tempTypes[column + "_type"]

    print(types)
    return types
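
# Hypothetical usage of get_column_types, assuming a Drillbit reachable on
# localhost:8049 (the port hardcoded above) and Drill's bundled cp sample data:
column_types = get_column_types("SELECT full_name, salary FROM cp.`employee.json`")
# expected shape: a dict mapping each result column to a type name, e.g. 'varchar'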
#!/usr/local/bin/python
from pydrill.client import PyDrill
from pydrill.exceptions import ImproperlyConfigured  # needed for the is_active() check below (assumed location)
import json
from bson.json_util import dumps
from bson import json_util
#print "Content-type: application/json\n\n";
print "Content-type: text/html\n\n";
#print """<p>hehy</p>"""
#drill = PyDrill(host='localhost', port=8047)
from pymongo import MongoClient
drill = PyDrill(host='localhost', port=8047)

if not drill.is_active():
    raise ImproperlyConfigured('Please run Drill first')

city="Las Vegas"

yelp_reviews = drill.query('''
  select sum(case when t.stars=1.0 then 1 else 0 end) as `1`,
  sum(case when t.stars=2.0 then 1 else 0 end) as `2`,
  sum(case when t.stars=2.5 then 1 else 0 end) as `3`,
   sum(case when t.stars=3.0 then 1 else 0 end) as `4`,
   sum(case when t.stars=3.5 then 1 else 0 end) as `5`,
   sum(case when t.stars=4.0 then 1 else 0 end) as `6`,
   sum(case when t.stars=4.5 then 1 else 0 end) as `7`,
   sum(case when t.stars=5.0 then 1 else 0 end) as `8`
   from `mongo.274_BI`.`yelp_dataset` t where t.city='Pittsburgh' and true=repeated_contains(categories,'Restaurants')

''')

print dumps(yelp_reviews.rows)  # dump the row dicts; the Result object itself is not JSON-serializable
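
# The aliases `1`..`8` above are ordinal bucket labels, not star values. A
# hedged sketch mapping them back to the stars each bucket counts (note the
# query defines no 1.5-star bucket):
star_values = [1.0, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
row = yelp_reviews.rows[0]
histogram = dict((star_values[i], row[str(i + 1)]) for i in range(8))
print(histogram)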

# assumed imports for this example: pydrill, pandas, and a local queryDictionary module
from pydrill.client import PyDrill
from pandas import DataFrame
import datetime

import queryDictionary


class DataService:
    __drill = None
    # assumes QueryDictionary exposes its query builders as static methods
    __dictionary = queryDictionary.QueryDictionary

    def __init__(self):
        self.init_drill_connection()

    def init_drill_connection(self):
        self.__drill = PyDrill(host='localhost')
        is_drill_active = self.__drill.is_active()

        if is_drill_active:
            print("Drill is active: %s" % is_drill_active)
        else:
            print('''
            Drill is not active. Start your server in a terminal using command:
            cd /Users/Aymeric/apache-drill-1.5.0
            bin/drill-embedded
            ''')

    def get_users(self):
        query = self.__dictionary.get_users()
        results = self.__drill.query(query)
        return self.get_frame(results)

    def get_elite_users(self):
        query = self.__dictionary.get_elite()
        results = self.__drill.query(query)
        return self.get_frame(results)

    def get_elite_users_count(self):
        query = self.__dictionary.get_elite_count()
        results = self.__drill.query(query)
        return self.get_frame(results)

    def get_elite_users_tip(self):
        query = self.__dictionary.get_elite_tip()
        results = self.__drill.query(query)
        return self.get_frame(results)

    def get_elite_users_review(self):
        query = self.__dictionary.get_elite_review()
        results = self.__drill.query(query, 30)
        return self.get_frame(results)

    def get_restaurant_review(self):
        query = self.__dictionary.get_review()
        results = self.__drill.query(query, 30)
        return self.get_frame(results)

    def get_featureset1_but_votes(self):
        query = self.__dictionary.get_featureset1_but_votes()
        results = self.__drill.query(query, 300)
        return self.get_frame(results)

    def get_user_review(self, review_id):
        query = self.__dictionary.get_user_review(review_id)
        results = self.__drill.query(query, 30)
        return self.get_frame(results)

    def review_dates(self):
        query = self.__dictionary.review_dates()
        print('started on: ' + str(datetime.datetime.now()))
        results = self.__drill.query(query, 600)
        print('finished on: ' + str(datetime.datetime.now()))
        return self.get_frame(results)

    @staticmethod
    def get_frame(results, number_of_records=None):
        # note: number_of_records is currently ignored; all rows are returned
        return DataFrame(data=results.rows, columns=results.columns)

    @staticmethod
    def print_frame(frame, records_to_display=None):
        # frame = DataService.get_frame(results, records_to_display)
        print("\n")

        if records_to_display is None:
            print(frame.to_string(justify='right'))
        else:
            print(frame[:records_to_display].to_string(justify='left'))

    def main(self):
        self.print_frame(self.get_elite_users(), 10)
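
# Minimal driver for DataService, assuming a running Drillbit on localhost
# and that the queryDictionary module is importable:
if __name__ == '__main__':
    DataService().main()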