Exemple #1
0
    def execute_statement(
        self,
        bql_statement_ast,
        pretty=True,
        timing=False,
        plots=None,
        yes=False,
        debug=False,
        pandas_df=None,
        pandas_output=True,
        key_column=None,
    ):
        """
        Accepts a SINGLE BQL STATEMENT as input, parses it, and executes it if it was parsed
        successfully.

        Parameters:
            bql_statement_ast: the statement, handed to self.parser.parse_single_statement.
            pretty: if True, the command output is pretty-printed as a string; if False, it is
                returned as a python object.
            timing: if True, prints how long the command took to execute.
            plots: if True, commands with visual results are displayed as graphics rather than
                only pretty-printed as text. If None, plotting is enabled when the DISPLAY
                environment variable is set. (Graphics are also saved if the user added
                SAVE TO <filename> to the BQL.)
            yes: if True, skips every interactive confirmation prompt.
            debug: if True, parser exceptions propagate unchanged instead of being wrapped in
                utils.BayesDBParseError; the flag is also forwarded to the engine call.
            pandas_df: for create_btable, a data frame to read instead of the csv path.
            pandas_output: if True and the result has both 'data' and 'columns', the result is
                converted with data_utils.construct_pandas_df before being returned.
            key_column: forwarded to the engine as args_dict['key_column'] for create_btable.

        Returns:
            None if the statement could not be parsed; a dict with a 'message' key when the
            statement is an execute_file request or the user cancels at a prompt; the engine's
            error message (pretty) or error dict (not pretty) on engine error; otherwise the
            pretty-printed string, the pandas data frame, or the raw result dict.

        Raises:
            utils.BayesDBParseError: if parsing raises and debug is False.
            utils.BayesDBError: if a '.pkl.gz' models file cannot be read.
        """
        if timing:
            start_time = time.time()

        ##TODO move pyparsing objects out of client into parser
        if debug:
            parser_out = self.parser.parse_single_statement(bql_statement_ast)
        else:
            try:
                parser_out = self.parser.parse_single_statement(bql_statement_ast)
            except Exception as e:
                raise utils.BayesDBParseError(str(e))
        if parser_out is None:
            print("Could not parse command. Try typing 'help' for a list of all commands.")
            return
        elif not parser_out:
            return

        method_name, args_dict, client_dict = parser_out
        if client_dict is None:
            client_dict = {}

        ## Do stuff now that you know the user's command, but before passing it to engine.
        if method_name == "execute_file":
            # Close the script file deterministically instead of leaking the handle.
            with open(args_dict["filename"], "r") as bql_file:
                bql_string = bql_file.read()
            return dict(message="execute_file", bql_string=bql_string)
        elif (method_name == "drop_btable") and (not yes):
            ## If dropping something, ask for confirmation.
            print("Are you sure you want to permanently delete this btable, and all associated models, without any way to get them back? Enter 'y' if yes.")
            user_confirmation = raw_input()
            if "y" != user_confirmation.strip():
                return dict(message="Operation canceled by user.")
        elif (method_name == "drop_models") and (not yes):
            ## If dropping something, ask for confirmation.
            print("Are you sure you want to permanently delete model(s), without any way to get them back? Enter 'y' if yes.")
            user_confirmation = raw_input()
            if "y" != user_confirmation.strip():
                return dict(message="Operation canceled by user.")
        elif method_name == "load_models":
            # SECURITY NOTE(review): pickle.load executes arbitrary code embedded in the file.
            # Only load model files that come from a trusted source.
            def load_pickle(path, opener):
                # Resolve the path, unpickle, and always close the handle.
                with opener(self.parser.get_absolute_path(path), "rb") as handle:
                    return pickle.load(handle)

            pklpath = client_dict["pkl_path"]
            try:
                models = load_pickle(pklpath, gzip.open)
            except IOError:
                if pklpath.endswith(".pkl.gz"):
                    raise utils.BayesDBError("Models file %s could not be found." % pklpath)
                if pklpath.endswith(".pkl"):
                    # Not gzipped: fall back to reading a plain pickle.
                    models = load_pickle(pklpath, open)
                else:
                    # No recognised extension: retry with the default one appended.
                    pklpath = pklpath + ".pkl.gz"
                    models = load_pickle(pklpath, gzip.open)
            args_dict["models"] = models
        elif method_name == "create_btable":
            if pandas_df is None:
                header, rows = data_utils.read_csv(client_dict["csv_path"])
            else:
                header, rows = data_utils.read_pandas_df(pandas_df)
            args_dict["header"] = header
            args_dict["raw_T_full"] = rows
            args_dict["key_column"] = key_column
            args_dict["subsample"] = False

            # Display warning messages and get confirmation if btable is too large.
            # Ask user if they want to turn on subsampling.
            max_columns = 200
            max_rows = 1000
            max_cells = 100000
            if not yes:
                num_rows = len(rows)
                # An empty table has no first row to measure; treat it as zero columns.
                num_columns = len(rows[0]) if num_rows else 0
                # The checks run in increasing priority: a later match replaces the message.
                message = None
                if num_columns > max_columns:
                    message = (
                        "The btable you are uploading has %d columns, but BayesDB is currently designed to support only %d columns. If you proceed, performance may suffer unless you set many columns' datatypes to 'ignore'. Would you like to continue? Enter 'y' if yes."
                        % (num_columns, max_columns)
                    )
                if num_rows > max_rows:
                    message = (
                        "The btable you are uploading has %d rows, but BayesDB is currently designed to support only %d rows. If you proceed, performance may suffer. Would you like to continue? Enter 'y' to continue without subsampling, 'n' to abort, 's' to continue by subsampling %d rows, or a positive integer to specify the number of rows to be subsampled."
                        % (num_rows, max_rows, max_rows)
                    )
                if num_columns * num_rows > max_cells:
                    message = (
                        "The btable you are uploading has %d cells, but BayesDB is currently designed to support only %d cells. If you proceed, performance may suffer unless you enable subsampling. Enter 'y' to continue without subsampling, 'n' to abort, 's' to continue by subsampling %d rows, or a positive integer to specify the number of rows to be subsampled."
                        % (num_rows * num_columns, max_cells, max_rows)
                    )
                if message is not None:
                    print(message)
                    answer = raw_input().strip()
                    if answer == "y":
                        pass
                    elif answer == "s":
                        args_dict["subsample"] = min(max_rows, num_rows)
                    elif answer != "n" and utils.is_int(answer):
                        args_dict["subsample"] = int(answer)
                    else:
                        # 'n' and any unrecognised answer both abort.
                        return dict(message="Operation canceled by user.")
        elif method_name in ["label_columns", "update_metadata"]:
            if client_dict["source"] == "file":
                _, rows = data_utils.read_csv(client_dict["csv_path"])
                args_dict["mappings"] = {key: value for key, value in rows}

        ## Call engine.
        result = self.call_bayesdb_engine(method_name, args_dict, debug)

        ## If error occurred, exit now.
        if "error" in result and result["error"]:
            if pretty:
                print(result["message"])
                return result["message"]
            else:
                return result

        ## Do stuff now that engine has given you output, but before printing the result.
        result = self.callback(method_name, args_dict, client_dict, result)

        assert type(result) != int

        if timing:
            end_time = time.time()
            print("Elapsed time: %.2f seconds." % (end_time - start_time))

        if plots is None:
            plots = "DISPLAY" in os.environ

        # client_dict may be empty (see above), so a missing SAVE TO filename must not raise.
        filename = client_dict.get("filename")

        if "matrix" in result and (plots or filename):
            # Plot matrices
            plotting_utils.plot_matrix(
                result["matrix"], result["column_names"], result["title"], filename
            )
            if pretty:
                if "column_lists" in result:
                    print(self.pretty_print(dict(column_lists=result["column_lists"])))
                return self.pretty_print(result)
            else:
                return result
        if "plot" in client_dict and client_dict["plot"]:
            if plots or filename:
                # Plot generalized histograms or scatterplots
                plot_remove_key = method_name in ["select", "infer"]
                plotting_utils.plot_general_histogram(
                    result["columns"],
                    result["data"],
                    result["M_c"],
                    filename,
                    client_dict["scatter"],
                    remove_key=plot_remove_key,
                )
                # NOTE(review): this returns the pretty-printed form even when pretty=False.
                return self.pretty_print(result)
            else:
                if "message" not in result:
                    result["message"] = ""
                result["message"] = (
                    "Your query indicates that you would like to make a plot, but in order to do so, you must either enable plotting in a window or specify a filename to save to by appending 'SAVE TO <filename>' to this command.\n"
                    + result["message"]
                )

        if pretty:
            pp = self.pretty_print(result)
            print(pp)

        if pandas_output and "data" in result and "columns" in result:
            return data_utils.construct_pandas_df(result)
        else:
            return result
Exemple #2
0
    def execute_statement(self, bql_statement_ast, pretty=True, timing=False, plots=None, yes=False,
                          debug=False, pandas_df=None, pandas_output=True, key_column=None,
                          return_raw_result=False):
        """
        Accepts a SINGLE BQL STATEMENT as input, parses it, and executes it if it was parsed
        successfully.

        Parameters:
            bql_statement_ast: the statement, handed to self.parser.parse_single_statement.
            pretty: if True, the command output is pretty-printed as a string; if False, it is
                returned as a python object.
            timing: if True, prints how long the command took to execute.
            plots: if True, commands with visual results are displayed as graphics rather than
                only pretty-printed as text. If None, plotting is enabled when the DISPLAY
                environment variable is set. (Graphics are also saved if the user added
                SAVE TO <filename> to the BQL.)
            yes: if True, skips every interactive confirmation prompt.
            debug: if True, parser exceptions propagate unchanged instead of being wrapped in
                utils.BayesDBParseError; the flag is also forwarded to the engine call.
            pandas_df: for create_btable, a data frame to read instead of the csv path.
            pandas_output: if True and the result has both 'data' and 'column_labels', the
                result is converted with data_utils.construct_pandas_df before being returned.
            key_column: forwarded to the engine as args_dict['key_column'] for create_btable.
            return_raw_result: if True, returns a dict holding the post-callback result, the
                method name and the client dict, skipping all printing and plotting.

        Returns:
            None if the statement could not be parsed; a dict with a 'message' key when the
            statement is an execute_file request or the user cancels at a prompt; the engine's
            error message (pretty) or error dict (not pretty) on engine error; otherwise the
            raw-result dict, the pretty-printed string, the pandas data frame, or the result
            dict.

        Raises:
            utils.BayesDBParseError: if parsing raises and debug is False.
            utils.BayesDBError: if a '.pkl.gz' models file cannot be read.
        """
        if timing:
            start_time = time.time()

        def load_pickle(path, opener):
            # Resolve the path, unpickle, and always close the handle.
            # SECURITY NOTE(review): pickle.load executes arbitrary code embedded in the file.
            # Only load model files that come from a trusted source.
            with opener(self.parser.get_absolute_path(path), 'rb') as handle:
                return pickle.load(handle)

        def read_codebook(path):
            # Build a dict indexed by column name from a codebook csv.
            # TODO: require specific codebook_header values? Or don't require a header,
            # and if the first value in the header is actually a data column name, assume
            # the first row is codebook data, not a header.
            _, codebook_rows = data_utils.read_csv(path, has_header=True)
            codebook_fields = ['short_name', 'description', 'value_map']
            return dict((codebook_row[0], dict(zip(codebook_fields, codebook_row[1:])))
                        for codebook_row in codebook_rows)

        # TODO move pyparsing objects out of client into parser
        if debug:
            parser_out = self.parser.parse_single_statement(bql_statement_ast)
        else:
            try:
                parser_out = self.parser.parse_single_statement(bql_statement_ast)
            except Exception as e:
                raise utils.BayesDBParseError(str(e))
        if parser_out is None:
            print("Could not parse command. Try typing 'help' for a list of all commands.")
            return
        elif not parser_out:
            return

        method_name, args_dict, client_dict = parser_out
        if client_dict is None:
            client_dict = {}

        # Do stuff now that you know the user's command, but before passing it to engine.
        if method_name == 'execute_file':
            # Close the script file deterministically instead of leaking the handle.
            with open(args_dict['filename'], 'r') as bql_file:
                bql_string = bql_file.read()
            return dict(message='execute_file', bql_string=bql_string)
        elif method_name == 'update_codebook':
            args_dict['codebook'] = read_codebook(client_dict['codebook_path'])
        elif (method_name == 'drop_btable') and (not yes):
            # If dropping something, ask for confirmation.
            print("Are you sure you want to permanently delete this btable, and all associated "
                  "models, without any way to get them back? Enter 'y' if yes.")
            user_confirmation = raw_input()
            if 'y' != user_confirmation.strip():
                return dict(message="Operation canceled by user.")
        elif (method_name == 'drop_models') and (not yes):
            # If dropping something, ask for confirmation.
            print("Are you sure you want to permanently delete model(s), without any way to get "
                  "them back? Enter 'y' if yes.")
            user_confirmation = raw_input()
            if 'y' != user_confirmation.strip():
                return dict(message="Operation canceled by user.")
        elif method_name == 'load_models':
            pklpath = client_dict['pkl_path']
            try:
                model_data = load_pickle(pklpath, gzip.open)
            except IOError:
                if pklpath.endswith('.pkl.gz'):
                    raise utils.BayesDBError('Models file %s could not be found.' % pklpath)
                if pklpath.endswith('.pkl'):
                    # Not gzipped: fall back to reading a plain pickle.
                    model_data = load_pickle(pklpath, open)
                else:
                    # No recognised extension: retry with the default one appended.
                    pklpath = pklpath + ".pkl.gz"
                    model_data = load_pickle(pklpath, gzip.open)
            # This is the more recent version, where schema is stored with models.
            if 'schema' in model_data:
                args_dict['models'] = model_data['models']
                args_dict['model_schema'] = model_data['schema']
            # This supports older saved models, where only the model info was stored.
            else:
                args_dict['models'] = model_data
                args_dict['model_schema'] = None
        elif method_name == 'create_btable':
            if pandas_df is None:
                header, rows = data_utils.read_csv(client_dict['csv_path'])
            else:
                header, rows = data_utils.read_pandas_df(pandas_df)
            args_dict['header'] = header
            args_dict['raw_T_full'] = rows
            args_dict['key_column'] = key_column
            args_dict['subsample'] = False

            if 'codebook_path' in client_dict:
                args_dict['codebook'] = read_codebook(client_dict['codebook_path'])
            else:
                warning = dedent("""
                WARNING!

                You are creating a btable without a codebook, which will make interpretation
                of results more difficult. Codebooks should be in CSV format with each row
                corresponding to one column of the original data. The codebook should have four
                columns:

                1. actual column name
                2. short column description
                3. long column description
                4. value map (optional, only used for categorical columns - should be in JSON
                   format)
                """)
                print(warning)

            # Display warning messages and get confirmation if btable is too large.
            # Ask user if they want to turn on subsampling.
            max_columns = 200
            max_rows = 1000
            max_cells = 100000
            if not yes:
                num_rows = len(rows)
                # An empty table has no first row to measure; treat it as zero columns.
                num_columns = len(rows[0]) if num_rows else 0
                # The checks run in increasing priority: a later match replaces the message.
                message = None
                if num_columns > max_columns:
                    message = "The btable you are uploading has %d columns, but BayesDB is " \
                              "currently designed to support only %d columns. If you proceed, " \
                              "performance may suffer unless you set many columns' datatypes to " \
                              "'ignore'. Would you like to continue? Enter 'y' if yes." \
                              % (num_columns, max_columns)
                if num_rows > max_rows:
                    message = "The btable you are uploading has %d rows, but BayesDB is currently "\
                              "designed to support only %d rows. If you proceed, performance may "\
                              "suffer. Would you like to continue? Enter 'y' to continue without "\
                              "subsampling, 'n' to abort, 's' to continue by subsampling %d rows, "\
                              "or a positive integer to specify the number of rows to be "\
                              "subsampled." % (num_rows, max_rows, max_rows)
                if num_columns * num_rows > max_cells:
                    message = "The btable you are uploading has %d cells, but BayesDB is currently"\
                              " designed to support only %d cells. If you proceed, performance may"\
                              " suffer unless you enable subsampling. Enter 'y' to continue "\
                              " without subsampling, 'n' to abort, 's' to continue by subsampling "\
                              "%d rows, or a positive integer to specify the number of rows to be "\
                              "subsampled." % (num_rows * num_columns, max_cells, max_rows)
                if message is not None:
                    print(message)
                    answer = raw_input().strip()
                    if answer == 'y':
                        pass
                    elif answer == 's':
                        args_dict['subsample'] = min(max_rows, num_rows)
                    elif answer != 'n' and utils.is_int(answer):
                        args_dict['subsample'] = int(answer)
                    else:
                        # 'n' and any unrecognised answer both abort.
                        return dict(message="Operation canceled by user.")
        elif method_name in ['label_columns', 'update_metadata']:
            if client_dict['source'] == 'file':
                _, rows = data_utils.read_csv(client_dict['csv_path'])
                args_dict['mappings'] = {key: value for key, value in rows}

        # Call engine.
        result = self.call_bayesdb_engine(method_name, args_dict, debug)

        # If error occurred, exit now.
        if 'error' in result and result['error']:
            if pretty:
                print(result['message'])
                return result['message']
            else:
                return result

        # Do stuff now that engine has given you output, but before printing the result.
        result = self.callback(method_name, args_dict, client_dict, result)

        if return_raw_result:
            raw_result = {
                'result': result,
                'method_name': method_name,
                'client_dict': client_dict}
            print("returning raw result for %s" % (method_name))
            return raw_result

        assert type(result) != int

        if timing:
            end_time = time.time()
            print('Elapsed time: %.2f seconds.' % (end_time - start_time))

        if plots is None:
            plots = 'DISPLAY' in os.environ

        # client_dict may be empty (see above), so a missing SAVE TO filename must not raise.
        filename = client_dict.get('filename')

        if 'matrix' in result and (plots or filename):
            # Plot matrices
            plotting_utils.plot_matrix(result['matrix'], result['column_names'], result['title'],
                                       filename)
            if pretty:
                if 'column_lists' in result:
                    print(self.pretty_print(dict(column_lists=result['column_lists'])))
                return self.pretty_print(result)
            else:
                return result
        if 'plot' in client_dict and client_dict['plot']:
            if plots or filename:
                # Plot generalized histograms or scatterplots

                # Prefer the full metadata when the result carries it.
                try:
                    plotting_M_c = result['metadata_full']['M_c_full']
                except KeyError:
                    plotting_M_c = result['M_c']

                plot_remove_key = method_name in ['select', 'infer']
                plotting_utils.plot_general_histogram(result['column_names'], result['data'],
                                                      plotting_M_c, result['schema_full'],
                                                      filename,
                                                      client_dict['scatter'],
                                                      remove_key=plot_remove_key)
                # NOTE(review): this returns the pretty-printed form even when pretty=False.
                return self.pretty_print(result)
            else:
                if 'message' not in result:
                    result['message'] = ""
                result['message'] = "Your query indicates that you would like to make a plot, but "\
                                    "in order to do so, you must either enable plotting in a "\
                                    "window or specify a filename to save to by appending 'SAVE "\
                                    "TO <filename>' to this command.\n" + result['message']

        if pretty:
            pp = self.pretty_print(result)
            print(pp)

        # Print warnings last so they're readable without scrolling backwards.
        if 'warnings' in result:
            for warning in result['warnings']:
                print('WARNING: %s' % warning)

        if pandas_output and 'data' in result and 'column_labels' in result:
            return data_utils.construct_pandas_df(result)
        else:
            return result
Exemple #3
0
    def execute_statement(self,
                          bql_statement_ast,
                          pretty=True,
                          timing=False,
                          plots=None,
                          yes=False,
                          debug=False,
                          pandas_df=None,
                          pandas_output=True,
                          key_column=None,
                          return_raw_result=False,
                          force_output=False):
        """
        Accepts a SINGLE BQL STATEMENT as input, parses it, and executes it if it was parsed
        successfully.

        If pretty=True, then the command output will be pretty-printed as a string.
        If pretty=False, then the command output will be returned as a python object.
        If force_output=True, then results will be returned regardless of pretty

        timing=True prints out how long the command took to execute.

        For commands that have visual results, plots=True will cause those to be displayed
        by matplotlib as graphics rather than being pretty-printed as text.
        (Note that the graphics will also be saved if the user added SAVE TO <filename> to the BQL.)
        """
        if timing:
            start_time = time.time()

        parser_out = None
        # TODO move pyparsing objects out of client into parser
        if debug:
            parser_out = self.parser.parse_single_statement(bql_statement_ast)
        else:
            try:
                parser_out = self.parser.parse_single_statement(
                    bql_statement_ast)
            except Exception as e:
                raise utils.BayesDBParseError(str(e))
        if parser_out is None:
            print(
                "Could not parse command. Try typing 'help' for a list of all commands."
            )
            return
        elif not parser_out:
            return

        method_name, args_dict, client_dict = parser_out
        if client_dict is None:
            client_dict = {}

        # Do stuff now that you know the user's command, but before passing it to engine.
        if method_name == 'execute_file':
            return dict(message='execute_file',
                        bql_string=open(args_dict['filename'], 'r').read())
        elif method_name == 'update_codebook':
            _, codebook_rows = data_utils.read_csv(
                client_dict['codebook_path'], has_header=True)
            # TODO: require specific codebook_header values? Or don't require a header,
            # and if the first value in the header is actually a data column name, assume
            # the first row is codebook data, not a header.

            # Create a dict indexed by column name
            codebook = dict()
            for codebook_row in codebook_rows:
                codebook[codebook_row[0]] = dict(
                    zip(['short_name', 'description', 'value_map'],
                        codebook_row[1:]))

            args_dict['codebook'] = codebook
        elif (method_name == 'drop_btable') and (not yes):
            # If dropping something, ask for confirmation.
            print(
                "Are you sure you want to permanently delete this btable, and all associated "
                "models, without any way to get them back? Enter 'y' if yes.")
            user_confirmation = raw_input()
            if 'y' != user_confirmation.strip():
                return dict(message="Operation canceled by user.")
        elif (method_name == 'drop_models') and (not yes):
            # If dropping something, ask for confirmation.
            print(
                "Are you sure you want to permanently delete model(s), without any way to get "
                "them back? Enter 'y' if yes.")
            user_confirmation = raw_input()
            if 'y' != user_confirmation.strip():
                return dict(message="Operation canceled by user.")
        elif method_name == 'load_models':
            pklpath = client_dict['pkl_path']
            try:
                model_data = pickle.load(
                    gzip.open(self.parser.get_absolute_path(pklpath), 'rb'))
            except IOError as e:
                if pklpath[-7:] != '.pkl.gz':
                    if pklpath[-4:] == '.pkl':
                        model_data = pickle.load(
                            open(self.parser.get_absolute_path(pklpath), 'rb'))
                    else:
                        pklpath = pklpath + ".pkl.gz"
                        model_data = pickle.load(
                            gzip.open(self.parser.get_absolute_path(pklpath),
                                      'rb'))
                else:
                    raise utils.BayesDBError(
                        'Models file %s could not be found.' % pklpath)
            # This is the more recent version, where schema is stored with models.
            if 'schema' in model_data.keys():
                args_dict['models'] = model_data['models']
                args_dict['model_schema'] = model_data['schema']
            # This support older saved models, where only the model info was stored.
            else:
                args_dict['models'] = model_data
                args_dict['model_schema'] = None

            # Older versions of model_schema just had a str cctype as the dict items.
            # Newest version has a dict of cctype and parameters. Use this values to
            # test the recency of the models.
            model_schema = args_dict['model_schema']
            if model_schema:
                model_schema_itemtype = type(
                    model_schema[model_schema.keys()[0]])
            else:
                model_schema_itemtype = None

            if model_schema is None or model_schema_itemtype != dict:
                args_dict['model_schema'] = None
                if not yes:
                    print """WARNING! The models you are currently importing were saved without a schema
                        or without detailed column parameters (probably from a previous version).

                        If you are loading models into the same table from which you created them, problems
                        are unlikely, unless you have dropped models and then updated the schema.

                        If you are loading models into a different table from which you created them, you
                        should verify that the table schemas are the same.

                        Please use "SAVE MODELS FROM <btable> TO <filename.pkl.gz>" to create an updated copy of your models.

                        Are you sure you want to load these model(s)?
                        """
                    user_confirmation = raw_input()
                    if 'y' != user_confirmation.strip():
                        return dict(message="Operation canceled by user.")
        elif method_name == 'create_btable':
            if pandas_df is None:
                header, rows = data_utils.read_csv(client_dict['csv_path'])
            else:
                header, rows = data_utils.read_pandas_df(pandas_df)
            args_dict['header'] = header
            args_dict['raw_T_full'] = rows
            args_dict['key_column'] = key_column
            args_dict['subsample'] = False

            if 'codebook_path' in client_dict:
                _, codebook_rows = data_utils.read_csv(
                    client_dict['codebook_path'], has_header=True)
                # TODO: require specific codebook_header values? Or don't require a header,
                # and if the first value in the header is actually a data column name, assume
                # the first row is codebook data, not a header.

                # Create a dict indexed by column name
                codebook = dict()
                for codebook_row in codebook_rows:
                    codebook[codebook_row[0]] = dict(
                        zip(['short_name', 'description', 'value_map'],
                            codebook_row[1:]))
                args_dict['codebook'] = codebook
            else:
                warning = dedent("""
                WARNING!

                You are creating a btable without a codebook, which will make interpretation
                of results more difficult. Codebooks should be in CSV format with each row
                corresponding to one column of the original data. The codebook should have four
                columns:

                1. actual column name
                2. short column description
                3. long column description
                4. value map (optional, only used for categorical columns - should be in JSON
                   format)
                """)
                print(warning)

            # Display warning messages and get confirmation if btable is too large.
            # Ask user if they want to turn on subsampling.
            max_columns = 200
            max_rows = 1000
            max_cells = 100000
            message = None
            if not yes:
                if len(rows[0]) > max_columns:
                    message = "The btable you are uploading has %d columns, but BayesDB is " \
                              "currently designed to support only %d columns. If you proceed, " \
                              "performance may suffer unless you set many columns' datatypes to " \
                              "'ignore'. Would you like to continue? Enter 'y' if yes." \
                              % (len(rows[0]), max_columns)
                if len(rows) > max_rows:
                    message = "The btable you are uploading has %d rows, but BayesDB is currently "\
                              "designed to support only %d rows. If you proceed, performance may "\
                              "suffer. Would you like to continue? Enter 'y' to continue without "\
                              "subsampling, 'n' to abort, 's' to continue by subsampling %d rows, "\
                              "or a positive integer to specify the number of rows to be "\
                              "subsampled." % (len(rows), max_rows, max_rows)
                if len(rows[0]) * len(rows) > max_cells:
                    message = "The btable you are uploading has %d cells, but BayesDB is currently"\
                              " designed to support only %d cells. If you proceed, performance may"\
                              " suffer unless you enable subsampling. Enter 'y' to continue "\
                              " without subsampling, 'n' to abort, 's' to continue by subsampling "\
                              "%d rows, or a positive integer to specify the number of rows to be "\
                              "subsampled." % (len(rows)*len(rows[0]), max_cells, max_rows)
                if message is not None:
                    print(message)
                    user_confirmation = raw_input()
                    if 'y' == user_confirmation.strip():
                        pass
                    elif 'n' == user_confirmation.strip():
                        return dict(message="Operation canceled by user.")
                    elif 's' == user_confirmation.strip():
                        args_dict['subsample'] = min(max_rows, len(rows))
                    elif utils.is_int(user_confirmation.strip()):
                        args_dict['subsample'] = int(user_confirmation.strip())
                    else:
                        return dict(message="Operation canceled by user.")
        elif method_name in ['label_columns', 'update_metadata']:
            if client_dict['source'] == 'file':
                header, rows = data_utils.read_csv(client_dict['csv_path'])
                args_dict['mappings'] = {key: value for key, value in rows}

        # Call engine.
        result = self.call_bayesdb_engine(method_name, args_dict, debug)

        # If error occurred, exit now.
        if 'error' in result and result['error']:
            if pretty:
                print(result['message'])
                if force_output:
                    return result
                else:
                    return result['message']
            else:
                return result

        # Do stuff now that engine has given you output, but before printing the result.
        result = self.callback(method_name, args_dict, client_dict, result)

        if return_raw_result:
            raw_result = {
                'result': result,
                'method_name': method_name,
                'client_dict': client_dict
            }
            print("returning raw result for %s" % (method_name))
            return raw_result

        assert type(result) != int

        if timing:
            end_time = time.time()
            print('Elapsed time: %.2f seconds.' % (end_time - start_time))

        if plots is None:
            plots = 'DISPLAY' in os.environ.keys()

        if 'matrix' in result and (plots or client_dict['filename']):
            # Plot matrices
            plotting_utils.plot_matrix(result['matrix'],
                                       result['column_names'], result['title'],
                                       client_dict['filename'])
            if pretty:
                if 'column_lists' in result:
                    print(
                        self.pretty_print(
                            dict(column_lists=result['column_lists'])))

                if force_output:
                    return result
                else:
                    return self.pretty_print(result)
            else:
                return result
        if ('plot' in client_dict and client_dict['plot']):
            if (plots or client_dict['filename']):
                # Plot generalized histograms or scatterplots

                try:
                    plotting_M_c = result['metadata_full']['M_c_full']
                except KeyError:
                    plotting_M_c = result['M_c']

                plot_remove_key = method_name in ['select', 'infer']
                plotting_utils.plot_general_histogram(
                    result['column_names'],
                    result['data'],
                    plotting_M_c,
                    result['schema_full'],
                    client_dict['filename'],
                    client_dict['scatter'],
                    remove_key=plot_remove_key)
                return self.pretty_print(result)
            else:
                if 'message' not in result:
                    result['message'] = ""
                result['message'] = "Your query indicates that you would like to make a plot, but "\
                                    "in order to do so, you must either enable plotting in a "\
                                    "window or specify a filename to save to by appending 'SAVE "\
                                    "TO <filename>' to this command.\n" + result['message']

        if pretty:
            pp = self.pretty_print(result)
            print(pp)

        # Print warnings last so they're readable without scrolling backwards.
        if 'warnings' in result:
            """ Pretty-print warnings. """
            for warning in result['warnings']:
                print('WARNING: %s' % warning)

        if pandas_output and 'data' in result and 'column_labels' in result:
            result_pandas_df = data_utils.construct_pandas_df(result)
            return result_pandas_df
        else:
            return result
Exemple #4
0
    def execute_statement(
        self,
        bql_statement_ast,
        pretty=True,
        timing=False,
        plots=None,
        yes=False,
        debug=False,
        pandas_df=None,
        pandas_output=True,
        key_column=None,
        return_raw_result=False,
        force_output=False,
    ):
        """
        Parse a SINGLE BQL statement, run it through the engine, and present the result.

        The statement is handed to ``self.parser.parse_single_statement``. Depending on the
        parsed method name, some client-side preparation happens first (reading CSV, codebook
        or pickled model files, asking the user for confirmation), then the call is forwarded
        to ``self.call_bayesdb_engine`` and post-processed by ``self.callback``.

        Parameters:
            bql_statement_ast: the statement object passed to the parser.
            pretty: if True, the pretty-printed result is printed. If False, nothing is
                pretty-printed and the raw result object is returned.
            timing: if True, prints how long the command took to execute.
            plots: whether graphical results are displayed by the plotting utilities. If None,
                plotting is enabled exactly when the DISPLAY environment variable is set.
                (Graphics are also produced when the parsed statement carries a filename,
                e.g. via SAVE TO <filename>.)
            yes: if True, skips every interactive confirmation prompt.
            debug: if True, parser exceptions propagate unchanged instead of being wrapped in
                ``utils.BayesDBParseError``; the flag is also forwarded to the engine call.
            pandas_df: for ``create_btable``, data source used instead of the CSV path.
            pandas_output: if True and the result has both 'data' and 'column_labels', the
                return value is built by ``data_utils.construct_pandas_df``.
            key_column: forwarded as ``args_dict["key_column"]`` for ``create_btable``.
            return_raw_result: if True, returns a dict with 'result', 'method_name' and
                'client_dict' right after the callback, skipping plotting and printing.
            force_output: if True, the result dict is returned even when pretty=True in the
                engine-error and matrix-plot cases (which otherwise return a string).

        Returns:
            None if the statement could not be parsed or parsed to something empty; a dict with
            a 'message' key for ``execute_file`` and for user-canceled operations; otherwise
            the engine result in one of the forms described above.

        Raises:
            utils.BayesDBParseError: if parsing raises and debug is False.
            utils.BayesDBError: if a ``.pkl.gz`` models file cannot be read.
        """
        if timing:
            start_time = time.time()

        # TODO move pyparsing objects out of client into parser
        if debug:
            # Let parser exceptions surface untouched so the original traceback is visible.
            parser_out = self.parser.parse_single_statement(bql_statement_ast)
        else:
            try:
                parser_out = self.parser.parse_single_statement(bql_statement_ast)
            except Exception as e:
                raise utils.BayesDBParseError(str(e))
        if parser_out is None:
            print("Could not parse command. Try typing 'help' for a list of all commands.")
            return
        elif not parser_out:
            return

        method_name, args_dict, client_dict = parser_out
        if client_dict is None:
            client_dict = {}

        # Do stuff now that you know the user's command, but before passing it to engine.
        if method_name == "execute_file":
            # Close the script file deterministically instead of leaking the handle.
            with open(args_dict["filename"], "r") as bql_file:
                bql_string = bql_file.read()
            return dict(message="execute_file", bql_string=bql_string)
        elif method_name == "update_codebook":
            args_dict["codebook"] = self._read_codebook(client_dict["codebook_path"])
        elif (method_name == "drop_btable") and (not yes):
            # If dropping something, ask for confirmation.
            print(
                "Are you sure you want to permanently delete this btable, and all associated "
                "models, without any way to get them back? Enter 'y' if yes."
            )
            if raw_input().strip() != "y":
                return dict(message="Operation canceled by user.")
        elif (method_name == "drop_models") and (not yes):
            # If dropping something, ask for confirmation.
            print(
                "Are you sure you want to permanently delete model(s), without any way to get "
                "them back? Enter 'y' if yes."
            )
            if raw_input().strip() != "y":
                return dict(message="Operation canceled by user.")
        elif method_name == "load_models":
            model_data = self._load_model_data(client_dict["pkl_path"])
            if "schema" in model_data:
                # This is the more recent version, where schema is stored with models.
                args_dict["models"] = model_data["models"]
                args_dict["model_schema"] = model_data["schema"]
            else:
                # This supports older saved models, where only the model info was stored.
                args_dict["models"] = model_data
                args_dict["model_schema"] = None

            # Older versions of model_schema just had a str cctype as the dict items.
            # Newest version has a dict of cctype and parameters. Inspect one item to
            # test the recency of the models.
            model_schema = args_dict["model_schema"]
            has_detailed_schema = (
                bool(model_schema) and type(next(iter(model_schema.values()))) is dict
            )

            if not has_detailed_schema:
                args_dict["model_schema"] = None
                if not yes:
                    print(
                        """WARNING! The models you are currently importing were saved without a schema
                        or without detailed column parameters (probably from a previous version).

                        If you are loading models into the same table from which you created them, problems
                        are unlikely, unless you have dropped models and then updated the schema.

                        If you are loading models into a different table from which you created them, you
                        should verify that the table schemas are the same.

                        Please use "SAVE MODELS FROM <btable> TO <filename.pkl.gz>" to create an updated copy of your models.

                        Are you sure you want to load these model(s)?
                        """
                    )
                    if raw_input().strip() != "y":
                        return dict(message="Operation canceled by user.")
        elif method_name == "create_btable":
            if pandas_df is None:
                header, rows = data_utils.read_csv(client_dict["csv_path"])
            else:
                header, rows = data_utils.read_pandas_df(pandas_df)
            args_dict["header"] = header
            args_dict["raw_T_full"] = rows
            args_dict["key_column"] = key_column
            args_dict["subsample"] = False

            if "codebook_path" in client_dict:
                args_dict["codebook"] = self._read_codebook(client_dict["codebook_path"])
            else:
                warning = dedent(
                    """
                WARNING!

                You are creating a btable without a codebook, which will make interpretation
                of results more difficult. Codebooks should be in CSV format with each row
                corresponding to one column of the original data. The codebook should have four
                columns:

                1. actual column name
                2. short column description
                3. long column description
                4. value map (optional, only used for categorical columns - should be in JSON
                   format)
                """
                )
                print(warning)

            # Display warning messages and get confirmation if btable is too large.
            # Ask user if they want to turn on subsampling.
            max_columns = 200
            max_rows = 1000
            max_cells = 100000
            message = None
            if not yes:
                num_rows = len(rows)
                # An empty table has no first row to measure; treat it as zero columns
                # rather than failing with an IndexError.
                num_columns = len(rows[0]) if num_rows else 0

                # The checks cascade: when several limits are exceeded, the last matching
                # message (the one offering the most options) is the one shown.
                if num_columns > max_columns:
                    message = (
                        "The btable you are uploading has %d columns, but BayesDB is "
                        "currently designed to support only %d columns. If you proceed, "
                        "performance may suffer unless you set many columns' datatypes to "
                        "'ignore'. Would you like to continue? Enter 'y' if yes." % (num_columns, max_columns)
                    )
                if num_rows > max_rows:
                    message = (
                        "The btable you are uploading has %d rows, but BayesDB is currently "
                        "designed to support only %d rows. If you proceed, performance may "
                        "suffer. Would you like to continue? Enter 'y' to continue without "
                        "subsampling, 'n' to abort, 's' to continue by subsampling %d rows, "
                        "or a positive integer to specify the number of rows to be "
                        "subsampled." % (num_rows, max_rows, max_rows)
                    )
                if num_columns * num_rows > max_cells:
                    message = (
                        "The btable you are uploading has %d cells, but BayesDB is currently"
                        " designed to support only %d cells. If you proceed, performance may"
                        " suffer unless you enable subsampling. Enter 'y' to continue "
                        " without subsampling, 'n' to abort, 's' to continue by subsampling "
                        "%d rows, or a positive integer to specify the number of rows to be "
                        "subsampled." % (num_rows * num_columns, max_cells, max_rows)
                    )
                if message is not None:
                    print(message)
                    answer = raw_input().strip()
                    if answer == "y":
                        pass
                    elif answer == "s":
                        args_dict["subsample"] = min(max_rows, num_rows)
                    elif answer != "n" and utils.is_int(answer):
                        args_dict["subsample"] = int(answer)
                    else:
                        # Explicit 'n' and any unrecognized answer both cancel.
                        return dict(message="Operation canceled by user.")
        elif method_name in ["label_columns", "update_metadata"]:
            if client_dict["source"] == "file":
                header, rows = data_utils.read_csv(client_dict["csv_path"])
                args_dict["mappings"] = {key: value for key, value in rows}

        # Call engine.
        result = self.call_bayesdb_engine(method_name, args_dict, debug)

        # If error occurred, exit now.
        if "error" in result and result["error"]:
            if not pretty:
                return result
            print(result["message"])
            if force_output:
                return result
            return result["message"]

        # Do stuff now that engine has given you output, but before printing the result.
        result = self.callback(method_name, args_dict, client_dict, result)

        if return_raw_result:
            raw_result = {"result": result, "method_name": method_name, "client_dict": client_dict}
            print("returning raw result for %s" % (method_name))
            return raw_result

        assert type(result) != int

        if timing:
            end_time = time.time()
            print("Elapsed time: %.2f seconds." % (end_time - start_time))

        if plots is None:
            plots = "DISPLAY" in os.environ

        # client_dict is empty when the parser supplied none, so look the filename up
        # defensively instead of raising KeyError on statements without SAVE TO.
        save_filename = client_dict.get("filename")

        if "matrix" in result and (plots or save_filename):
            # Plot matrices
            plotting_utils.plot_matrix(
                result["matrix"], result["column_names"], result["title"], save_filename
            )
            if not pretty:
                return result
            if "column_lists" in result:
                print(self.pretty_print(dict(column_lists=result["column_lists"])))
            if force_output:
                return result
            return self.pretty_print(result)

        if client_dict.get("plot"):
            if plots or save_filename:
                # Plot generalized histograms or scatterplots

                try:
                    plotting_M_c = result["metadata_full"]["M_c_full"]
                except KeyError:
                    plotting_M_c = result["M_c"]

                plot_remove_key = method_name in ["select", "infer"]
                plotting_utils.plot_general_histogram(
                    result["column_names"],
                    result["data"],
                    plotting_M_c,
                    result["schema_full"],
                    save_filename,
                    client_dict["scatter"],
                    remove_key=plot_remove_key,
                )
                # NOTE(review): unlike the matrix branch, this returns the pretty-printed
                # string regardless of `pretty` / `force_output` - confirm that is intended.
                return self.pretty_print(result)
            else:
                # Plot requested but impossible: explain why ahead of any engine message.
                result["message"] = (
                    "Your query indicates that you would like to make a plot, but "
                    "in order to do so, you must either enable plotting in a "
                    "window or specify a filename to save to by appending 'SAVE "
                    "TO <filename>' to this command.\n" + result.get("message", "")
                )

        if pretty:
            pp = self.pretty_print(result)
            print(pp)

        # Print warnings last so they're readable without scrolling backwards.
        if "warnings" in result:
            for warning in result["warnings"]:
                print("WARNING: %s" % warning)

        if pandas_output and "data" in result and "column_labels" in result:
            return data_utils.construct_pandas_df(result)
        return result

    @staticmethod
    def _read_codebook(codebook_path):
        """
        Read a codebook CSV (with a header row) into a dict indexed by column name.

        The first field of each row is the key; the remaining fields are paired, in order,
        with 'short_name', 'description' and 'value_map'.
        """
        # TODO: require specific codebook_header values? Or don't require a header,
        # and if the first value in the header is actually a data column name, assume
        # the first row is codebook data, not a header.
        _, codebook_rows = data_utils.read_csv(codebook_path, has_header=True)
        field_names = ["short_name", "description", "value_map"]
        return {row[0]: dict(zip(field_names, row[1:])) for row in codebook_rows}

    def _load_model_data(self, pklpath):
        """
        Unpickle saved models from pklpath, trying gzip first and then fallbacks.

        If the gzip read fails with IOError: a path ending in '.pkl' is re-read as a plain
        pickle, any other path not ending in '.pkl.gz' is retried with '.pkl.gz' appended,
        and a path already ending in '.pkl.gz' raises utils.BayesDBError.
        """
        # SECURITY: unpickling executes arbitrary code from the file; only load model files
        # that come from a trusted source.
        try:
            with gzip.open(self.parser.get_absolute_path(pklpath), "rb") as pkl_file:
                return pickle.load(pkl_file)
        except IOError:
            if pklpath.endswith(".pkl.gz"):
                raise utils.BayesDBError("Models file %s could not be found." % pklpath)

        if pklpath.endswith(".pkl"):
            with open(self.parser.get_absolute_path(pklpath), "rb") as pkl_file:
                return pickle.load(pkl_file)

        with gzip.open(self.parser.get_absolute_path(pklpath + ".pkl.gz"), "rb") as pkl_file:
            return pickle.load(pkl_file)